mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-06 11:00:46 +00:00
Allow updating pre-crawler settings from add-ons (#6568)
This commit is contained in:
parent
e0c828b7f6
commit
9bc0029d27
@ -32,7 +32,7 @@ This is an example where two add-ons are enabled in a project's
|
||||
Writing your own add-ons
|
||||
========================
|
||||
|
||||
Add-ons are Python classes that include the following method:
|
||||
Add-ons are Python classes that include one or both of the following methods:
|
||||
|
||||
.. method:: update_settings(settings)
|
||||
|
||||
@ -45,6 +45,15 @@ Add-ons are Python classes that include the following method:
|
||||
:param settings: The settings object storing Scrapy/component configuration
|
||||
:type settings: :class:`~scrapy.settings.Settings`
|
||||
|
||||
.. classmethod:: update_pre_crawler_settings(cls, settings)
|
||||
|
||||
Use this class method instead of the :meth:`update_settings` method to
|
||||
update :ref:`pre-crawler settings <pre-crawler-settings>` whose value is
|
||||
used before the :class:`~scrapy.crawler.Crawler` object is created.
|
||||
|
||||
:param settings: The settings object storing Scrapy/component configuration
|
||||
:type settings: :class:`~scrapy.settings.BaseSettings`
|
||||
|
||||
They can also have the following method:
|
||||
|
||||
.. classmethod:: from_crawler(cls, crawler)
|
||||
|
@ -246,24 +246,10 @@ Same example but running the spiders sequentially by chaining the deferreds:
|
||||
crawl()
|
||||
reactor.run() # the script will block here until the last crawl call is finished
|
||||
|
||||
Different spiders can set different values for the same setting, but when they
|
||||
run in the same process it may be impossible, by design or because of some
|
||||
limitations, to use these different values. What happens in practice is
|
||||
different for different settings:
|
||||
|
||||
* :setting:`SPIDER_LOADER_CLASS` and the ones used by its value
|
||||
(:setting:`SPIDER_MODULES`, :setting:`SPIDER_LOADER_WARN_ONLY` for the
|
||||
default one) cannot be read from the per-spider settings. These are applied
|
||||
when the :class:`~scrapy.crawler.CrawlerRunner` or
|
||||
:class:`~scrapy.crawler.CrawlerProcess` object is created.
|
||||
* For :setting:`TWISTED_REACTOR` and :setting:`ASYNCIO_EVENT_LOOP` the first
|
||||
available value is used, and if a spider requests a different reactor an
|
||||
exception will be raised. These are applied when the reactor is installed.
|
||||
* For :setting:`REACTOR_THREADPOOL_MAXSIZE`, :setting:`DNS_RESOLVER` and the
|
||||
ones used by the resolver (:setting:`DNSCACHE_ENABLED`,
|
||||
:setting:`DNSCACHE_SIZE`, :setting:`DNS_TIMEOUT` for ones included in Scrapy)
|
||||
the first available value is used. These are applied when the reactor is
|
||||
started.
|
||||
.. note:: When running multiple spiders in the same process, :ref:`reactor
|
||||
settings <reactor-settings>` should not have a different value per spider.
|
||||
Also, :ref:`pre-crawler settings <pre-crawler-settings>` cannot be defined
|
||||
per spider.
|
||||
|
||||
.. seealso:: :ref:`run-from-script`.
|
||||
|
||||
|
@ -33,42 +33,48 @@ Python :ref:`import search path <tut-searchpath>`.
|
||||
Populating the settings
|
||||
=======================
|
||||
|
||||
Settings can be populated using different mechanisms, each of which having a
|
||||
different precedence. Here is the list of them in decreasing order of
|
||||
precedence:
|
||||
Settings can be populated using different mechanisms, each of which has a
|
||||
different precedence:
|
||||
|
||||
1. Command line options (most precedence)
|
||||
2. Settings per-spider
|
||||
3. Project settings module
|
||||
4. Settings set by add-ons
|
||||
5. Default settings per-command
|
||||
6. Default global settings (less precedence)
|
||||
1. :ref:`Command-line settings <cli-settings>` (highest precedence)
|
||||
2. :ref:`Spider settings <spider-settings>`
|
||||
3. :ref:`Project settings <project-settings>`
|
||||
4. :ref:`Add-on settings <addon-settings>`
|
||||
5. :ref:`Command-specific default settings <cmd-default-settings>`
|
||||
6. :ref:`Global default settings <default-settings>` (lowest precedence)
|
||||
|
||||
The population of these settings sources is taken care of internally, but a
|
||||
manual handling is possible using API calls. See the
|
||||
:ref:`topics-api-settings` topic for reference.
|
||||
.. _cli-settings:
|
||||
|
||||
These mechanisms are described in more detail below.
|
||||
1. Command-line settings
|
||||
------------------------
|
||||
|
||||
1. Command line options
|
||||
-----------------------
|
||||
Settings set in the command line have the highest precedence, overriding any
|
||||
other settings.
|
||||
|
||||
Arguments provided by the command line are the ones that take most precedence,
|
||||
overriding any other options. You can explicitly override one (or more)
|
||||
settings using the ``-s`` (or ``--set``) command line option.
|
||||
You can explicitly override one or more settings using the ``-s`` (or
|
||||
``--set``) command-line option.
|
||||
|
||||
.. highlight:: sh
|
||||
|
||||
Example::
|
||||
|
||||
scrapy crawl myspider -s LOG_FILE=scrapy.log
|
||||
scrapy crawl myspider -s LOG_LEVEL=INFO -s LOG_FILE=scrapy.log
|
||||
|
||||
2. Settings per-spider
|
||||
----------------------
|
||||
.. _spider-settings:
|
||||
|
||||
Spiders (See the :ref:`topics-spiders` chapter for reference) can define their
|
||||
own settings that will take precedence and override the project ones. One way
|
||||
to do so is by setting their :attr:`~scrapy.Spider.custom_settings` attribute:
|
||||
2. Spider settings
|
||||
------------------
|
||||
|
||||
:ref:`Spiders <topics-spiders>` can define their own settings that will take
|
||||
precedence and override the project ones.
|
||||
|
||||
.. note:: :ref:`Pre-crawler settings <pre-crawler-settings>` cannot be defined
|
||||
per spider, and :ref:`reactor settings <reactor-settings>` should not have
|
||||
a different value per spider when :ref:`running multiple spiders in the
|
||||
same process <run-multiple-spiders>`.
|
||||
|
||||
One way to do so is by setting their :attr:`~scrapy.Spider.custom_settings`
|
||||
attribute:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
@ -83,7 +89,7 @@ to do so is by setting their :attr:`~scrapy.Spider.custom_settings` attribute:
|
||||
}
|
||||
|
||||
It's often better to implement :meth:`~scrapy.Spider.update_settings` instead,
|
||||
and settings set there should use the "spider" priority explicitly:
|
||||
and settings set there should use the ``"spider"`` priority explicitly:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
@ -121,27 +127,37 @@ arguments <spiderargs>` or other logic:
|
||||
)
|
||||
return spider
|
||||
|
||||
3. Project settings module
|
||||
--------------------------
|
||||
.. _project-settings:
|
||||
|
||||
The project settings module is the standard configuration file for your Scrapy
|
||||
project, it's where most of your custom settings will be populated. For a
|
||||
standard Scrapy project, this means you'll be adding or changing the settings
|
||||
in the ``settings.py`` file created for your project.
|
||||
3. Project settings
|
||||
-------------------
|
||||
|
||||
4. Settings set by add-ons
|
||||
--------------------------
|
||||
Scrapy projects include a settings module, usually a file called
|
||||
``settings.py``, where you should populate most settings that apply to all your
|
||||
spiders.
|
||||
|
||||
.. seealso:: :ref:`topics-settings-module-envvar`
|
||||
|
||||
.. _addon-settings:
|
||||
|
||||
4. Add-on settings
|
||||
------------------
|
||||
|
||||
:ref:`Add-ons <topics-addons>` can modify settings. They should do this with
|
||||
this priority, though this is not enforced.
|
||||
``"addon"`` priority where possible.
|
||||
|
||||
5. Default settings per-command
|
||||
-------------------------------
|
||||
.. _cmd-default-settings:
|
||||
|
||||
Each :doc:`Scrapy tool </topics/commands>` command can have its own default
|
||||
settings, which override the global default settings. Those custom command
|
||||
settings are specified in the ``default_settings`` attribute of the command
|
||||
class.
|
||||
5. Command-specific default settings
|
||||
------------------------------------
|
||||
|
||||
Each :ref:`Scrapy command <topics-commands>` can have its own default settings,
|
||||
which override the :ref:`global default settings <default-settings>`.
|
||||
|
||||
Those command-specific default settings are specified in the
|
||||
``default_settings`` attribute of each command class.
|
||||
|
||||
.. _default-settings:
|
||||
|
||||
6. Default global settings
|
||||
--------------------------
|
||||
@ -234,6 +250,61 @@ example, proper setting names for a fictional robots.txt extension would be
|
||||
``ROBOTSTXT_ENABLED``, ``ROBOTSTXT_OBEY``, ``ROBOTSTXT_CACHEDIR``, etc.
|
||||
|
||||
|
||||
Special settings
|
||||
================
|
||||
|
||||
The following settings work slightly differently than all other settings.
|
||||
|
||||
.. _pre-crawler-settings:
|
||||
|
||||
Pre-crawler settings
|
||||
--------------------
|
||||
|
||||
**Pre-crawler settings** are settings used before the
|
||||
:class:`~scrapy.crawler.Crawler` object is created.
|
||||
|
||||
These settings cannot be :ref:`set from a spider <spider-settings>`.
|
||||
|
||||
These settings are :setting:`SPIDER_LOADER_CLASS` and settings used by the
|
||||
corresponding :ref:`component <topics-components>`, e.g.
|
||||
:setting:`SPIDER_MODULES` and :setting:`SPIDER_LOADER_WARN_ONLY` for the
|
||||
default component.
|
||||
|
||||
|
||||
.. _reactor-settings:
|
||||
|
||||
Reactor settings
|
||||
----------------
|
||||
|
||||
**Reactor settings** are settings tied to the :doc:`Twisted reactor
|
||||
<twisted:core/howto/reactor-basics>`.
|
||||
|
||||
These settings can be defined from a spider. However, because only one reactor
|
||||
can be used per process, these settings cannot use a different value per spider
|
||||
when :ref:`running multiple spiders in the same process
|
||||
<run-multiple-spiders>`.
|
||||
|
||||
In general, if different spiders define different values, the first defined
|
||||
value is used. However, if two spiders request a different reactor, an
|
||||
exception is raised.
|
||||
|
||||
These settings are:
|
||||
|
||||
- :setting:`ASYNCIO_EVENT_LOOP`
|
||||
|
||||
- :setting:`DNS_RESOLVER` and settings used by the corresponding
|
||||
component, e.g. :setting:`DNSCACHE_ENABLED`, :setting:`DNSCACHE_SIZE`
|
||||
and :setting:`DNS_TIMEOUT` for the default one.
|
||||
|
||||
- :setting:`REACTOR_THREADPOOL_MAXSIZE`
|
||||
|
||||
- :setting:`TWISTED_REACTOR`
|
||||
|
||||
:setting:`ASYNCIO_EVENT_LOOP` and :setting:`TWISTED_REACTOR` are used upon
|
||||
installing the reactor. The rest of the settings are applied when starting
|
||||
the reactor.
|
||||
|
||||
|
||||
.. _topics-settings-ref:
|
||||
|
||||
Built-in settings reference
|
||||
|
@ -9,7 +9,7 @@ from scrapy.utils.misc import build_from_crawler, load_object
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from scrapy.crawler import Crawler
|
||||
from scrapy.settings import Settings
|
||||
from scrapy.settings import BaseSettings, Settings
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -36,7 +36,8 @@ class AddonManager:
|
||||
try:
|
||||
addoncls = load_object(clspath)
|
||||
addon = build_from_crawler(addoncls, self.crawler)
|
||||
addon.update_settings(settings)
|
||||
if hasattr(addon, "update_settings"):
|
||||
addon.update_settings(settings)
|
||||
self.addons.append(addon)
|
||||
except NotConfigured as e:
|
||||
if e.args:
|
||||
@ -52,3 +53,20 @@ class AddonManager:
|
||||
},
|
||||
extra={"crawler": self.crawler},
|
||||
)
|
||||
|
||||
@classmethod
def load_pre_crawler_settings(cls, settings: BaseSettings) -> None:
    """Update early (pre-crawler) settings that do not require a crawler
    instance, such as ``SPIDER_MODULES``.

    Similar to the ``load_settings`` method, this loads each add-on
    configured in the ``ADDONS`` setting and calls its
    ``update_pre_crawler_settings`` class method if present. Unlike
    ``load_settings``, this method has no access to a crawler instance
    or to the manager's add-on list, so add-ons are not instantiated.

    :param settings: The settings object from which to read the early \
        add-on configuration and which add-ons may update in place
    :type settings: :class:`~scrapy.settings.BaseSettings`
    """
    for clspath in build_component_list(settings["ADDONS"]):
        addoncls = load_object(clspath)
        # Only a class method can run this early: no Crawler exists yet,
        # so add-ons defining only update_settings() are skipped here.
        if hasattr(addoncls, "update_pre_crawler_settings"):
            addoncls.update_pre_crawler_settings(settings)
|
||||
|
@ -292,6 +292,7 @@ class CrawlerRunner:
|
||||
def __init__(self, settings: dict[str, Any] | Settings | None = None):
    """Initialize the runner, normalizing ``settings`` into a
    :class:`Settings` object and applying pre-crawler add-on settings
    before the spider loader is created."""
    if settings is None or isinstance(settings, dict):
        settings = Settings(settings)
    # Add-ons may update settings (e.g. SPIDER_MODULES) that are consumed
    # before any Crawler object exists, so apply them first.
    AddonManager.load_pre_crawler_settings(settings)
    self.settings: Settings = settings
    self.spider_loader: SpiderLoader = self._get_spider_loader(settings)
    self._crawlers: set[Crawler] = set()
|
||||
|
@ -97,6 +97,25 @@ class SpiderLoaderTest(unittest.TestCase):
|
||||
self.spider_loader = SpiderLoader.from_settings(settings)
|
||||
assert len(self.spider_loader._spiders) == 0
|
||||
|
||||
def test_load_spider_module_from_addons(self):
    """An add-on's ``update_pre_crawler_settings()`` can set
    ``SPIDER_MODULES`` early enough for the :class:`CrawlerRunner`
    spider loader to discover the spiders it registers."""
    module = "tests.test_spiderloader.spiders_from_addons.spider0"

    class SpiderModuleAddon:
        # Pre-crawler hook: runs before the Crawler object is created,
        # so it can change settings read by the spider loader at startup.
        @classmethod
        def update_pre_crawler_settings(cls, settings):
            settings.set(
                "SPIDER_MODULES",
                [module],
                "project",
            )

    runner = CrawlerRunner({"ADDONS": {SpiderModuleAddon: 1}})

    crawler = runner.create_crawler("spider_from_addon")
    self.assertTrue(issubclass(crawler.spidercls, scrapy.Spider))
    self.assertEqual(crawler.spidercls.name, "spider_from_addon")
    # assertEqual (not assertTrue on a comparison) so a failure reports
    # the actual length instead of just "False is not true".
    self.assertEqual(len(crawler.settings["SPIDER_MODULES"]), 1)
|
||||
|
||||
def test_crawler_runner_loading(self):
|
||||
module = "tests.test_spiderloader.test_spiders.spider1"
|
||||
runner = CrawlerRunner(
|
||||
|
6
tests/test_spiderloader/spiders_from_addons/spider0.py
Normal file
6
tests/test_spiderloader/spiders_from_addons/spider0.py
Normal file
@ -0,0 +1,6 @@
|
||||
from scrapy.spiders import Spider
|
||||
|
||||
|
||||
class SpiderFromAddon(Spider):
    """Fixture spider discovered via an add-on that registers this
    module in ``SPIDER_MODULES`` through ``update_pre_crawler_settings``."""

    # Name looked up by the add-on test via create_crawler().
    name = "spider_from_addon"
    allowed_domains = ["scrapy1.org", "scrapy3.org"]
|
Loading…
x
Reference in New Issue
Block a user