Merge pull request #1190 from Curita/crawlerprocess-docs
CrawlerProcess documentation
commit 5b884d1bdf
@@ -26,7 +26,10 @@ sys.path.append(path.join(path.dirname(path.dirname(__file__)), "scrapy"))

# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = ['scrapydocs']
extensions = [
    'scrapydocs',
    'sphinx.ext.autodoc'
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
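
With ``sphinx.ext.autodoc`` enabled, the ``.. autoclass::`` directives added later
in this diff can pull their text straight from the docstrings in
``scrapy/crawler.py`` instead of duplicating it in the ``.rst`` files. Roughly
speaking (a sketch of the idea only, not how Sphinx is implemented), autodoc
imports the module and renders what introspection returns:

::

    # Sketch: autodoc essentially imports the documented object and reads its
    # docstrings, similar to this manual introspection.
    from scrapy.crawler import CrawlerRunner

    print(CrawlerRunner.__doc__)         # rendered by `.. autoclass:: CrawlerRunner`
    print(CrawlerRunner.crawl.__doc__)   # rendered because of the `:members:` option
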
@@ -99,52 +99,13 @@ how you :ref:`configure the downloader middlewares

    Returns a deferred that is fired when the crawl is finished.

.. class:: CrawlerRunner(settings)
.. autoclass:: CrawlerRunner
   :members:

    This is a convenient helper class that keeps track of, manages and runs
    crawlers inside an already setup Twisted `reactor`_.

    The CrawlerRunner object must be instantiated with a
    :class:`~scrapy.settings.Settings` object.

    This class shouldn't be needed (since Scrapy is responsible for using it
    accordingly) unless writing scripts that manually handle the crawling
    process. See :ref:`run-from-script` for an example.

    .. attribute:: crawlers

        Set of :class:`crawlers <scrapy.crawler.Crawler>` created by the
        :meth:`crawl` method.

    .. method:: crawl(crawler_or_spidercls, \*args, \**kwargs)

        This method runs a crawler with the provided arguments.

        It will keep track of the given crawler so it can be stopped later,
        while calling its :meth:`Crawler.crawl` method.

        If `crawler_or_spidercls` isn't a :class:`~scrapy.crawler.Crawler`
        instance, it will try to create one using this parameter as the spider
        class given to it.

        Returns a deferred that is fired when the crawl is finished.

        :param crawler_or_spidercls: already created crawler, or a spider class
            or spider's name inside the project to create it
        :type crawler_or_spidercls: :class:`~scrapy.crawler.Crawler` instance,
            :class:`~scrapy.spider.Spider` subclass or string

        :param args: arguments to initialize the spider
        :type args: list

        :param kwargs: keyword arguments to initialize the spider
        :type kwargs: dict

    .. method:: stop()

        Simultaneously stops all the crawling jobs taking place.

        Returns a deferred that is fired when they all have ended.
.. autoclass:: CrawlerProcess
   :show-inheritance:
   :members:
   :inherited-members:
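
As a quick illustration of the API documented above, the sketch below creates a
runner, schedules a crawl and reacts to the returned deferred (``MySpider`` is a
hypothetical spider class used only for illustration):

::

    from twisted.internet import reactor
    import scrapy
    from scrapy.crawler import CrawlerRunner
    from scrapy.settings import Settings

    class MySpider(scrapy.Spider):
        # Hypothetical spider, used only to illustrate the API above.
        name = 'myspider'
        start_urls = ['http://example.com']

        def parse(self, response):
            pass  # parsing omitted for brevity

    runner = CrawlerRunner(Settings())
    d = runner.crawl(MySpider)              # returns a deferred
    d.addBoth(lambda _: reactor.stop())     # shut the reactor down when done
    reactor.run()
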
.. _topics-api-settings:
@@ -18,39 +18,69 @@ the typical way of running Scrapy via ``scrapy crawl``.

Remember that Scrapy is built on top of the Twisted
asynchronous networking library, so you need to run it inside the Twisted reactor.

Note that you will also have to shut down the Twisted reactor yourself after the
spider is finished. This can be achieved by adding callbacks to the deferred
returned by the :meth:`CrawlerRunner.crawl
<scrapy.crawler.CrawlerRunner.crawl>` method.
The first utility you can use to run your spiders is
:class:`scrapy.crawler.CrawlerProcess`. This class will start a Twisted reactor
for you, configuring the logging and setting shutdown handlers. This class is
the one used by all Scrapy commands.

Here's an example showing how to run a single spider with it.

::

    import scrapy
    from scrapy.crawler import CrawlerProcess

    class MySpider(scrapy.Spider):
        # Your spider definition
        ...

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })

    process.crawl(MySpider)
    process.start() # the script will block here until the crawling is finished

Make sure to check :class:`~scrapy.crawler.CrawlerProcess` documentation to get
acquainted with its usage details.

If you are inside a Scrapy project there are some additional helpers you can
use to import those components within the project. You can automatically import
your spiders passing their name to :class:`~scrapy.crawler.CrawlerProcess`, and
use ``get_project_settings`` to get a :class:`~scrapy.settings.Settings`
instance with your project settings.

What follows is a working example of how to do that, using the `testspiders`_
project as an example.

::

    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    process = CrawlerProcess(get_project_settings())

    # 'followall' is the name of one of the spiders of the project.
    d = runner.crawl('followall', domain='scrapinghub.com')
    d.addBoth(lambda _: reactor.stop())
    reactor.run() # the script will block here until the crawling is finished
    process.crawl('testspider', domain='scrapinghub.com')
    process.start() # the script will block here until the crawling is finished

Running spiders outside projects is not much different. You have to create a
generic :class:`~scrapy.settings.Settings` object and populate it as needed
(see :ref:`topics-settings-ref` for the available settings), instead of using
the configuration returned by `get_project_settings`.
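
For instance, a minimal sketch of that "generic Settings object" approach (the
spider class and the chosen setting values are placeholders, not something this
diff prescribes):

::

    from scrapy.settings import Settings
    from scrapy.crawler import CrawlerProcess

    settings = Settings()
    settings.set('USER_AGENT', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
    settings.set('DOWNLOAD_DELAY', 0.5)

    process = CrawlerProcess(settings)
    process.crawl(MySpider)   # MySpider defined as in the examples above
    process.start()
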

There's another Scrapy utility that provides more control over the crawling
process: :class:`scrapy.crawler.CrawlerRunner`. This class is a thin wrapper
that encapsulates some simple helpers to run multiple crawlers, but it won't
start or interfere with existing reactors in any way.

Spiders can still be referenced by their name if :setting:`SPIDER_MODULES` is
set with the modules where Scrapy should look for spiders. Otherwise, passing
the spider class as first argument in the :meth:`CrawlerRunner.crawl
<scrapy.crawler.CrawlerRunner.crawl>` method is enough.
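
In other words, both of the following calls are valid (assuming ``MySpider`` is
declared in one of the modules listed in :setting:`SPIDER_MODULES`; the names
here are illustrative only):

::

    # Inside a project with SPIDER_MODULES set: look the spider up by name.
    runner.crawl('myspider', domain='example.com')

    # Outside a project (or whenever you prefer): pass the spider class itself.
    runner.crawl(MySpider, domain='example.com')
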

Using this class, the reactor should be explicitly run after scheduling your
spiders. It's recommended you use :class:`~scrapy.crawler.CrawlerRunner`
instead of :class:`~scrapy.crawler.CrawlerProcess` if your application is
already using Twisted and you want to run Scrapy in the same reactor.

Note that you will also have to shut down the Twisted reactor yourself after the
spider is finished. This can be achieved by adding callbacks to the deferred
returned by the :meth:`CrawlerRunner.crawl
<scrapy.crawler.CrawlerRunner.crawl>` method.

Here's an example of its usage, along with a callback to manually stop the
reactor after `MySpider` has finished running.

::
@@ -63,7 +93,7 @@ the spider class as first argument in the :meth:`CrawlerRunner.crawl
        # Your spider definition
        ...

    configure_logging(settings)
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
@@ -83,25 +113,50 @@ By default, Scrapy runs a single spider per process when you run ``scrapy
crawl``. However, Scrapy supports running multiple spiders per process using
the :ref:`internal API <topics-api>`.

Here is an example that runs multiple spiders simultaneously, using the
`testspiders`_ project:
Here is an example that runs multiple spiders simultaneously:

::

    from twisted.internet import reactor, defer
    import scrapy
    from scrapy.crawler import CrawlerProcess

    class MySpider1(scrapy.Spider):
        # Your first spider definition
        ...

    class MySpider2(scrapy.Spider):
        # Your second spider definition
        ...

    process = CrawlerProcess({})
    process.crawl(MySpider1)
    process.crawl(MySpider2)
    process.start() # the script will block here until all crawling jobs are finished

Same example using :class:`~scrapy.crawler.CrawlerRunner`:

::

    import scrapy
    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    dfs = set()
    for domain in ['scrapinghub.com', 'insophia.com']:
        d = runner.crawl('followall', domain=domain)
        dfs.add(d)
    class MySpider1(scrapy.Spider):
        # Your first spider definition
        ...

    class MySpider2(scrapy.Spider):
        # Your second spider definition
        ...

    configure_logging({})
    runner = CrawlerRunner({})
    runner.crawl(MySpider1)
    runner.crawl(MySpider2)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

    defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
    reactor.run() # the script will block here until all crawling jobs are finished

Same example but running the spiders sequentially by chaining the deferreds:
@@ -111,16 +166,22 @@ Same example but running the spiders sequentially by chaining the deferreds:

    from twisted.internet import reactor, defer
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    class MySpider1(scrapy.Spider):
        # Your first spider definition
        ...

    class MySpider2(scrapy.Spider):
        # Your second spider definition
        ...

    configure_logging({})
    runner = CrawlerRunner({})

    @defer.inlineCallbacks
    def crawl():
        for domain in ['scrapinghub.com', 'insophia.com']:
            yield runner.crawl('followall', domain=domain)
        yield runner.crawl(MySpider1)
        yield runner.crawl(MySpider2)
        reactor.stop()

    crawl()
@@ -89,13 +89,30 @@ class Crawler(object):


class CrawlerRunner(object):
    """
    This is a convenient helper class that keeps track of, manages and runs
    crawlers inside an already setup Twisted `reactor`_.

    The CrawlerRunner object must be instantiated with a
    :class:`~scrapy.settings.Settings` object.

    This class shouldn't be needed (since Scrapy is responsible for using it
    accordingly) unless writing scripts that manually handle the crawling
    process. See :ref:`run-from-script` for an example.
    """

    crawlers = property(
        lambda self: self._crawlers,
        doc="Set of :class:`crawlers <scrapy.crawler.Crawler>` started by "
            ":meth:`crawl` and managed by this class."
    )

    def __init__(self, settings):
        if isinstance(settings, dict):
            settings = Settings(settings)
        self.settings = settings
        self.spider_loader = _get_spider_loader(settings)
        self.crawlers = set()
        self._crawlers = set()
        self._active = set()

    @property
@@ -106,6 +123,27 @@ class CrawlerRunner(object):
        return self.spider_loader

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        """
        Run a crawler with the provided arguments.

        It will call the given Crawler's :meth:`~Crawler.crawl` method, while
        keeping track of it so it can be stopped later.

        If `crawler_or_spidercls` isn't a :class:`~scrapy.crawler.Crawler`
        instance, this method will try to create one using this parameter as
        the spider class given to it.

        Returns a deferred that is fired when the crawling is finished.

        :param crawler_or_spidercls: already created crawler, or a spider class
            or spider's name inside the project to create it
        :type crawler_or_spidercls: :class:`~scrapy.crawler.Crawler` instance,
            :class:`~scrapy.spider.Spider` subclass or string

        :param list args: arguments to initialize the spider

        :param dict kwargs: keyword arguments to initialize the spider
        """
        crawler = crawler_or_spidercls
        if not isinstance(crawler_or_spidercls, Crawler):
            crawler = self._create_crawler(crawler_or_spidercls)
@@ -127,17 +165,44 @@ class CrawlerRunner(object):
        return Crawler(spidercls, self.settings)

    def stop(self):
        """
        Simultaneously stops all the crawling jobs taking place.

        Returns a deferred that is fired when they all have ended.
        """
        return defer.DeferredList([c.stop() for c in list(self.crawlers)])
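
    # A usage sketch for stop() (illustration only, not part of this module):
    # it aborts every scheduled crawl from outside the spiders, for example on
    # a timer. `runner` is assumed to be a CrawlerRunner with crawls already
    # scheduled, and the 60-second delay is an arbitrary value.
    #
    #     from twisted.internet import reactor
    #
    #     def shutdown():
    #         runner.stop().addBoth(lambda _: reactor.stop())
    #
    #     reactor.callLater(60, shutdown)
    #     reactor.run()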

    @defer.inlineCallbacks
    def join(self):
        """Wait for all managed crawlers to complete"""
        """
        join()

        Returns a deferred that is fired when all managed :attr:`crawlers` have
        completed their executions.
        """
        while self._active:
            yield defer.DeferredList(self._active)


class CrawlerProcess(CrawlerRunner):
    """A class to run multiple scrapy crawlers in a process simultaneously"""
    """
    A class to run multiple scrapy crawlers in a process simultaneously.

    This class extends :class:`~scrapy.crawler.CrawlerRunner` by adding support
    for starting a Twisted `reactor`_ and handling shutdown signals, like the
    keyboard interrupt command Ctrl-C. It also configures top-level logging.

    This utility should be a better fit than
    :class:`~scrapy.crawler.CrawlerRunner` if you aren't running another
    Twisted `reactor`_ within your application.

    The CrawlerProcess object must be instantiated with a
    :class:`~scrapy.settings.Settings` object.

    This class shouldn't be needed (since Scrapy is responsible for using it
    accordingly) unless writing scripts that manually handle the crawling
    process. See :ref:`run-from-script` for an example.
    """

    def __init__(self, settings):
        super(CrawlerProcess, self).__init__(settings)
@@ -161,6 +226,17 @@ class CrawlerProcess(CrawlerRunner):
        reactor.callFromThread(self._stop_reactor)

    def start(self, stop_after_crawl=True):
        """
        This method starts a Twisted `reactor`_, adjusts its pool size to
        :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache based
        on :setting:`DNSCACHE_ENABLED` and :setting:`DNSCACHE_SIZE`.

        If `stop_after_crawl` is True, the reactor will be stopped after all
        crawlers have finished, using :meth:`join`.

        :param boolean stop_after_crawl: whether to stop the reactor once all
            crawlers have finished
        """
        if stop_after_crawl:
            d = self.join()
            # Don't start the reactor if the deferreds are already fired
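
    # A usage sketch for stop_after_crawl=False (illustration only, not part of
    # this module): the reactor is NOT stopped when the crawls finish, so some
    # other part of your code has to call reactor.stop() (e.g. via
    # reactor.callFromThread) for start() to return. 'myspider' is a
    # placeholder spider name.
    #
    #     from scrapy.crawler import CrawlerProcess
    #     from scrapy.utils.project import get_project_settings
    #
    #     process = CrawlerProcess(get_project_settings())
    #     process.crawl('myspider')
    #     process.start(stop_after_crawl=False)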