mirror of https://github.com/scrapy/scrapy.git synced 2025-02-27 06:03:58 +00:00

Merge pull request #1190 from Curita/crawlerprocess-docs

CrawlerProcess documentation
Julia Medina 2015-04-29 23:27:18 -03:00
commit 5b884d1bdf
4 changed files with 190 additions and 89 deletions

docs/conf.py

@@ -26,7 +26,10 @@ sys.path.append(path.join(path.dirname(path.dirname(__file__)), "scrapy"))
 # Add any Sphinx extension module names here, as strings. They can be extensions
 # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = ['scrapydocs']
+extensions = [
+    'scrapydocs',
+    'sphinx.ext.autodoc'
+]
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']

docs/topics/api.rst

@@ -99,52 +99,13 @@ how you :ref:`configure the downloader middlewares
 
         Returns a deferred that is fired when the crawl is finished.
 
-.. class:: CrawlerRunner(settings)
-
-    This is a convenient helper class that keeps track of, manages and runs
-    crawlers inside an already setup Twisted `reactor`_.
-
-    The CrawlerRunner object must be instantiated with a
-    :class:`~scrapy.settings.Settings` object.
-
-    This class shouldn't be needed (since Scrapy is responsible of using it
-    accordingly) unless writing scripts that manually handle the crawling
-    process. See :ref:`run-from-script` for an example.
-
-    .. attribute:: crawlers
-
-        Set of :class:`crawlers <scrapy.crawler.Crawler>` created by the
-        :meth:`crawl` method.
-
-    .. method:: crawl(crawler_or_spidercls, \*args, \**kwargs)
-
-        This method runs a crawler with the provided arguments.
-
-        It will keep track of the given crawler so it can be stopped later,
-        while calling its :meth:`Crawler.crawl` method.
-
-        If `crawler_or_spidercls` isn't a :class:`~scrapy.crawler.Crawler`
-        instance, it will try to create one using this parameter as the spider
-        class given to it.
-
-        Returns a deferred that is fired when the crawl is finished.
-
-        :param crawler_or_spidercls: already created crawler, or a spider class
-            or spider's name inside the project to create it
-        :type crawler_or_spidercls: :class:`~scrapy.crawler.Crawler` instance,
-            :class:`~scrapy.spider.Spider` subclass or string
-
-        :param args: arguments to initializate the spider
-        :type args: list
-
-        :param kwargs: keyword arguments to initializate the spider
-        :type kwargs: dict
-
-    .. method:: stop()
-
-        Stops simultaneously all the crawling jobs taking place.
-
-        Returns a deferred that is fired when they all have ended.
+.. autoclass:: CrawlerRunner
+   :members:
+
+.. autoclass:: CrawlerProcess
+   :show-inheritance:
+   :members:
+   :inherited-members:
 
 .. _topics-api-settings:

docs/topics/practices.rst

@@ -18,39 +18,69 @@ the typical way of running Scrapy via ``scrapy crawl``.
 Remember that Scrapy is built on top of the Twisted
 asynchronous networking library, so you need to run it inside the Twisted reactor.
 
-Note that you will also have to shutdown the Twisted reactor yourself after the
-spider is finished. This can be achieved by adding callbacks to the deferred
-returned by the :meth:`CrawlerRunner.crawl
-<scrapy.crawler.CrawlerRunner.crawl>` method.
+First utility you can use to run your spiders is
+:class:`scrapy.crawler.CrawlerProcess`. This class will start a Twisted reactor
+for you, configuring the logging and setting shutdown handlers. This class is
+the one used by all Scrapy commands.
+
+Here's an example showing how to run a single spider with it.
+
+::
+
+    import scrapy
+    from scrapy.crawler import CrawlerProcess
+
+    class MySpider(scrapy.Spider):
+        # Your spider definition
+        ...
+
+    process = CrawlerProcess({
+        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
+    })
+
+    process.crawl(MySpider)
+    process.start() # the script will block here until the crawling is finished
+
+Make sure to check :class:`~scrapy.crawler.CrawlerProcess` documentation to get
+acquainted with its usage details.
+
+If you are inside a Scrapy project there are some additional helpers you can
+use to import those components within the project. You can automatically import
+your spiders passing their name to :class:`~scrapy.crawler.CrawlerProcess`, and
+use ``get_project_settings`` to get a :class:`~scrapy.settings.Settings`
+instance with your project settings.
 
 What follows is a working example of how to do that, using the `testspiders`_
 project as example.
 
 ::
 
-    from twisted.internet import reactor
-    from scrapy.crawler import CrawlerRunner
-    from scrapy.utils.log import configure_logging
+    from scrapy.crawler import CrawlerProcess
     from scrapy.utils.project import get_project_settings
 
-    settings = get_project_settings()
-    configure_logging(settings)
-    runner = CrawlerRunner(settings)
+    process = CrawlerProcess(get_project_settings())
 
     # 'followall' is the name of one of the spiders of the project.
-    d = runner.crawl('followall', domain='scrapinghub.com')
-    d.addBoth(lambda _: reactor.stop())
-    reactor.run() # the script will block here until the crawling is finished
+    process.crawl('testspider', domain='scrapinghub.com')
+    process.start() # the script will block here until the crawling is finished
 
-Running spiders outside projects it's not much different. You have to create a
-generic :class:`~scrapy.settings.Settings` object and populate it as needed
-(See :ref:`topics-settings-ref` for the available settings), instead of using
-the configuration returned by `get_project_settings`.
-
-Spiders can still be referenced by their name if :setting:`SPIDER_MODULES` is
-set with the modules where Scrapy should look for spiders. Otherwise, passing
-the spider class as first argument in the :meth:`CrawlerRunner.crawl
-<scrapy.crawler.CrawlerRunner.crawl>` method is enough.
+There's another Scrapy utility that provides more control over the crawling
+process: :class:`scrapy.crawler.CrawlerRunner`. This class is a thin wrapper
+that encapsulates some simple helpers to run multiple crawlers, but it won't
+start or interfere with existing reactors in any way.
+
+Using this class the reactor should be explicitly run after scheduling your
+spiders. It's recommended you use :class:`~scrapy.crawler.CrawlerRunner`
+instead of :class:`~scrapy.crawler.CrawlerProcess` if your application is
+already using Twisted and you want to run Scrapy in the same reactor.
+
+Note that you will also have to shutdown the Twisted reactor yourself after the
+spider is finished. This can be achieved by adding callbacks to the deferred
+returned by the :meth:`CrawlerRunner.crawl
+<scrapy.crawler.CrawlerRunner.crawl>` method.
+
+Here's an example of its usage, along with a callback to manually stop the
+reactor after `MySpider` has finished running.
 
 ::
@@ -63,7 +93,7 @@ the spider class as first argument in the :meth:`CrawlerRunner.crawl
         # Your spider definition
         ...
 
-    configure_logging(settings)
+    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
     runner = CrawlerRunner({
         'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
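Only the changed lines of that CrawlerRunner example appear in the hunk above; the spider definition and the reactor handling around it are unchanged context that the diff omits. For orientation, a sketch of how the complete example presumably reads, with ``MySpider`` as the usual placeholder spider::

    import scrapy
    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging

    class MySpider(scrapy.Spider):
        # Your spider definition (placeholder)
        ...

    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })

    d = runner.crawl(MySpider)           # crawl() returns a deferred
    d.addBoth(lambda _: reactor.stop())  # stop the reactor once the crawl ends
    reactor.run()                        # blocks here until the crawl is finished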
@@ -83,25 +113,50 @@ By default, Scrapy runs a single spider per process when you run ``scrapy
 crawl``. However, Scrapy supports running multiple spiders per process using
 the :ref:`internal API <topics-api>`.
 
-Here is an example that runs multiple spiders simultaneously, using the
-`testspiders`_ project:
+Here is an example that runs multiple spiders simultaneously:
 
 ::
 
-    from twisted.internet import reactor, defer
+    import scrapy
+    from scrapy.crawler import CrawlerProcess
+
+    class MySpider1(scrapy.Spider):
+        # Your first spider definition
+        ...
+
+    class MySpider2(scrapy.Spider):
+        # Your second spider definition
+        ...
+
+    process = CrawlerProcess({})
+    process.crawl(MySpider1)
+    process.crawl(MySpider2)
+    process.start() # the script will block here until all crawling jobs are finished
+
+Same example using :class:`~scrapy.crawler.CrawlerRunner`:
+
+::
+
+    import scrapy
+    from twisted.internet import reactor
     from scrapy.crawler import CrawlerRunner
     from scrapy.utils.log import configure_logging
-    from scrapy.utils.project import get_project_settings
 
-    settings = get_project_settings()
-    configure_logging(settings)
-    runner = CrawlerRunner(settings)
-    dfs = set()
-    for domain in ['scrapinghub.com', 'insophia.com']:
-        d = runner.crawl('followall', domain=domain)
-        dfs.add(d)
+    class MySpider1(scrapy.Spider):
+        # Your first spider definition
+        ...
+
+    class MySpider2(scrapy.Spider):
+        # Your second spider definition
+        ...
+
+    configure_logging({})
+    runner = CrawlerRunner({})
+    runner.crawl(MySpider1)
+    runner.crawl(MySpider2)
+    d = runner.join()
+    d.addBoth(lambda _: reactor.stop())
 
-    defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
     reactor.run() # the script will block here until all crawling jobs are finished
 
 Same example but running the spiders sequentially by chaining the deferreds:
@@ -111,16 +166,22 @@ Same example but running the spiders sequentially by chaining the deferreds:
 
     from twisted.internet import reactor, defer
     from scrapy.crawler import CrawlerRunner
     from scrapy.utils.log import configure_logging
-    from scrapy.utils.project import get_project_settings
 
-    settings = get_project_settings()
-    configure_logging(settings)
-    runner = CrawlerRunner(settings)
+    class MySpider1(scrapy.Spider):
+        # Your first spider definition
+        ...
+
+    class MySpider2(scrapy.Spider):
+        # Your second spider definition
+        ...
+
+    configure_logging({})
+    runner = CrawlerRunner({})
 
     @defer.inlineCallbacks
     def crawl():
-        for domain in ['scrapinghub.com', 'insophia.com']:
-            yield runner.crawl('followall', domain=domain)
+        yield runner.crawl(MySpider1)
+        yield runner.crawl(MySpider2)
         reactor.stop()
 
     crawl()
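The sequential run above uses ``@defer.inlineCallbacks``. The same ordering can also be written by chaining the deferreds explicitly with callbacks, which some scripts may prefer; a minimal sketch with the same placeholder spiders::

    import scrapy
    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging

    class MySpider1(scrapy.Spider):
        # Your first spider definition (placeholder)
        ...

    class MySpider2(scrapy.Spider):
        # Your second spider definition (placeholder)
        ...

    configure_logging({})
    runner = CrawlerRunner({})

    d = runner.crawl(MySpider1)                       # start the first crawl
    d.addCallback(lambda _: runner.crawl(MySpider2))  # run the second one after it
    d.addBoth(lambda _: reactor.stop())               # stop the reactor at the end
    reactor.run()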

scrapy/crawler.py

@@ -89,13 +89,30 @@ class Crawler(object):
 
 
 class CrawlerRunner(object):
+    """
+    This is a convenient helper class that keeps track of, manages and runs
+    crawlers inside an already setup Twisted `reactor`_.
+
+    The CrawlerRunner object must be instantiated with a
+    :class:`~scrapy.settings.Settings` object.
+
+    This class shouldn't be needed (since Scrapy is responsible of using it
+    accordingly) unless writing scripts that manually handle the crawling
+    process. See :ref:`run-from-script` for an example.
+    """
+
+    crawlers = property(
+        lambda self: self._crawlers,
+        doc="Set of :class:`crawlers <scrapy.crawler.Crawler>` started by "
+            ":meth:`crawl` and managed by this class."
+    )
 
     def __init__(self, settings):
         if isinstance(settings, dict):
             settings = Settings(settings)
         self.settings = settings
         self.spider_loader = _get_spider_loader(settings)
-        self.crawlers = set()
+        self._crawlers = set()
         self._active = set()
 
     @property
@@ -106,6 +123,27 @@ class CrawlerRunner(object):
         return self.spider_loader
 
     def crawl(self, crawler_or_spidercls, *args, **kwargs):
+        """
+        Run a crawler with the provided arguments.
+
+        It will call the given Crawler's :meth:`~Crawler.crawl` method, while
+        keeping track of it so it can be stopped later.
+
+        If `crawler_or_spidercls` isn't a :class:`~scrapy.crawler.Crawler`
+        instance, this method will try to create one using this parameter as
+        the spider class given to it.
+
+        Returns a deferred that is fired when the crawling is finished.
+
+        :param crawler_or_spidercls: already created crawler, or a spider class
+            or spider's name inside the project to create it
+        :type crawler_or_spidercls: :class:`~scrapy.crawler.Crawler` instance,
+            :class:`~scrapy.spider.Spider` subclass or string
+
+        :param list args: arguments to initialize the spider
+
+        :param dict kwargs: keyword arguments to initialize the spider
+        """
         crawler = crawler_or_spidercls
         if not isinstance(crawler_or_spidercls, Crawler):
             crawler = self._create_crawler(crawler_or_spidercls)
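The new docstring lists three accepted forms for ``crawler_or_spidercls``. As a quick illustration (the ``MySpider`` class and the ``category`` argument are hypothetical, and spider names such as ``followall`` only resolve inside a Scrapy project with the matching spiders)::

    import scrapy
    from scrapy.crawler import Crawler, CrawlerRunner
    from scrapy.utils.project import get_project_settings

    class MySpider(scrapy.Spider):
        # Your spider definition (placeholder)
        ...

    settings = get_project_settings()
    runner = CrawlerRunner(settings)

    # Any of these forms is accepted; each call returns a deferred for that crawl.
    runner.crawl(MySpider, category='toys')               # a Spider subclass, with spider arguments
    runner.crawl('followall', domain='scrapinghub.com')   # a spider name defined in the project
    runner.crawl(Crawler(MySpider, settings))             # an already created Crawler instance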
@@ -127,17 +165,44 @@ class CrawlerRunner(object):
         return Crawler(spidercls, self.settings)
 
     def stop(self):
+        """
+        Stops simultaneously all the crawling jobs taking place.
+
+        Returns a deferred that is fired when they all have ended.
+        """
         return defer.DeferredList([c.stop() for c in list(self.crawlers)])
 
     @defer.inlineCallbacks
     def join(self):
-        """Wait for all managed crawlers to complete"""
+        """
+        join()
+
+        Returns a deferred that is fired when all managed :attr:`crawlers` have
+        completed their executions.
+        """
         while self._active:
             yield defer.DeferredList(self._active)
 
 
 class CrawlerProcess(CrawlerRunner):
-    """A class to run multiple scrapy crawlers in a process simultaneously"""
+    """
+    A class to run multiple scrapy crawlers in a process simultaneously.
+
+    This class extends :class:`~scrapy.crawler.CrawlerRunner` by adding support
+    for starting a Twisted `reactor`_ and handling shutdown signals, like the
+    keyboard interrupt command Ctrl-C. It also configures top-level logging.
+
+    This utility should be a better fit than
+    :class:`~scrapy.crawler.CrawlerRunner` if you aren't running another
+    Twisted `reactor`_ within your application.
+
+    The CrawlerProcess object must be instantiated with a
+    :class:`~scrapy.settings.Settings` object.
+
+    This class shouldn't be needed (since Scrapy is responsible of using it
+    accordingly) unless writing scripts that manually handle the crawling
+    process. See :ref:`run-from-script` for an example.
+    """
 
     def __init__(self, settings):
         super(CrawlerProcess, self).__init__(settings)
@@ -161,6 +226,17 @@ class CrawlerProcess(CrawlerRunner):
         reactor.callFromThread(self._stop_reactor)
 
     def start(self, stop_after_crawl=True):
+        """
+        This method starts a Twisted `reactor`_, adjusts its pool size to
+        :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache based
+        on :setting:`DNSCACHE_ENABLED` and :setting:`DNSCACHE_SIZE`.
+
+        If `stop_after_crawl` is True, the reactor will be stopped after all
+        crawlers have finished, using :meth:`join`.
+
+        :param boolean stop_after_crawl: stop or not the reactor when all
+            crawlers have finished
+        """
         if stop_after_crawl:
             d = self.join()
             # Don't start the reactor if the deferreds are already fired
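Per the docstring, ``stop_after_crawl=True`` (the default) stops the reactor through :meth:`join` once every crawl has ended. When a script wants to control shutdown itself, one presumable pattern is to pass ``stop_after_crawl=False`` and stop the reactor from a callback on :meth:`join`; a rough sketch, with ``MySpider`` as a placeholder spider::

    import scrapy
    from twisted.internet import reactor
    from scrapy.crawler import CrawlerProcess

    class MySpider(scrapy.Spider):
        # Your spider definition (placeholder)
        ...

    process = CrawlerProcess({})
    process.crawl(MySpider)
    # join() returns a deferred that fires when all managed crawls have ended
    process.join().addBoth(lambda _: reactor.stop())
    process.start(stop_after_crawl=False)  # blocks until reactor.stop() above runs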