Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-27 03:03:47 +00:00)

Merge pull request #1190 from Curita/crawlerprocess-docs: CrawlerProcess documentation

This commit is contained in commit 5b884d1bdf.
docs/conf.py
@@ -26,7 +26,10 @@ sys.path.append(path.join(path.dirname(path.dirname(__file__)), "scrapy"))
 # Add any Sphinx extension module names here, as strings. They can be extensions
 # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = ['scrapydocs']
+extensions = [
+    'scrapydocs',
+    'sphinx.ext.autodoc'
+]
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
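
A note on this hunk: enabling ``sphinx.ext.autodoc`` means the API pages are now generated by importing the live classes and rendering their docstrings, so scrapy must be importable in the docs build environment. A minimal sketch of what autodoc relies on, assuming only that scrapy is installed (this check script is hypothetical, not part of the PR):

::

    import inspect

    from scrapy.crawler import CrawlerProcess, CrawlerRunner

    for cls in (CrawlerRunner, CrawlerProcess):
        # inspect.getdoc() returns the same cleaned docstring that autodoc
        # renders into the API reference.
        print('%s: %s' % (cls.__name__, inspect.getdoc(cls)))
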
docs/topics/api.rst
@@ -99,52 +99,13 @@ how you :ref:`configure the downloader middlewares
 
 Returns a deferred that is fired when the crawl is finished.
 
-.. class:: CrawlerRunner(settings)
-
-    This is a convenient helper class that keeps track of, manages and runs
-    crawlers inside an already setup Twisted `reactor`_.
-
-    The CrawlerRunner object must be instantiated with a
-    :class:`~scrapy.settings.Settings` object.
-
-    This class shouldn't be needed (since Scrapy is responsible of using it
-    accordingly) unless writing scripts that manually handle the crawling
-    process. See :ref:`run-from-script` for an example.
-
-    .. attribute:: crawlers
-
-       Set of :class:`crawlers <scrapy.crawler.Crawler>` created by the
-       :meth:`crawl` method.
-
-    .. method:: crawl(crawler_or_spidercls, \*args, \**kwargs)
-
-       This method runs a crawler with the provided arguments.
-
-       It will keep track of the given crawler so it can be stopped later,
-       while calling its :meth:`Crawler.crawl` method.
-
-       If `crawler_or_spidercls` isn't a :class:`~scrapy.crawler.Crawler`
-       instance, it will try to create one using this parameter as the spider
-       class given to it.
-
-       Returns a deferred that is fired when the crawl is finished.
-
-       :param crawler_or_spidercls: already created crawler, or a spider class
-           or spider's name inside the project to create it
-       :type crawler_or_spidercls: :class:`~scrapy.crawler.Crawler` instance,
-           :class:`~scrapy.spider.Spider` subclass or string
-
-       :param args: arguments to initializate the spider
-       :type args: list
-
-       :param kwargs: keyword arguments to initializate the spider
-       :type kwargs: dict
-
-    .. method:: stop()
-
-       Stops simultaneously all the crawling jobs taking place.
-
-       Returns a deferred that is fired when they all have ended.
+.. autoclass:: CrawlerRunner
+   :members:
+
+.. autoclass:: CrawlerProcess
+   :show-inheritance:
+   :members:
+   :inherited-members:
 
 .. _topics-api-settings:
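
Both classes keep the contract the old text stated by hand: ``crawl()`` returns a Twisted deferred that fires when the crawl finishes. A minimal sketch of consuming that deferred with CrawlerRunner, assuming a placeholder spider (``MySpider`` and its URL are illustrative only):

::

    from twisted.internet import reactor

    import scrapy
    from scrapy.crawler import CrawlerRunner

    class MySpider(scrapy.Spider):
        name = 'myspider'
        start_urls = ['http://example.com']

        def parse(self, response):
            pass  # your parsing logic

    runner = CrawlerRunner({})
    d = runner.crawl(MySpider)
    # The deferred fires once the crawl ends (success or failure), so it
    # is the natural place to hang shutdown logic.
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
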
docs/topics/practices.rst
@@ -18,39 +18,69 @@ the typical way of running Scrapy via ``scrapy crawl``.
 Remember that Scrapy is built on top of the Twisted
 asynchronous networking library, so you need to run it inside the Twisted reactor.
 
-Note that you will also have to shutdown the Twisted reactor yourself after the
-spider is finished. This can be achieved by adding callbacks to the deferred
-returned by the :meth:`CrawlerRunner.crawl
-<scrapy.crawler.CrawlerRunner.crawl>` method.
+The first utility you can use to run your spiders is
+:class:`scrapy.crawler.CrawlerProcess`. This class will start a Twisted reactor
+for you, configuring the logging and setting shutdown handlers. This class is
+the one used by all Scrapy commands.
+
+Here's an example showing how to run a single spider with it.
+
+::
+
+    import scrapy
+    from scrapy.crawler import CrawlerProcess
+
+    class MySpider(scrapy.Spider):
+        # Your spider definition
+        ...
+
+    process = CrawlerProcess({
+        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
+    })
+
+    process.crawl(MySpider)
+    process.start() # the script will block here until the crawling is finished
+
+Make sure to check the :class:`~scrapy.crawler.CrawlerProcess` documentation to get
+acquainted with its usage details.
+
+If you are inside a Scrapy project there are some additional helpers you can
+use to import those components within the project. You can automatically import
+your spiders passing their name to :class:`~scrapy.crawler.CrawlerProcess`, and
+use ``get_project_settings`` to get a :class:`~scrapy.settings.Settings`
+instance with your project settings.
 
 What follows is a working example of how to do that, using the `testspiders`_
 project as example.
 
 ::
 
-    from twisted.internet import reactor
-    from scrapy.crawler import CrawlerRunner
-    from scrapy.utils.log import configure_logging
+    from scrapy.crawler import CrawlerProcess
     from scrapy.utils.project import get_project_settings
 
-    settings = get_project_settings()
-    configure_logging(settings)
-    runner = CrawlerRunner(settings)
+    process = CrawlerProcess(get_project_settings())
 
     # 'followall' is the name of one of the spiders of the project.
-    d = runner.crawl('followall', domain='scrapinghub.com')
-    d.addBoth(lambda _: reactor.stop())
-    reactor.run() # the script will block here until the crawling is finished
+    process.crawl('followall', domain='scrapinghub.com')
+    process.start() # the script will block here until the crawling is finished
 
-Running spiders outside projects it's not much different. You have to create a
-generic :class:`~scrapy.settings.Settings` object and populate it as needed
-(See :ref:`topics-settings-ref` for the available settings), instead of using
-the configuration returned by `get_project_settings`.
-
-Spiders can still be referenced by their name if :setting:`SPIDER_MODULES` is
-set with the modules where Scrapy should look for spiders. Otherwise, passing
-the spider class as first argument in the :meth:`CrawlerRunner.crawl
-<scrapy.crawler.CrawlerRunner.crawl>` method is enough.
+There's another Scrapy utility that provides more control over the crawling
+process: :class:`scrapy.crawler.CrawlerRunner`. This class is a thin wrapper
+that encapsulates some simple helpers to run multiple crawlers, but it won't
+start or interfere with existing reactors in any way.
+
+Using this class, the reactor should be explicitly run after scheduling your
+spiders. It's recommended you use :class:`~scrapy.crawler.CrawlerRunner`
+instead of :class:`~scrapy.crawler.CrawlerProcess` if your application is
+already using Twisted and you want to run Scrapy in the same reactor.
+
+Note that you will also have to shut down the Twisted reactor yourself after the
+spider is finished. This can be achieved by adding callbacks to the deferred
+returned by the :meth:`CrawlerRunner.crawl
+<scrapy.crawler.CrawlerRunner.crawl>` method.
+
+Here's an example of its usage, along with a callback to manually stop the
+reactor after `MySpider` has finished running.
 
 ::
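
(The extracted diff showed ``process.crawl('testspider', ...)`` against an unchanged comment naming ``'followall'``; the call is normalized to ``'followall'`` above so the example is self-consistent.) The ``domain='scrapinghub.com'`` keyword is forwarded by ``crawl()`` into the spider's ``__init__``. A minimal sketch of a spider accepting such an argument; ``DomainSpider`` is hypothetical and only illustrates the forwarding:

::

    import scrapy
    from scrapy.crawler import CrawlerProcess

    class DomainSpider(scrapy.Spider):
        # Hypothetical spider: 'domain' arrives via process.crawl(..., domain=...)
        name = 'domainspider'

        def __init__(self, domain=None, *args, **kwargs):
            super(DomainSpider, self).__init__(*args, **kwargs)
            self.start_urls = ['http://%s/' % domain]

        def parse(self, response):
            self.log('visited %s' % response.url)

    process = CrawlerProcess({})
    process.crawl(DomainSpider, domain='scrapinghub.com')
    process.start()
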
@@ -63,7 +93,7 @@ the spider class as first argument in the :meth:`CrawlerRunner.crawl
         # Your spider definition
         ...
 
-    configure_logging(settings)
+    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
     runner = CrawlerRunner({
         'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
     })
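
As the hunk above shows, ``configure_logging`` accepts a plain dict of settings, and ``LOG_FORMAT`` feeds the formatter Scrapy installs on the standard library's logging. A minimal standalone sketch, assuming nothing beyond what the diff demonstrates:

::

    import logging

    from scrapy.utils.log import configure_logging

    # Route Scrapy's logging through the stdlib with a custom format,
    # exactly as the example above does before creating the runner.
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    logging.getLogger(__name__).info('logging is configured')
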
@@ -83,25 +113,50 @@ By default, Scrapy runs a single spider per process when you run ``scrapy
 crawl``. However, Scrapy supports running multiple spiders per process using
 the :ref:`internal API <topics-api>`.
 
-Here is an example that runs multiple spiders simultaneously, using the
-`testspiders`_ project:
+Here is an example that runs multiple spiders simultaneously:
 
 ::
 
-    from twisted.internet import reactor, defer
+    import scrapy
+    from scrapy.crawler import CrawlerProcess
+
+    class MySpider1(scrapy.Spider):
+        # Your first spider definition
+        ...
+
+    class MySpider2(scrapy.Spider):
+        # Your second spider definition
+        ...
+
+    process = CrawlerProcess({})
+    process.crawl(MySpider1)
+    process.crawl(MySpider2)
+    process.start() # the script will block here until all crawling jobs are finished
+
+Same example using :class:`~scrapy.crawler.CrawlerRunner`:
+
+::
+
+    import scrapy
+    from twisted.internet import reactor
     from scrapy.crawler import CrawlerRunner
     from scrapy.utils.log import configure_logging
-    from scrapy.utils.project import get_project_settings
 
-    settings = get_project_settings()
-    configure_logging(settings)
-    runner = CrawlerRunner(settings)
-    dfs = set()
-    for domain in ['scrapinghub.com', 'insophia.com']:
-        d = runner.crawl('followall', domain=domain)
-        dfs.add(d)
+    class MySpider1(scrapy.Spider):
+        # Your first spider definition
+        ...
 
-    defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
+    class MySpider2(scrapy.Spider):
+        # Your second spider definition
+        ...
+
+    configure_logging({})
+    runner = CrawlerRunner({})
+    runner.crawl(MySpider1)
+    runner.crawl(MySpider2)
+    d = runner.join()
+    d.addBoth(lambda _: reactor.stop())
+
     reactor.run() # the script will block here until all crawling jobs are finished
 
 Same example but running the spiders sequentially by chaining the deferreds:
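
``runner.join()`` replaces the old ``defer.DeferredList(dfs)`` bookkeeping, but collecting the individual deferreds by hand still works when you want each crawl's outcome separately. A minimal sketch of that older pattern, with placeholder spiders:

::

    from twisted.internet import reactor, defer

    import scrapy
    from scrapy.crawler import CrawlerRunner

    class MySpider1(scrapy.Spider):
        name = 'spider1'
        start_urls = ['http://example.com']

        def parse(self, response):
            pass

    class MySpider2(MySpider1):
        name = 'spider2'

    runner = CrawlerRunner({})
    # Keep each crawl's deferred so their outcomes stay individually visible.
    dfs = [runner.crawl(MySpider1), runner.crawl(MySpider2)]
    defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
    reactor.run()
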
@@ -111,16 +166,22 @@ Same example but running the spiders sequentially by chaining the deferreds:
 
     from twisted.internet import reactor, defer
     from scrapy.crawler import CrawlerRunner
     from scrapy.utils.log import configure_logging
-    from scrapy.utils.project import get_project_settings
 
-    settings = get_project_settings()
-    configure_logging(settings)
-    runner = CrawlerRunner(settings)
+    class MySpider1(scrapy.Spider):
+        # Your first spider definition
+        ...
+
+    class MySpider2(scrapy.Spider):
+        # Your second spider definition
+        ...
+
+    configure_logging({})
+    runner = CrawlerRunner({})
 
     @defer.inlineCallbacks
     def crawl():
-        for domain in ['scrapinghub.com', 'insophia.com']:
-            yield runner.crawl('followall', domain=domain)
+        yield runner.crawl(MySpider1)
+        yield runner.crawl(MySpider2)
         reactor.stop()
 
     crawl()
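
One slip in the new sequential example: it references ``scrapy.Spider`` without an ``import scrapy`` in view. A complete, runnable version of the same chaining pattern (spider bodies are placeholders); the spiders run strictly one after the other because each ``yield`` waits for the previous crawl's deferred:

::

    import scrapy
    from twisted.internet import reactor, defer
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging

    class MySpider1(scrapy.Spider):
        name = 'spider1'
        start_urls = ['http://example.com']

        def parse(self, response):
            pass

    class MySpider2(MySpider1):
        name = 'spider2'

    configure_logging({})
    runner = CrawlerRunner({})

    @defer.inlineCallbacks
    def crawl():
        # Each yield resumes only after the previous crawl's deferred fires,
        # so the spiders run one at a time.
        yield runner.crawl(MySpider1)
        yield runner.crawl(MySpider2)
        reactor.stop()

    crawl()
    reactor.run()
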
scrapy/crawler.py
@@ -89,13 +89,30 @@ class Crawler(object):
 
 
 class CrawlerRunner(object):
+    """
+    This is a convenient helper class that keeps track of, manages and runs
+    crawlers inside an already setup Twisted `reactor`_.
+
+    The CrawlerRunner object must be instantiated with a
+    :class:`~scrapy.settings.Settings` object.
+
+    This class shouldn't be needed (since Scrapy is responsible for using it
+    accordingly) unless writing scripts that manually handle the crawling
+    process. See :ref:`run-from-script` for an example.
+    """
+
+    crawlers = property(
+        lambda self: self._crawlers,
+        doc="Set of :class:`crawlers <scrapy.crawler.Crawler>` started by "
+            ":meth:`crawl` and managed by this class."
+    )
 
     def __init__(self, settings):
         if isinstance(settings, dict):
            settings = Settings(settings)
         self.settings = settings
         self.spider_loader = _get_spider_loader(settings)
-        self.crawlers = set()
+        self._crawlers = set()
         self._active = set()
 
     @property
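
``crawlers`` goes from a plain attribute to a read-only ``property`` built with an explicit ``doc=`` string, plausibly so the ``:members:`` autodoc directive added in api.rst has a docstring to render and so callers cannot rebind the set. A minimal sketch of the same idiom; ``Widget`` is hypothetical, not Scrapy's code:

::

    class Widget(object):
        """Hypothetical class showing the documented read-only property idiom."""

        def __init__(self):
            self._items = set()

        items = property(
            lambda self: self._items,
            doc="Set of items managed by this class."
        )

    w = Widget()
    print(Widget.items.__doc__)  # the doc= string Sphinx can render
    w.items.add('ok')            # the underlying set is still mutable
    # w.items = set()            # rebinding would raise AttributeError
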
@@ -106,6 +123,27 @@ class CrawlerRunner(object):
         return self.spider_loader
 
     def crawl(self, crawler_or_spidercls, *args, **kwargs):
+        """
+        Run a crawler with the provided arguments.
+
+        It will call the given Crawler's :meth:`~Crawler.crawl` method, while
+        keeping track of it so it can be stopped later.
+
+        If `crawler_or_spidercls` isn't a :class:`~scrapy.crawler.Crawler`
+        instance, this method will try to create one using this parameter as
+        the spider class given to it.
+
+        Returns a deferred that is fired when the crawling is finished.
+
+        :param crawler_or_spidercls: already created crawler, or a spider class
+            or spider's name inside the project to create it
+        :type crawler_or_spidercls: :class:`~scrapy.crawler.Crawler` instance,
+            :class:`~scrapy.spider.Spider` subclass or string
+
+        :param list args: arguments to initialize the spider
+
+        :param dict kwargs: keyword arguments to initialize the spider
+        """
         crawler = crawler_or_spidercls
         if not isinstance(crawler_or_spidercls, Crawler):
             crawler = self._create_crawler(crawler_or_spidercls)
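
The docstring above names three accepted shapes for ``crawler_or_spidercls``. A schematic sketch exercising each; the string form only resolves inside a project whose :setting:`SPIDER_MODULES` can find a spider named 'followall', and everything else here is a placeholder:

::

    import scrapy
    from scrapy.crawler import Crawler, CrawlerRunner

    class MySpider(scrapy.Spider):
        name = 'myspider'

    runner = CrawlerRunner({})

    # 1. A Spider subclass: the runner builds the Crawler for you.
    runner.crawl(MySpider)

    # 2. An already created Crawler instance, reused as-is.
    crawler = Crawler(MySpider, runner.settings)
    runner.crawl(crawler)

    # 3. A spider name (string), looked up through the project's spider
    #    loader; this only works inside a Scrapy project.
    # runner.crawl('followall', domain='scrapinghub.com')
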
@@ -127,17 +165,44 @@ class CrawlerRunner(object):
         return Crawler(spidercls, self.settings)
 
     def stop(self):
+        """
+        Simultaneously stops all the crawling jobs taking place.
+
+        Returns a deferred that is fired when they all have ended.
+        """
         return defer.DeferredList([c.stop() for c in list(self.crawlers)])
 
     @defer.inlineCallbacks
     def join(self):
-        """Wait for all managed crawlers to complete"""
+        """
+        join()
+
+        Returns a deferred that is fired when all managed :attr:`crawlers` have
+        completed their executions.
+        """
         while self._active:
             yield defer.DeferredList(self._active)
 
 
 class CrawlerProcess(CrawlerRunner):
-    """A class to run multiple scrapy crawlers in a process simultaneously"""
+    """
+    A class to run multiple scrapy crawlers in a process simultaneously.
+
+    This class extends :class:`~scrapy.crawler.CrawlerRunner` by adding support
+    for starting a Twisted `reactor`_ and handling shutdown signals, like the
+    keyboard interrupt command Ctrl-C. It also configures top-level logging.
+
+    This utility should be a better fit than
+    :class:`~scrapy.crawler.CrawlerRunner` if you aren't running another
+    Twisted `reactor`_ within your application.
+
+    The CrawlerProcess object must be instantiated with a
+    :class:`~scrapy.settings.Settings` object.
+
+    This class shouldn't be needed (since Scrapy is responsible for using it
+    accordingly) unless writing scripts that manually handle the crawling
+    process. See :ref:`run-from-script` for an example.
+    """
 
     def __init__(self, settings):
         super(CrawlerProcess, self).__init__(settings)
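
``stop()`` gives scripted crawls a clean abort path. A minimal sketch cutting every running job off after a timeout; this timed-shutdown use case is an assumption built only from the methods documented above, and the spider is a placeholder:

::

    from twisted.internet import reactor

    import scrapy
    from scrapy.crawler import CrawlerRunner

    class MySpider(scrapy.Spider):
        name = 'myspider'
        start_urls = ['http://example.com']

        def parse(self, response):
            pass

    runner = CrawlerRunner({})
    runner.crawl(MySpider)

    # After 60 seconds, ask every managed crawler to stop; stop() returns
    # a deferred that fires once they have all ended.
    reactor.callLater(60, lambda: runner.stop().addBoth(lambda _: reactor.stop()))
    reactor.run()
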
@@ -161,6 +226,17 @@ class CrawlerProcess(CrawlerRunner):
         reactor.callFromThread(self._stop_reactor)
 
     def start(self, stop_after_crawl=True):
+        """
+        This method starts a Twisted `reactor`_, adjusts its pool size to
+        :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache based
+        on :setting:`DNSCACHE_ENABLED` and :setting:`DNSCACHE_SIZE`.
+
+        If `stop_after_crawl` is True, the reactor will be stopped after all
+        crawlers have finished, using :meth:`join`.
+
+        :param boolean stop_after_crawl: whether to stop the reactor once all
+            crawlers have finished
+        """
         if stop_after_crawl:
             d = self.join()
             # Don't start the reactor if the deferreds are already fired
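
With ``stop_after_crawl=False`` the reactor keeps running after the crawls end, which suits long-lived processes that want to keep scheduling work. A small sketch of that assumed pattern; the manual shutdown timer is illustrative, not part of the PR:

::

    from twisted.internet import reactor

    import scrapy
    from scrapy.crawler import CrawlerProcess

    class MySpider(scrapy.Spider):
        name = 'myspider'
        start_urls = ['http://example.com']

        def parse(self, response):
            pass

    process = CrawlerProcess({})
    process.crawl(MySpider)

    # Keep the reactor alive once the first crawl finishes, so more work
    # can be scheduled; shut it down manually later.
    reactor.callLater(300, reactor.stop)
    process.start(stop_after_crawl=False)
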