Merge pull request #1190 from Curita/crawlerprocess-docs
CrawlerProcess documentation
commit 5b884d1bdf
@@ -26,7 +26,10 @@ sys.path.append(path.join(path.dirname(path.dirname(__file__)), "scrapy"))

# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = ['scrapydocs']
extensions = [
    'scrapydocs',
    'sphinx.ext.autodoc'
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
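
With ``sphinx.ext.autodoc`` enabled, the ``.. autoclass::`` directives added later
in this diff can pull their text straight from the docstrings in
``scrapy/crawler.py`` instead of duplicating it in the ``.rst`` files. Roughly
speaking (a sketch of the idea only, not how Sphinx is implemented), autodoc
imports the module and renders what introspection returns:

::

    # Sketch: autodoc essentially imports the documented object and reads its
    # docstrings, similar to this manual introspection.
    from scrapy.crawler import CrawlerRunner

    print(CrawlerRunner.__doc__)         # rendered by `.. autoclass:: CrawlerRunner`
    print(CrawlerRunner.crawl.__doc__)   # rendered because of the `:members:` option
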
@@ -99,52 +99,13 @@ how you :ref:`configure the downloader middlewares

    Returns a deferred that is fired when the crawl is finished.

.. class:: CrawlerRunner(settings)
.. autoclass:: CrawlerRunner
   :members:

    This is a convenient helper class that keeps track of, manages and runs
    crawlers inside an already setup Twisted `reactor`_.

    The CrawlerRunner object must be instantiated with a
    :class:`~scrapy.settings.Settings` object.

    This class shouldn't be needed (since Scrapy is responsible for using it
    accordingly) unless writing scripts that manually handle the crawling
    process. See :ref:`run-from-script` for an example.

    .. attribute:: crawlers

        Set of :class:`crawlers <scrapy.crawler.Crawler>` created by the
        :meth:`crawl` method.

    .. method:: crawl(crawler_or_spidercls, \*args, \**kwargs)

        This method runs a crawler with the provided arguments.

        It will keep track of the given crawler so it can be stopped later,
        while calling its :meth:`Crawler.crawl` method.

        If `crawler_or_spidercls` isn't a :class:`~scrapy.crawler.Crawler`
        instance, it will try to create one using this parameter as the spider
        class given to it.

        Returns a deferred that is fired when the crawl is finished.

        :param crawler_or_spidercls: already created crawler, or a spider class
            or spider's name inside the project to create it
        :type crawler_or_spidercls: :class:`~scrapy.crawler.Crawler` instance,
            :class:`~scrapy.spider.Spider` subclass or string

        :param args: arguments to initialize the spider
        :type args: list

        :param kwargs: keyword arguments to initialize the spider
        :type kwargs: dict

    .. method:: stop()

        Simultaneously stops all the crawling jobs taking place.

        Returns a deferred that is fired when they all have ended.
.. autoclass:: CrawlerProcess
   :show-inheritance:
   :members:
   :inherited-members:
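
As a quick illustration of the API documented above, the sketch below creates a
runner, schedules a crawl and reacts to the returned deferred (``MySpider`` is a
hypothetical spider class used only for illustration):

::

    from twisted.internet import reactor
    import scrapy
    from scrapy.crawler import CrawlerRunner
    from scrapy.settings import Settings

    class MySpider(scrapy.Spider):
        # Hypothetical spider, used only to illustrate the API above.
        name = 'myspider'
        start_urls = ['http://example.com']

        def parse(self, response):
            pass  # parsing omitted for brevity

    runner = CrawlerRunner(Settings())
    d = runner.crawl(MySpider)              # returns a deferred
    d.addBoth(lambda _: reactor.stop())     # shut the reactor down when done
    reactor.run()
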
.. _topics-api-settings:
@@ -18,39 +18,69 @@ the typical way of running Scrapy via ``scrapy crawl``.

Remember that Scrapy is built on top of the Twisted
asynchronous networking library, so you need to run it inside the Twisted reactor.

Note that you will also have to shut down the Twisted reactor yourself after the
spider is finished. This can be achieved by adding callbacks to the deferred
returned by the :meth:`CrawlerRunner.crawl
<scrapy.crawler.CrawlerRunner.crawl>` method.
The first utility you can use to run your spiders is
:class:`scrapy.crawler.CrawlerProcess`. This class will start a Twisted reactor
for you, configuring the logging and setting shutdown handlers. This class is
the one used by all Scrapy commands.

Here's an example showing how to run a single spider with it.

::

    import scrapy
    from scrapy.crawler import CrawlerProcess

    class MySpider(scrapy.Spider):
        # Your spider definition
        ...

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })

    process.crawl(MySpider)
    process.start() # the script will block here until the crawling is finished

Make sure to check :class:`~scrapy.crawler.CrawlerProcess` documentation to get
acquainted with its usage details.

If you are inside a Scrapy project there are some additional helpers you can
use to import those components within the project. You can automatically import
your spiders passing their name to :class:`~scrapy.crawler.CrawlerProcess`, and
use ``get_project_settings`` to get a :class:`~scrapy.settings.Settings`
instance with your project settings.

What follows is a working example of how to do that, using the `testspiders`_
project as an example.

::

    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    process = CrawlerProcess(get_project_settings())

    # 'followall' is the name of one of the spiders of the project.
    d = runner.crawl('followall', domain='scrapinghub.com')
    d.addBoth(lambda _: reactor.stop())
    reactor.run() # the script will block here until the crawling is finished
    process.crawl('testspider', domain='scrapinghub.com')
    process.start() # the script will block here until the crawling is finished

Running spiders outside projects is not much different. You have to create a
generic :class:`~scrapy.settings.Settings` object and populate it as needed
(see :ref:`topics-settings-ref` for the available settings), instead of using
the configuration returned by `get_project_settings`.
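
For instance, a minimal sketch of that "generic Settings object" approach (the
spider class and the chosen setting values are placeholders, not something this
diff prescribes):

::

    from scrapy.settings import Settings
    from scrapy.crawler import CrawlerProcess

    settings = Settings()
    settings.set('USER_AGENT', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
    settings.set('DOWNLOAD_DELAY', 0.5)

    process = CrawlerProcess(settings)
    process.crawl(MySpider)   # MySpider defined as in the examples above
    process.start()
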

There's another Scrapy utility that provides more control over the crawling
process: :class:`scrapy.crawler.CrawlerRunner`. This class is a thin wrapper
that encapsulates some simple helpers to run multiple crawlers, but it won't
start or interfere with existing reactors in any way.

Spiders can still be referenced by their name if :setting:`SPIDER_MODULES` is
set with the modules where Scrapy should look for spiders. Otherwise, passing
the spider class as first argument in the :meth:`CrawlerRunner.crawl
<scrapy.crawler.CrawlerRunner.crawl>` method is enough.
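
In other words, both of the following calls are valid (assuming ``MySpider`` is
declared in one of the modules listed in :setting:`SPIDER_MODULES`; the names
here are illustrative only):

::

    # Inside a project with SPIDER_MODULES set: look the spider up by name.
    runner.crawl('myspider', domain='example.com')

    # Outside a project (or whenever you prefer): pass the spider class itself.
    runner.crawl(MySpider, domain='example.com')
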

Using this class, the reactor should be explicitly run after scheduling your
spiders. It's recommended you use :class:`~scrapy.crawler.CrawlerRunner`
instead of :class:`~scrapy.crawler.CrawlerProcess` if your application is
already using Twisted and you want to run Scrapy in the same reactor.

Note that you will also have to shut down the Twisted reactor yourself after the
spider is finished. This can be achieved by adding callbacks to the deferred
returned by the :meth:`CrawlerRunner.crawl
<scrapy.crawler.CrawlerRunner.crawl>` method.

Here's an example of its usage, along with a callback to manually stop the
reactor after `MySpider` has finished running.

::
@@ -63,7 +93,7 @@ the spider class as first argument in the :meth:`CrawlerRunner.crawl
        # Your spider definition
        ...

    configure_logging(settings)
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
@@ -83,25 +113,50 @@ By default, Scrapy runs a single spider per process when you run ``scrapy
crawl``. However, Scrapy supports running multiple spiders per process using
the :ref:`internal API <topics-api>`.

Here is an example that runs multiple spiders simultaneously, using the
`testspiders`_ project:
Here is an example that runs multiple spiders simultaneously:

::

    from twisted.internet import reactor, defer
    import scrapy
    from scrapy.crawler import CrawlerProcess

    class MySpider1(scrapy.Spider):
        # Your first spider definition
        ...

    class MySpider2(scrapy.Spider):
        # Your second spider definition
        ...

    process = CrawlerProcess({})
    process.crawl(MySpider1)
    process.crawl(MySpider2)
    process.start() # the script will block here until all crawling jobs are finished

Same example using :class:`~scrapy.crawler.CrawlerRunner`:

::

    import scrapy
    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    dfs = set()
    for domain in ['scrapinghub.com', 'insophia.com']:
        d = runner.crawl('followall', domain=domain)
        dfs.add(d)
    class MySpider1(scrapy.Spider):
        # Your first spider definition
        ...

    class MySpider2(scrapy.Spider):
        # Your second spider definition
        ...

    configure_logging({})
    runner = CrawlerRunner({})
    runner.crawl(MySpider1)
    runner.crawl(MySpider2)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

    defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
    reactor.run() # the script will block here until all crawling jobs are finished

Same example but running the spiders sequentially by chaining the deferreds:
@@ -111,16 +166,22 @@ Same example but running the spiders sequentially by chaining the deferreds:

    from twisted.internet import reactor, defer
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    class MySpider1(scrapy.Spider):
        # Your first spider definition
        ...

    class MySpider2(scrapy.Spider):
        # Your second spider definition
        ...

    configure_logging({})
    runner = CrawlerRunner({})

    @defer.inlineCallbacks
    def crawl():
        for domain in ['scrapinghub.com', 'insophia.com']:
            yield runner.crawl('followall', domain=domain)
        yield runner.crawl(MySpider1)
        yield runner.crawl(MySpider2)
        reactor.stop()

    crawl()
@@ -89,13 +89,30 @@ class Crawler(object):


class CrawlerRunner(object):
    """
    This is a convenient helper class that keeps track of, manages and runs
    crawlers inside an already setup Twisted `reactor`_.

    The CrawlerRunner object must be instantiated with a
    :class:`~scrapy.settings.Settings` object.

    This class shouldn't be needed (since Scrapy is responsible for using it
    accordingly) unless writing scripts that manually handle the crawling
    process. See :ref:`run-from-script` for an example.
    """

    crawlers = property(
        lambda self: self._crawlers,
        doc="Set of :class:`crawlers <scrapy.crawler.Crawler>` started by "
            ":meth:`crawl` and managed by this class."
    )

    def __init__(self, settings):
        if isinstance(settings, dict):
            settings = Settings(settings)
        self.settings = settings
        self.spider_loader = _get_spider_loader(settings)
        self.crawlers = set()
        self._crawlers = set()
        self._active = set()

    @property
@@ -106,6 +123,27 @@ class CrawlerRunner(object):
        return self.spider_loader

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        """
        Run a crawler with the provided arguments.

        It will call the given Crawler's :meth:`~Crawler.crawl` method, while
        keeping track of it so it can be stopped later.

        If `crawler_or_spidercls` isn't a :class:`~scrapy.crawler.Crawler`
        instance, this method will try to create one using this parameter as
        the spider class given to it.

        Returns a deferred that is fired when the crawling is finished.

        :param crawler_or_spidercls: already created crawler, or a spider class
            or spider's name inside the project to create it
        :type crawler_or_spidercls: :class:`~scrapy.crawler.Crawler` instance,
            :class:`~scrapy.spider.Spider` subclass or string

        :param list args: arguments to initialize the spider

        :param dict kwargs: keyword arguments to initialize the spider
        """
        crawler = crawler_or_spidercls
        if not isinstance(crawler_or_spidercls, Crawler):
            crawler = self._create_crawler(crawler_or_spidercls)
@@ -127,17 +165,44 @@ class CrawlerRunner(object):
        return Crawler(spidercls, self.settings)

    def stop(self):
        """
        Simultaneously stops all the crawling jobs taking place.

        Returns a deferred that is fired when they all have ended.
        """
        return defer.DeferredList([c.stop() for c in list(self.crawlers)])
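
    # A usage sketch for stop() (illustration only, not part of this module):
    # it aborts every scheduled crawl from outside the spiders, for example on
    # a timer. `runner` is assumed to be a CrawlerRunner with crawls already
    # scheduled, and the 60-second delay is an arbitrary value.
    #
    #     from twisted.internet import reactor
    #
    #     def shutdown():
    #         runner.stop().addBoth(lambda _: reactor.stop())
    #
    #     reactor.callLater(60, shutdown)
    #     reactor.run()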

    @defer.inlineCallbacks
    def join(self):
        """Wait for all managed crawlers to complete"""
        """
        join()

        Returns a deferred that is fired when all managed :attr:`crawlers` have
        completed their executions.
        """
        while self._active:
            yield defer.DeferredList(self._active)


class CrawlerProcess(CrawlerRunner):
    """A class to run multiple scrapy crawlers in a process simultaneously"""
    """
    A class to run multiple scrapy crawlers in a process simultaneously.

    This class extends :class:`~scrapy.crawler.CrawlerRunner` by adding support
    for starting a Twisted `reactor`_ and handling shutdown signals, like the
    keyboard interrupt command Ctrl-C. It also configures top-level logging.

    This utility should be a better fit than
    :class:`~scrapy.crawler.CrawlerRunner` if you aren't running another
    Twisted `reactor`_ within your application.

    The CrawlerProcess object must be instantiated with a
    :class:`~scrapy.settings.Settings` object.

    This class shouldn't be needed (since Scrapy is responsible for using it
    accordingly) unless writing scripts that manually handle the crawling
    process. See :ref:`run-from-script` for an example.
    """

    def __init__(self, settings):
        super(CrawlerProcess, self).__init__(settings)
@@ -161,6 +226,17 @@ class CrawlerProcess(CrawlerRunner):
        reactor.callFromThread(self._stop_reactor)

    def start(self, stop_after_crawl=True):
        """
        This method starts a Twisted `reactor`_, adjusts its pool size to
        :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache based
        on :setting:`DNSCACHE_ENABLED` and :setting:`DNSCACHE_SIZE`.

        If `stop_after_crawl` is True, the reactor will be stopped after all
        crawlers have finished, using :meth:`join`.

        :param boolean stop_after_crawl: whether to stop the reactor once all
            crawlers have finished
        """
        if stop_after_crawl:
            d = self.join()
            # Don't start the reactor if the deferreds are already fired
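
    # A usage sketch for stop_after_crawl=False (illustration only, not part of
    # this module): the reactor is NOT stopped when the crawls finish, so some
    # other part of your code has to call reactor.stop() (e.g. via
    # reactor.callFromThread) for start() to return. 'myspider' is a
    # placeholder spider name.
    #
    #     from scrapy.crawler import CrawlerProcess
    #     from scrapy.utils.project import get_project_settings
    #
    #     process = CrawlerProcess(get_project_settings())
    #     process.crawl('myspider')
    #     process.start(stop_after_crawl=False)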