import logging
import pprint
import signal
import warnings

from twisted.internet import defer
from zope.interface.exceptions import DoesNotImplement

try:
    # zope >= 5.0 only supports MultipleInvalid
    from zope.interface.exceptions import MultipleInvalid
except ImportError:
    MultipleInvalid = None

from zope.interface.verify import verifyClass

from scrapy import signals, Spider
from scrapy.core.engine import ExecutionEngine
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.extension import ExtensionManager
from scrapy.interfaces import ISpiderLoader
from scrapy.settings import overridden_settings, Settings
from scrapy.signalmanager import SignalManager
from scrapy.utils.log import (
    configure_logging,
    get_scrapy_root_handler,
    install_scrapy_root_handler,
    log_scrapy_info,
    LogCounterHandler,
)
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.ossignal import install_shutdown_handlers, signal_names
from scrapy.utils.reactor import install_reactor, verify_installed_reactor


logger = logging.getLogger(__name__)


class Crawler:

    def __init__(self, spidercls, settings=None):
        if isinstance(spidercls, Spider):
            raise ValueError('The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings:\n%(settings)s",
                    {'settings': pprint.pformat(d)})

        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        if self.crawling:
            raise RuntimeError("Crawling already taking place")
        self.crawling = True

        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()
            raise

    def _create_spider(self, *args, **kwargs):
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        """Starts a graceful stop of the crawler and returns a deferred that is
        fired when the crawler is stopped."""
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
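

# Example usage (a hedged sketch, not part of the original module): a Crawler
# can be built directly when you want to connect signal handlers before the
# crawl starts. `MySpider` and `on_item` are hypothetical names.
#
#     crawler = Crawler(MySpider, {'LOG_LEVEL': 'INFO'})
#     crawler.signals.connect(on_item, signal=signals.item_scraped)
#     d = crawler.crawl()  # returns a Deferred; run it under a Twisted reactor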


class CrawlerRunner:
    """
    This is a convenient helper class that keeps track of, manages and runs
    crawlers inside an already set up :mod:`~twisted.internet.reactor`.

    The CrawlerRunner object must be instantiated with a
    :class:`~scrapy.settings.Settings` object.

    This class shouldn't be needed (since Scrapy is responsible for using it
    accordingly) unless writing scripts that manually handle the crawling
    process. See :ref:`run-from-script` for an example.
    """

    crawlers = property(
        lambda self: self._crawlers,
        doc="Set of :class:`crawlers <scrapy.crawler.Crawler>` started by "
            ":meth:`crawl` and managed by this class."
    )

    @staticmethod
    def _get_spider_loader(settings):
        """Get a SpiderLoader instance from settings."""
        cls_path = settings.get('SPIDER_LOADER_CLASS')
        loader_cls = load_object(cls_path)
        excs = (DoesNotImplement, MultipleInvalid) if MultipleInvalid else DoesNotImplement
        try:
            verifyClass(ISpiderLoader, loader_cls)
        except excs:
            warnings.warn(
                'SPIDER_LOADER_CLASS (previously named SPIDER_MANAGER_CLASS) does '
                'not fully implement scrapy.interfaces.ISpiderLoader interface. '
                'Please add all missing methods to avoid unexpected runtime errors.',
                category=ScrapyDeprecationWarning, stacklevel=2
            )
        return loader_cls.from_settings(settings.frozencopy())
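
    # A minimal sketch of a custom loader that should satisfy the verifyClass()
    # check above (an illustrative assumption, not shipped with Scrapy); the
    # method set follows scrapy.interfaces.ISpiderLoader.
    #
    #     from zope.interface import implementer
    #     from scrapy.interfaces import ISpiderLoader
    #
    #     @implementer(ISpiderLoader)
    #     class DictSpiderLoader:
    #         """Serves spiders from a hard-coded name -> class mapping."""
    #         def __init__(self, spiders=None):
    #             self._spiders = spiders or {}
    #
    #         @classmethod
    #         def from_settings(cls, settings):
    #             return cls()
    #
    #         def load(self, spider_name):
    #             return self._spiders[spider_name]
    #
    #         def list(self):
    #             return list(self._spiders)
    #
    #         def find_by_request(self, request):
    #             # naive: report every known spider as a candidate
    #             return list(self._spiders)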

    def __init__(self, settings=None):
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)
        self.settings = settings
        self.spider_loader = self._get_spider_loader(settings)
        self._crawlers = set()
        self._active = set()
        self.bootstrap_failed = False
        self._handle_twisted_reactor()

    @property
    def spiders(self):
        warnings.warn("CrawlerRunner.spiders attribute is renamed to "
                      "CrawlerRunner.spider_loader.",
                      category=ScrapyDeprecationWarning, stacklevel=2)
        return self.spider_loader

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        """
        Run a crawler with the provided arguments.

        It will call the given Crawler's :meth:`~Crawler.crawl` method, while
        keeping track of it so it can be stopped later.

        If ``crawler_or_spidercls`` isn't a :class:`~scrapy.crawler.Crawler`
        instance, this method will try to create one using this parameter as
        the spider class given to it.

        Returns a deferred that is fired when the crawling is finished.

        :param crawler_or_spidercls: already created crawler, or a spider class
            or spider's name inside the project to create it
        :type crawler_or_spidercls: :class:`~scrapy.crawler.Crawler` instance,
            :class:`~scrapy.spiders.Spider` subclass or string

        :param list args: arguments to initialize the spider

        :param dict kwargs: keyword arguments to initialize the spider
        """
        if isinstance(crawler_or_spidercls, Spider):
            raise ValueError(
                'The crawler_or_spidercls argument cannot be a spider object, '
                'it must be a spider class (or a Crawler object)')
        crawler = self.create_crawler(crawler_or_spidercls)
        return self._crawl(crawler, *args, **kwargs)
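
    # Accepted argument forms for crawl(), as a hedged sketch (`MySpider` and
    # 'myspider' are hypothetical names):
    #
    #     runner.crawl(MySpider, category='books')   # spider class; kwargs go to the spider
    #     runner.crawl('myspider')                   # spider name resolved via the spider loader
    #     runner.crawl(Crawler(MySpider, settings))  # a pre-built Crawler instance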

    def _crawl(self, crawler, *args, **kwargs):
        self.crawlers.add(crawler)
        d = crawler.crawl(*args, **kwargs)
        self._active.add(d)

        def _done(result):
            self.crawlers.discard(crawler)
            self._active.discard(d)
            self.bootstrap_failed |= not getattr(crawler, 'spider', None)
            return result

        return d.addBoth(_done)

    def create_crawler(self, crawler_or_spidercls):
        """
        Return a :class:`~scrapy.crawler.Crawler` object.

        * If ``crawler_or_spidercls`` is a Crawler, it is returned as-is.
        * If ``crawler_or_spidercls`` is a Spider subclass, a new Crawler
          is constructed for it.
        * If ``crawler_or_spidercls`` is a string, this function finds
          a spider with this name in a Scrapy project (using spider loader),
          then creates a Crawler instance for it.
        """
        if isinstance(crawler_or_spidercls, Spider):
            raise ValueError(
                'The crawler_or_spidercls argument cannot be a spider object, '
                'it must be a spider class (or a Crawler object)')
        if isinstance(crawler_or_spidercls, Crawler):
            return crawler_or_spidercls
        return self._create_crawler(crawler_or_spidercls)

    def _create_crawler(self, spidercls):
        if isinstance(spidercls, str):
            spidercls = self.spider_loader.load(spidercls)
        return Crawler(spidercls, self.settings)

    def stop(self):
        """
        Simultaneously stops all the crawling jobs taking place.

        Returns a deferred that is fired when they all have ended.
        """
        return defer.DeferredList([c.stop() for c in list(self.crawlers)])

    @defer.inlineCallbacks
    def join(self):
        """
        join()

        Returns a deferred that is fired when all managed :attr:`crawlers` have
        completed their executions.
        """
        while self._active:
            yield defer.DeferredList(self._active)

    def _handle_twisted_reactor(self):
        if self.settings.get("TWISTED_REACTOR"):
            verify_installed_reactor(self.settings["TWISTED_REACTOR"])
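

# Example usage (a hedged sketch under the assumption that `MySpider` exists
# and logging is already configured): the typical CrawlerRunner pattern is to
# schedule crawls and stop an externally managed reactor once they finish.
#
#     from twisted.internet import reactor
#     runner = CrawlerRunner({'USER_AGENT': 'example-bot (+https://example.com)'})
#     d = runner.crawl(MySpider)
#     d.addBoth(lambda _: reactor.stop())
#     reactor.run()  # the script blocks here until the crawl finishes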


class CrawlerProcess(CrawlerRunner):
    """
    A class to run multiple scrapy crawlers in a process simultaneously.

    This class extends :class:`~scrapy.crawler.CrawlerRunner` by adding support
    for starting a :mod:`~twisted.internet.reactor` and handling shutdown
    signals, like the keyboard interrupt command Ctrl-C. It also configures
    top-level logging.

    This utility should be a better fit than
    :class:`~scrapy.crawler.CrawlerRunner` if you aren't running another
    :mod:`~twisted.internet.reactor` within your application.

    The CrawlerProcess object must be instantiated with a
    :class:`~scrapy.settings.Settings` object.

    :param install_root_handler: whether to install root logging handler
        (default: True)

    This class shouldn't be needed (since Scrapy is responsible for using it
    accordingly) unless writing scripts that manually handle the crawling
    process. See :ref:`run-from-script` for an example.
    """

    def __init__(self, settings=None, install_root_handler=True):
        super().__init__(settings)
        install_shutdown_handlers(self._signal_shutdown)
        configure_logging(self.settings, install_root_handler)
        log_scrapy_info(self.settings)

    def _signal_shutdown(self, signum, _):
        from twisted.internet import reactor
        install_shutdown_handlers(self._signal_kill)
        signame = signal_names[signum]
        logger.info("Received %(signame)s, shutting down gracefully. "
                    "Send again to force unclean shutdown",
                    {'signame': signame})
        reactor.callFromThread(self._graceful_stop_reactor)

    def _signal_kill(self, signum, _):
        from twisted.internet import reactor
        install_shutdown_handlers(signal.SIG_IGN)
        signame = signal_names[signum]
        logger.info('Received %(signame)s twice, forcing unclean shutdown',
                    {'signame': signame})
        reactor.callFromThread(self._stop_reactor)

    def start(self, stop_after_crawl=True):
        """
        This method starts a :mod:`~twisted.internet.reactor`, adjusts its pool
        size to :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache
        based on :setting:`DNSCACHE_ENABLED` and :setting:`DNSCACHE_SIZE`.

        If ``stop_after_crawl`` is True, the reactor will be stopped after all
        crawlers have finished, using :meth:`join`.

        :param bool stop_after_crawl: whether to stop the reactor after all
            crawlers have finished
        """
        from twisted.internet import reactor
        if stop_after_crawl:
            d = self.join()
            # Don't start the reactor if the deferreds are already fired
            if d.called:
                return
            d.addBoth(self._stop_reactor)

        resolver_class = load_object(self.settings["DNS_RESOLVER"])
        resolver = create_instance(resolver_class, self.settings, self, reactor=reactor)
        resolver.install_on_reactor()
        tp = reactor.getThreadPool()
        tp.adjustPoolsize(maxthreads=self.settings.getint('REACTOR_THREADPOOL_MAXSIZE'))
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        reactor.run(installSignalHandlers=False)  # blocking call

    def _graceful_stop_reactor(self):
        d = self.stop()
        d.addBoth(self._stop_reactor)
        return d

    def _stop_reactor(self, _=None):
        from twisted.internet import reactor
        try:
            reactor.stop()
        except RuntimeError:  # raised if already stopped or in shutdown stage
            pass

    def _handle_twisted_reactor(self):
        if self.settings.get("TWISTED_REACTOR"):
            install_reactor(self.settings["TWISTED_REACTOR"])
        super()._handle_twisted_reactor()
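

# Example usage (a hedged sketch; `MySpider` is a hypothetical spider class):
# unlike CrawlerRunner, CrawlerProcess starts and stops the reactor itself.
#
#     process = CrawlerProcess({'USER_AGENT': 'example-bot (+https://example.com)'})
#     process.crawl(MySpider)
#     process.start()  # blocks until all crawls are finished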