# scrapy/crawler.py -- from a mirror of https://github.com/scrapy/scrapy.git
import signal

from itertools import chain

from twisted.internet import reactor, defer

from scrapy.core.engine import ExecutionEngine
from scrapy.resolver import CachingThreadedResolver
from scrapy.extension import ExtensionManager
from scrapy.signalmanager import SignalManager
from scrapy.utils.ossignal import install_shutdown_handlers, signal_names
from scrapy.utils.misc import load_object
from scrapy import log, signals


class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)
        self._scheduled = {}

    def install(self):
        import scrapy.project
        assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return

        self.configured = True
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)
        if self.configured and self.engine.running:
            assert not self._scheduled
            return self._schedule(spider, requests)
        elif requests is None:
            self._scheduled[spider] = None
        else:
            self._scheduled.setdefault(spider, []).append(requests)

    def _schedule(self, spider, batches=()):
        requests = chain.from_iterable(batches) \
            if batches else spider.start_requests()
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        for spider, batches in self._scheduled.iteritems():
            yield self._schedule(spider, batches)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
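
# A minimal usage sketch for driving a single Crawler directly, assuming the
# caller manages the Twisted reactor; `settings` (a populated settings object)
# and MySpider are hypothetical placeholders, shown only for illustration:
#
#     crawler = Crawler(settings)
#     crawler.configure()
#     crawler.crawl(MySpider())
#     crawler.signals.connect(reactor.stop, signals.engine_stopped)
#     crawler.start()  # returns a Deferred; the engine runs once the reactor does
#     reactor.run()
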

class ProcessMixin(object):
    """ Mixin which provides automatic control of the Twisted reactor and
    installs some convenient signals for shutting it down
    """

    def __init__(self, *a, **kw):
        install_shutdown_handlers(self._signal_shutdown)

    def start(self):
        if self.start_crawling():
            self.start_reactor()

    def start_reactor(self):
        if self.settings.getbool('DNSCACHE_ENABLED'):
            reactor.installResolver(CachingThreadedResolver(reactor))
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        reactor.run(installSignalHandlers=False)  # blocking call

    def start_crawling(self):
        raise NotImplementedError

    def stop(self):
        raise NotImplementedError

    def stop_reactor(self, _=None):
        try:
            reactor.stop()
        except RuntimeError:  # raised if already stopped or in shutdown stage
            pass

    def _signal_shutdown(self, signum, _):
        install_shutdown_handlers(self._signal_kill)
        signame = signal_names[signum]
        log.msg(format="Received %(signame)s, shutting down gracefully. Send again to force ",
                level=log.INFO, signame=signame)
        reactor.callFromThread(self.stop)

    def _signal_kill(self, signum, _):
        install_shutdown_handlers(signal.SIG_IGN)
        signame = signal_names[signum]
        log.msg(format='Received %(signame)s twice, forcing unclean shutdown',
                level=log.INFO, signame=signame)
        reactor.callFromThread(self.stop_reactor)
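
# A minimal sketch of the contract a ProcessMixin subclass must fulfil: a
# start_crawling() hook that kicks off work and returns something truthy, and
# a stop() hook invoked on shutdown. OneShotProcess is a hypothetical example
# for illustration only; CrawlerProcess below is the real subclass:
#
#     class OneShotProcess(ProcessMixin):
#
#         def __init__(self, settings):
#             super(OneShotProcess, self).__init__()
#             self.settings = settings  # start_reactor() reads DNSCACHE_ENABLED
#
#         def start_crawling(self):
#             reactor.callLater(0, self.stop)  # pretend the work finishes at once
#             return True
#
#         def stop(self):
#             self.stop_reactor()
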

class CrawlerProcess(ProcessMixin):
    """ A class to run multiple scrapy crawlers in a process sequentially
    """

    def __init__(self, settings):
        super(CrawlerProcess, self).__init__(settings)
        self.settings = settings
        self.crawlers = {}
        self.stopping = False

    def create_crawler(self, name=None):
        if name not in self.crawlers:
            self.crawlers[name] = Crawler(self.settings)
        return self.crawlers[name]

    def start_crawler(self):
        if self.crawlers and not self.stopping:
            name, crawler = self.crawlers.popitem()
            sflo = log.start_from_crawler(crawler)
            crawler.configure()
            crawler.install()
            crawler.signals.connect(crawler.uninstall, signals.engine_stopped)
            if sflo:
                crawler.signals.connect(sflo.stop, signals.engine_stopped)
            crawler.signals.connect(self.check_done, signals.engine_stopped)
            crawler.start()
            return name, crawler

    def check_done(self, **kwargs):
        if not self.start_crawler():
            self.stop_reactor()

    def start_crawling(self):
        log.scrapy_info(self.settings)
        return self.start_crawler() is not None

    @defer.inlineCallbacks
    def stop(self):
        self.stopping = True
        for crawler in self.crawlers.itervalues():
            if crawler.configured:
                yield crawler.stop()
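
# A minimal usage sketch for CrawlerProcess, assuming MySpider is a
# placeholder spider class and `settings` a populated settings object
# (e.g. obtained from the project); for illustration only, not part of
# this module:
#
#     process = CrawlerProcess(settings)
#     crawler = process.create_crawler()
#     crawler.crawl(MySpider())
#     process.start()  # runs the reactor and blocks until crawling finishes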