diff --git a/scrapy/cmdline.py b/scrapy/cmdline.py
index db7f0c122..943bb43a9 100644
--- a/scrapy/cmdline.py
+++ b/scrapy/cmdline.py
@@ -5,7 +5,7 @@ import inspect
 import pkg_resources
 
 import scrapy
-from scrapy.crawler import CrawlerProcess
+from scrapy.crawler import CrawlerProcess, MultiCrawlerProcess
 from scrapy.xlib import lsprofcalltree
 from scrapy.command import ScrapyCommand
 from scrapy.exceptions import UsageError
@@ -81,7 +81,7 @@ def _print_commands(settings, inproject):
 def _print_unknown_command(settings, cmdname, inproject):
     _print_header(settings, inproject)
     print "Unknown command: %s\n" % cmdname
-    print 'Use "scrapy" to see available commands'
+    print 'Use "scrapy" to see available commands'
 
 def _run_print_help(parser, func, *a, **kw):
     try:
@@ -117,8 +117,6 @@ def execute(argv=None, settings=None):
     conf.settings = settings
     # ------------------------------------------------------------------
 
-    crawler = CrawlerProcess(settings)
-    crawler.install()
     inproject = inside_project()
     cmds = _get_commands_dict(settings, inproject)
     cmdname = _pop_command_name(argv)
@@ -139,7 +137,15 @@ def execute(argv=None, settings=None):
     cmd.add_options(parser)
     opts, args = parser.parse_args(args=argv[1:])
     _run_print_help(parser, cmd.process_options, args, opts)
-    cmd.set_crawler(crawler)
+
+    if cmd.multi_crawlers:
+        process = MultiCrawlerProcess(settings)
+        cmd.process = process
+    else:
+        process = CrawlerProcess(settings)
+        process.install()
+        cmd.set_crawler(process)
+
     _run_print_help(parser, _run_command, cmd, args, opts)
     sys.exit(cmd.exitcode)
 
diff --git a/scrapy/command.py b/scrapy/command.py
index 9a80dae9f..87b0cff26 100644
--- a/scrapy/command.py
+++ b/scrapy/command.py
@@ -13,6 +13,7 @@ from scrapy.exceptions import UsageError
 class ScrapyCommand(object):
 
     requires_project = False
+    multi_crawlers = False
 
     # default settings to be used for this command instead of global defaults
     default_settings = {}
@@ -21,7 +22,6 @@ class ScrapyCommand(object):
 
     def __init__(self):
        self.settings = None # set in scrapy.cmdline
-        self.configured = False
 
     def set_crawler(self, crawler):
         assert not hasattr(self, '_crawler'), "crawler already set"
@@ -29,10 +29,10 @@ class ScrapyCommand(object):
 
     @property
     def crawler(self):
-        if not self.configured:
+        if not self.multi_crawlers and not self._crawler.configured:
             log.start_from_crawler(self._crawler)
             self._crawler.configure()
-            self.configured = True
+
         return self._crawler
 
     def syntax(self):
@@ -83,7 +83,7 @@ class ScrapyCommand(object):
             help="set/override setting (may be repeated)")
         group.add_option("--pdb", action="store_true", help="enable pdb on failure")
         parser.add_option_group(group)
-        
+
     def process_options(self, args, opts):
         try:
             self.settings.overrides.update(arglist_to_dict(opts.set))
diff --git a/scrapy/commands/check.py b/scrapy/commands/check.py
index eaf16e031..a479600eb 100644
--- a/scrapy/commands/check.py
+++ b/scrapy/commands/check.py
@@ -2,13 +2,11 @@ from collections import defaultdict
 from functools import wraps
 from unittest import TextTestRunner
 
-from scrapy import signals
 from scrapy.command import ScrapyCommand
 from scrapy.contracts import ContractsManager
 from scrapy.utils.misc import load_object
 from scrapy.utils.spider import iterate_spider_output
 from scrapy.utils.conf import build_component_list
-from scrapy.xlib.pydispatch import dispatcher
 
 
 def _generate(cb):
@@ -22,7 +20,8 @@ def _generate(cb):
 
 class Command(ScrapyCommand):
     requires_project = True
-    default_settings = {'LOG_ENABLED': False}
+    multi_crawlers = True
+    default_settings = {'LOG_ENABLED': True}
 
     def syntax(self):
         return "[options] <spider>"
@@ -48,17 +47,20 @@ class Command(ScrapyCommand):
 
         # contract requests
         contract_reqs = defaultdict(list)
-        self.crawler.engine.has_capacity = lambda: True
 
-        for spider in args or self.crawler.spiders.list():
-            spider = self.crawler.spiders.create(spider)
+        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
+        spiders = spman_cls.from_settings(self.settings)
+
+        for spider in args or spiders.list():
+            spider = spiders.create(spider)
             requests = self.get_requests(spider)
 
             if opts.list:
                 for req in requests:
                     contract_reqs[spider.name].append(req.callback.__name__)
-            else:
-                self.crawler.crawl(spider, requests)
+            elif requests:
+                crawler = self.process.create_crawler(spider.name)
+                crawler.crawl(spider, requests)
 
         # start checks
         if opts.list:
@@ -67,9 +69,8 @@ class Command(ScrapyCommand):
                 for method in sorted(methods):
                     print '  * %s' % method
         else:
-            dispatcher.connect(self.results.printErrors,
-                               signals.engine_stopped)
-            self.crawler.start()
+            self.process.start()
+            self.results.printErrors()
 
     def get_requests(self, spider):
         requests = []
diff --git a/scrapy/crawler.py b/scrapy/crawler.py
index 6bd13e884..c88e1641d 100644
--- a/scrapy/crawler.py
+++ b/scrapy/crawler.py
@@ -20,6 +20,8 @@ class Crawler(object):
         self.signals = SignalManager(self)
         self.stats = load_object(settings['STATS_CLASS'])(self)
 
+        self.scheduled = {}
+
     def install(self):
         import scrapy.project
         assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
@@ -47,7 +49,12 @@ class Crawler(object):
         spider.set_crawler(self)
         if requests is None:
             requests = spider.start_requests()
-        return self.engine.open_spider(spider, requests)
+
+        if self.configured and self.engine.running:
+            assert not self.scheduled
+            return self.engine.open_spider(spider, requests)
+        else:
+            self.scheduled.setdefault(spider, []).extend(requests)
 
     def _spider_closed(self, spider=None):
         if not self.engine.open_spiders:
@@ -56,6 +63,10 @@
     @defer.inlineCallbacks
     def start(self):
         yield defer.maybeDeferred(self.configure)
+
+        for spider, requests in self.scheduled.iteritems():
+            yield self.engine.open_spider(spider, requests)
+
         yield defer.maybeDeferred(self.engine.start)
 
     @defer.inlineCallbacks
@@ -64,33 +75,27 @@
             yield defer.maybeDeferred(self.engine.stop)
 
 
-class CrawlerProcess(Crawler):
-    """A class to run a single Scrapy crawler in a process. It provides
-    automatic control of the Twisted reactor and installs some convenient
-    signals for shutting down the crawl.
+class ProcessMixin(object):
+    """ Mixin which provides automatic control of the Twisted reactor and
+    installs some convenient signals for shutting it down
     """
 
     def __init__(self, *a, **kw):
-        super(CrawlerProcess, self).__init__(*a, **kw)
-        self.signals.connect(self.stop, signals.engine_stopped)
         install_shutdown_handlers(self._signal_shutdown)
 
     def start(self):
-        super(CrawlerProcess, self).start()
         if self.settings.getbool('DNSCACHE_ENABLED'):
             reactor.installResolver(CachingThreadedResolver(reactor))
         reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
-        reactor.run(installSignalHandlers=False) # blocking call
+        reactor.run(installSignalHandlers=False)  # blocking call
 
     def stop(self):
-        d = super(CrawlerProcess, self).stop()
-        d.addBoth(self._stop_reactor)
-        return d
+        raise NotImplementedError
 
     def _stop_reactor(self, _=None):
         try:
             reactor.stop()
-        except RuntimeError: # raised if already stopped or in shutdown stage
+        except RuntimeError:  # raised if already stopped or in shutdown stage
             pass
 
     def _signal_shutdown(self, signum, _):
@@ -106,3 +111,68 @@ class CrawlerProcess(Crawler):
         log.msg(format='Received %(signame)s twice, forcing unclean shutdown',
                 level=log.INFO, signame=signame)
         reactor.callFromThread(self._stop_reactor)
+
+
+class CrawlerProcess(Crawler, ProcessMixin):
+    """ A class to run a single Scrapy crawler in a process
+    """
+
+    def __init__(self, *a, **kw):
+        Crawler.__init__(self, *a, **kw)
+        ProcessMixin.__init__(self, *a, **kw)
+        self.signals.connect(self.stop, signals.engine_stopped)
+
+    def start(self):
+        Crawler.start(self)
+        ProcessMixin.start(self)
+
+    def stop(self):
+        d = Crawler.stop(self)
+        d.addBoth(self._stop_reactor)
+        return d
+
+
+class MultiCrawlerProcess(ProcessMixin):
+    """ A class to run multiple scrapy crawlers in a process sequentially
+    """
+
+    def __init__(self, settings):
+        super(MultiCrawlerProcess, self).__init__(settings)
+
+        self.settings = settings
+        self.crawlers = {}
+        self.stopping = False
+
+    def create_crawler(self, name):
+        if name not in self.crawlers:
+            self.crawlers[name] = Crawler(self.settings)
+
+        return self.crawlers[name]
+
+    def start_crawler(self):
+        name, crawler = self.crawlers.popitem()
+
+        crawler.sflo = log.start_from_crawler(crawler)
+        crawler.signals.connect(crawler.sflo.stop, signals.engine_stopped)
+        crawler.signals.connect(self.check_done, signals.engine_stopped)
+        crawler.start()
+
+        return name, crawler
+
+    def check_done(self, **kwargs):
+        if self.crawlers and not self.stopping:
+            self.start_crawler()
+        else:
+            self._stop_reactor()
+
+    def start(self):
+        self.start_crawler()
+        super(MultiCrawlerProcess, self).start()
+
+    @defer.inlineCallbacks
+    def stop(self):
+        self.stopping = True
+
+        for crawler in self.crawlers.itervalues():
+            if crawler.configured:
+                yield crawler.stop()
diff --git a/scrapy/log.py b/scrapy/log.py
index 9622b9ee3..00ef5e5bc 100644
--- a/scrapy/log.py
+++ b/scrapy/log.py
@@ -1,4 +1,4 @@
-""" 
+"""
 Scrapy logging facility
 
 See documentation in docs/topics/logging.rst
@@ -11,7 +11,7 @@ from twisted.python import log
 
 import scrapy
 from scrapy.utils.python import unicode_to_str
- 
+
 # Logging levels
 DEBUG = logging.DEBUG
 INFO = logging.INFO
@@ -138,9 +138,12 @@ def start_from_crawler(crawler):
     if not settings.getbool('LOG_ENABLED'):
         return
 
-    start(settings['LOG_FILE'], settings['LOG_LEVEL'], settings['LOG_STDOUT'],
+    sflo = start(settings['LOG_FILE'], settings['LOG_LEVEL'], settings['LOG_STDOUT'],
         settings['LOG_ENCODING'], crawler)
+
     msg("Scrapy %s started (bot: %s)" % (scrapy.__version__, \
         settings['BOT_NAME']))
     msg("Optional features available: %s" % ", ".join(scrapy.optional_features),
         level=DEBUG)
+
+    return sflo
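
Usage sketch (illustrative only, not part of the patch): how a command with multi_crawlers = True is expected to drive the new MultiCrawlerProcess, following the same flow as cmdline.execute() and the updated check command above. The spider names and the get_project_settings() call are assumptions made for the example.

    # Illustrative sketch: hypothetical spider names, assumes a project where
    # get_project_settings() and SPIDER_MANAGER_CLASS are available.
    from scrapy.crawler import MultiCrawlerProcess
    from scrapy.utils.misc import load_object
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    process = MultiCrawlerProcess(settings)

    # Build a spider manager straight from the settings, as the updated check
    # command does, since no crawler is configured at this point.
    spman_cls = load_object(settings['SPIDER_MANAGER_CLASS'])
    spiders = spman_cls.from_settings(settings)

    for name in ['spider_a', 'spider_b']:  # hypothetical spider names
        spider = spiders.create(name)
        crawler = process.create_crawler(name)
        # The crawler is not configured yet, so crawl() only records the spider
        # and its start requests in Crawler.scheduled; Crawler.start() opens
        # them later.
        crawler.crawl(spider)

    # Blocking call: start_crawler() pops and starts one crawler; each
    # engine_stopped signal fires check_done(), which starts the next crawler
    # or stops the reactor once none are left.
    process.start()

The sequencing is driven entirely by the engine_stopped signal, which is also why CrawlerProcess keeps its one-shot stop behaviour by connecting its own stop() to that signal.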