
command with multiple crawlers, fix check command

Author: Alex Cepoi 2013-07-08 16:57:12 +02:00
parent 45d6d2044c
commit 919d7c4062
5 changed files with 116 additions and 36 deletions
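
The diff below lets a command drive several crawlers in one process: a command opts in by setting multi_crawlers = True, cmdline.py then hands it a MultiCrawlerProcess as self.process instead of installing a single CrawlerProcess, and the check command is reworked on top of that. As a rough sketch only, not part of this commit, a custom command using the new hooks might look like this:

from scrapy.command import ScrapyCommand
from scrapy.utils.misc import load_object

class Command(ScrapyCommand):
    requires_project = True
    multi_crawlers = True   # cmdline.py will then set self.process to a MultiCrawlerProcess

    def run(self, args, opts):
        # Load the spider manager straight from the settings, as the new
        # check.py does, because no crawler is configured at this point.
        spiders = load_object(self.settings['SPIDER_MANAGER_CLASS']).from_settings(self.settings)
        for name in args or spiders.list():
            spider = spiders.create(name)
            crawler = self.process.create_crawler(name)
            crawler.crawl(spider)   # queued until this crawler is started
        self.process.start()        # runs the crawlers one after another, then stops the reactor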

scrapy/cmdline.py View File

@@ -5,7 +5,7 @@ import inspect
import pkg_resources
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.crawler import CrawlerProcess, MultiCrawlerProcess
from scrapy.xlib import lsprofcalltree
from scrapy.command import ScrapyCommand
from scrapy.exceptions import UsageError
@@ -81,7 +81,7 @@ def _print_commands(settings, inproject):
def _print_unknown_command(settings, cmdname, inproject):
_print_header(settings, inproject)
print "Unknown command: %s\n" % cmdname
print 'Use "scrapy" to see available commands'
print 'Use "scrapy" to see available commands'
def _run_print_help(parser, func, *a, **kw):
try:
@@ -117,8 +117,6 @@ def execute(argv=None, settings=None):
conf.settings = settings
# ------------------------------------------------------------------
crawler = CrawlerProcess(settings)
crawler.install()
inproject = inside_project()
cmds = _get_commands_dict(settings, inproject)
cmdname = _pop_command_name(argv)
@@ -139,7 +137,15 @@ def execute(argv=None, settings=None):
cmd.add_options(parser)
opts, args = parser.parse_args(args=argv[1:])
_run_print_help(parser, cmd.process_options, args, opts)
cmd.set_crawler(crawler)
if cmd.multi_crawlers:
process = MultiCrawlerProcess(settings)
cmd.process = process
else:
process = CrawlerProcess(settings)
process.install()
cmd.set_crawler(process)
_run_print_help(parser, _run_command, cmd, args, opts)
sys.exit(cmd.exitcode)

scrapy/command.py View File

@@ -13,6 +13,7 @@ from scrapy.exceptions import UsageError
class ScrapyCommand(object):
requires_project = False
multi_crawlers = False
# default settings to be used for this command instead of global defaults
default_settings = {}
@@ -21,7 +22,6 @@ class ScrapyCommand(object):
def __init__(self):
self.settings = None # set in scrapy.cmdline
self.configured = False
def set_crawler(self, crawler):
assert not hasattr(self, '_crawler'), "crawler already set"
@@ -29,10 +29,10 @@
@property
def crawler(self):
if not self.configured:
if not self.multi_crawlers and not self._crawler.configured:
log.start_from_crawler(self._crawler)
self._crawler.configure()
self.configured = True
return self._crawler
def syntax(self):
@@ -83,7 +83,7 @@ class ScrapyCommand(object):
help="set/override setting (may be repeated)")
group.add_option("--pdb", action="store_true", help="enable pdb on failure")
parser.add_option_group(group)
def process_options(self, args, opts):
try:
self.settings.overrides.update(arglist_to_dict(opts.set))

scrapy/commands/check.py View File

@@ -2,13 +2,11 @@ from collections import defaultdict
from functools import wraps
from unittest import TextTestRunner
from scrapy import signals
from scrapy.command import ScrapyCommand
from scrapy.contracts import ContractsManager
from scrapy.utils.misc import load_object
from scrapy.utils.spider import iterate_spider_output
from scrapy.utils.conf import build_component_list
from scrapy.xlib.pydispatch import dispatcher
def _generate(cb):
@@ -22,7 +20,8 @@ def _generate(cb):
class Command(ScrapyCommand):
requires_project = True
default_settings = {'LOG_ENABLED': False}
multi_crawlers = True
default_settings = {'LOG_ENABLED': True}
def syntax(self):
return "[options] <spider>"
@@ -48,17 +47,20 @@ class Command(ScrapyCommand):
# contract requests
contract_reqs = defaultdict(list)
self.crawler.engine.has_capacity = lambda: True
for spider in args or self.crawler.spiders.list():
spider = self.crawler.spiders.create(spider)
spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
spiders = spman_cls.from_settings(self.settings)
for spider in args or spiders.list():
spider = spiders.create(spider)
requests = self.get_requests(spider)
if opts.list:
for req in requests:
contract_reqs[spider.name].append(req.callback.__name__)
else:
self.crawler.crawl(spider, requests)
elif requests:
crawler = self.process.create_crawler(spider.name)
crawler.crawl(spider, requests)
# start checks
if opts.list:
@@ -67,9 +69,8 @@ class Command(ScrapyCommand):
for method in sorted(methods):
print ' * %s' % method
else:
dispatcher.connect(self.results.printErrors,
signals.engine_stopped)
self.crawler.start()
self.process.start()
self.results.printErrors()
def get_requests(self, spider):
requests = []

scrapy/crawler.py View File

@@ -20,6 +20,8 @@ class Crawler(object):
self.signals = SignalManager(self)
self.stats = load_object(settings['STATS_CLASS'])(self)
self.scheduled = {}
def install(self):
import scrapy.project
assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
@@ -47,7 +49,12 @@ class Crawler(object):
spider.set_crawler(self)
if requests is None:
requests = spider.start_requests()
return self.engine.open_spider(spider, requests)
if self.configured and self.engine.running:
assert not self.scheduled
return self.engine.open_spider(spider, requests)
else:
self.scheduled.setdefault(spider, []).extend(requests)
def _spider_closed(self, spider=None):
if not self.engine.open_spiders:
@@ -56,6 +63,10 @@
@defer.inlineCallbacks
def start(self):
yield defer.maybeDeferred(self.configure)
for spider, requests in self.scheduled.iteritems():
yield self.engine.open_spider(spider, requests)
yield defer.maybeDeferred(self.engine.start)
@defer.inlineCallbacks
@@ -64,33 +75,27 @@
yield defer.maybeDeferred(self.engine.stop)
class CrawlerProcess(Crawler):
"""A class to run a single Scrapy crawler in a process. It provides
automatic control of the Twisted reactor and installs some convenient
signals for shutting down the crawl.
class ProcessMixin(object):
""" Mixin which provides automatic control of the Twisted reactor and
installs some convenient signals for shutting it down
"""
def __init__(self, *a, **kw):
super(CrawlerProcess, self).__init__(*a, **kw)
self.signals.connect(self.stop, signals.engine_stopped)
install_shutdown_handlers(self._signal_shutdown)
def start(self):
super(CrawlerProcess, self).start()
if self.settings.getbool('DNSCACHE_ENABLED'):
reactor.installResolver(CachingThreadedResolver(reactor))
reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
reactor.run(installSignalHandlers=False) # blocking call
def stop(self):
d = super(CrawlerProcess, self).stop()
d.addBoth(self._stop_reactor)
return d
raise NotImplementedError
def _stop_reactor(self, _=None):
try:
reactor.stop()
except RuntimeError: # raised if already stopped or in shutdown stage
pass
def _signal_shutdown(self, signum, _):
@@ -106,3 +111,68 @@ class CrawlerProcess(Crawler):
log.msg(format='Received %(signame)s twice, forcing unclean shutdown',
level=log.INFO, signame=signame)
reactor.callFromThread(self._stop_reactor)
class CrawlerProcess(Crawler, ProcessMixin):
""" A class to run a single Scrapy crawler in a process
"""
def __init__(self, *a, **kw):
Crawler.__init__(self, *a, **kw)
ProcessMixin.__init__(self, *a, **kw)
self.signals.connect(self.stop, signals.engine_stopped)
def start(self):
Crawler.start(self)
ProcessMixin.start(self)
def stop(self):
d = Crawler.stop(self)
d.addBoth(self._stop_reactor)
return d
class MultiCrawlerProcess(ProcessMixin):
""" A class to run multiple scrapy crawlers in a process sequentially
"""
def __init__(self, settings):
super(MultiCrawlerProcess, self).__init__(settings)
self.settings = settings
self.crawlers = {}
self.stopping = False
def create_crawler(self, name):
if name not in self.crawlers:
self.crawlers[name] = Crawler(self.settings)
return self.crawlers[name]
def start_crawler(self):
name, crawler = self.crawlers.popitem()
crawler.sflo = log.start_from_crawler(crawler)
crawler.signals.connect(crawler.sflo.stop, signals.engine_stopped)
crawler.signals.connect(self.check_done, signals.engine_stopped)
crawler.start()
return name, crawler
def check_done(self, **kwargs):
if self.crawlers and not self.stopping:
self.start_crawler()
else:
self._stop_reactor()
def start(self):
self.start_crawler()
super(MultiCrawlerProcess, self).start()
@defer.inlineCallbacks
def stop(self):
self.stopping = True
for crawler in self.crawlers.itervalues():
if crawler.configured:
yield crawler.stop()
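
For reference, a rough standalone sketch of driving the MultiCrawlerProcess added above, outside the command machinery. DemoSpider, the crawler keys, and the use of get_project_settings() are illustrative assumptions, not part of this commit:

from scrapy.crawler import MultiCrawlerProcess
from scrapy.spider import BaseSpider
from scrapy.utils.project import get_project_settings  # assumed settings helper

class DemoSpider(BaseSpider):
    name = 'demo'
    start_urls = ['http://example.com/']   # hypothetical URL

    def parse(self, response):
        self.log('visited %s' % response.url)

settings = get_project_settings()
process = MultiCrawlerProcess(settings)

for key in ('demo-1', 'demo-2'):    # one Crawler per entry, keyed by name
    crawler = process.create_crawler(key)
    crawler.crawl(DemoSpider())     # queued; the crawler is configured when it is started

process.start()   # start_crawler() runs them one after another, then the reactor is stopped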

scrapy/log.py View File

@@ -1,4 +1,4 @@
"""
"""
Scrapy logging facility
See documentation in docs/topics/logging.rst
@@ -11,7 +11,7 @@ from twisted.python import log
import scrapy
from scrapy.utils.python import unicode_to_str
# Logging levels
DEBUG = logging.DEBUG
INFO = logging.INFO
@@ -138,9 +138,12 @@ def start_from_crawler(crawler):
if not settings.getbool('LOG_ENABLED'):
return
start(settings['LOG_FILE'], settings['LOG_LEVEL'], settings['LOG_STDOUT'],
sflo = start(settings['LOG_FILE'], settings['LOG_LEVEL'], settings['LOG_STDOUT'],
settings['LOG_ENCODING'], crawler)
msg("Scrapy %s started (bot: %s)" % (scrapy.__version__, \
settings['BOT_NAME']))
msg("Optional features available: %s" % ", ".join(scrapy.optional_features),
level=DEBUG)
return sflo