Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-28 17:23:37 +00:00)
command with multiple crawlers, fix check command
parent 45d6d2044c
commit 919d7c4062
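In outline: ScrapyCommand gains a multi_crawlers flag, scrapy.cmdline.execute() hands such commands a MultiCrawlerProcess as cmd.process instead of installing a single CrawlerProcess, and the check command is ported to it so that each spider gets its own crawler. A minimal sketch of the intended wiring, assuming this patch is applied; the command class below is hypothetical and only exercises the attributes added here:

    # Sketch only, not part of the patch. ReviewSketchCommand is hypothetical;
    # the pieces it relies on (multi_crawlers, self.process, create_crawler,
    # crawl, start) are the ones this commit introduces.
    from scrapy.command import ScrapyCommand
    from scrapy.utils.misc import load_object


    class ReviewSketchCommand(ScrapyCommand):
        requires_project = True
        multi_crawlers = True  # cmdline.execute() then attaches a MultiCrawlerProcess as self.process

        def run(self, args, opts):
            # Build spiders through the spider manager, as the patched check
            # command does, because the per-spider crawlers are not configured yet.
            spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
            spiders = spman_cls.from_settings(self.settings)

            for name in args or spiders.list():
                crawler = self.process.create_crawler(name)
                crawler.crawl(spiders.create(name))  # queued until that crawler starts

            self.process.start()  # runs the crawlers one after another; blocking call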
scrapy/cmdline.py

@@ -5,7 +5,7 @@ import inspect
 import pkg_resources
 
 import scrapy
-from scrapy.crawler import CrawlerProcess
+from scrapy.crawler import CrawlerProcess, MultiCrawlerProcess
 from scrapy.xlib import lsprofcalltree
 from scrapy.command import ScrapyCommand
 from scrapy.exceptions import UsageError
@@ -81,7 +81,7 @@ def _print_commands(settings, inproject):
 def _print_unknown_command(settings, cmdname, inproject):
     _print_header(settings, inproject)
     print "Unknown command: %s\n" % cmdname
-    print 'Use "scrapy" to see available commands'
+    print 'Use "scrapy" to see available commands'
 
 def _run_print_help(parser, func, *a, **kw):
     try:
@@ -117,8 +117,6 @@ def execute(argv=None, settings=None):
         conf.settings = settings
     # ------------------------------------------------------------------
 
-    crawler = CrawlerProcess(settings)
-    crawler.install()
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
@@ -139,7 +137,15 @@ def execute(argv=None, settings=None):
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
-   cmd.set_crawler(crawler)
+
+   if cmd.multi_crawlers:
+       process = MultiCrawlerProcess(settings)
+       cmd.process = process
+   else:
+       process = CrawlerProcess(settings)
+       process.install()
+       cmd.set_crawler(process)
+
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
 
scrapy/command.py

@@ -13,6 +13,7 @@ from scrapy.exceptions import UsageError
 class ScrapyCommand(object):
 
     requires_project = False
+    multi_crawlers = False
 
     # default settings to be used for this command instead of global defaults
     default_settings = {}
@@ -21,7 +22,6 @@ class ScrapyCommand(object):
 
     def __init__(self):
         self.settings = None # set in scrapy.cmdline
-        self.configured = False
 
     def set_crawler(self, crawler):
         assert not hasattr(self, '_crawler'), "crawler already set"
@@ -29,10 +29,10 @@ class ScrapyCommand(object):
 
     @property
     def crawler(self):
-        if not self.configured:
+        if not self.multi_crawlers and not self._crawler.configured:
             log.start_from_crawler(self._crawler)
             self._crawler.configure()
-            self.configured = True
+
         return self._crawler
 
     def syntax(self):
@@ -83,7 +83,7 @@ class ScrapyCommand(object):
             help="set/override setting (may be repeated)")
         group.add_option("--pdb", action="store_true", help="enable pdb on failure")
         parser.add_option_group(group)
-
+
     def process_options(self, args, opts):
         try:
             self.settings.overrides.update(arglist_to_dict(opts.set))
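For ordinary commands nothing changes at the call site: with multi_crawlers left at False, cmdline still wires a single CrawlerProcess through set_crawler(), and the first access to self.crawler configures it, now keyed off the crawler's own configured flag rather than a per-command one. A sketch of that unchanged usage, assuming this patch; the command class is hypothetical but mirrors how the existing crawl command uses the property:

    # Sketch only. SingleSketchCommand is hypothetical; it shows the
    # single-crawler path that keeps working through the lazy crawler property.
    from scrapy.command import ScrapyCommand


    class SingleSketchCommand(ScrapyCommand):
        requires_project = True  # multi_crawlers stays False

        def run(self, args, opts):
            # The first attribute access configures the crawler and starts
            # logging, because self._crawler.configured is still False here.
            spider = self.crawler.spiders.create(args[0])
            self.crawler.crawl(spider)
            self.crawler.start()  # blocking call via CrawlerProcess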
scrapy/commands/check.py

@@ -2,13 +2,11 @@ from collections import defaultdict
 from functools import wraps
 from unittest import TextTestRunner
 
-from scrapy import signals
 from scrapy.command import ScrapyCommand
 from scrapy.contracts import ContractsManager
 from scrapy.utils.misc import load_object
 from scrapy.utils.spider import iterate_spider_output
 from scrapy.utils.conf import build_component_list
-from scrapy.xlib.pydispatch import dispatcher
 
 
 def _generate(cb):
@@ -22,7 +20,8 @@ def _generate(cb):
 
 class Command(ScrapyCommand):
     requires_project = True
-    default_settings = {'LOG_ENABLED': False}
+    multi_crawlers = True
+    default_settings = {'LOG_ENABLED': True}
 
     def syntax(self):
         return "[options] <spider>"
@@ -48,17 +47,20 @@ class Command(ScrapyCommand):
 
         # contract requests
         contract_reqs = defaultdict(list)
-        self.crawler.engine.has_capacity = lambda: True
 
-        for spider in args or self.crawler.spiders.list():
-            spider = self.crawler.spiders.create(spider)
+        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
+        spiders = spman_cls.from_settings(self.settings)
+
+        for spider in args or spiders.list():
+            spider = spiders.create(spider)
             requests = self.get_requests(spider)
 
             if opts.list:
                 for req in requests:
                     contract_reqs[spider.name].append(req.callback.__name__)
-            else:
-                self.crawler.crawl(spider, requests)
+            elif requests:
+                crawler = self.process.create_crawler(spider.name)
+                crawler.crawl(spider, requests)
 
         # start checks
         if opts.list:
@@ -67,9 +69,8 @@ class Command(ScrapyCommand):
                 for method in sorted(methods):
                     print ' * %s' % method
         else:
-            dispatcher.connect(self.results.printErrors,
-                signals.engine_stopped)
-            self.crawler.start()
+            self.process.start()
+            self.results.printErrors()
 
     def get_requests(self, spider):
         requests = []
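For context, what the check command collects and replays are the contracts declared in spider callback docstrings; with this change each spider gets its own crawler via self.process.create_crawler() and is only scheduled when it actually has contract requests. A hypothetical spider carrying such contracts might look like this (the spider itself is not part of the patch):

    # Hypothetical spider, only to illustrate what `scrapy check` exercises.
    # The @url/@returns lines are standard scrapy contract annotations.
    from scrapy.spider import BaseSpider


    class ExampleSpider(BaseSpider):
        name = 'example'

        def parse(self, response):
            """ Checked by `scrapy check example`:

            @url http://www.example.com/
            @returns items 0 0
            @returns requests 0 0
            """
            pass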
scrapy/crawler.py

@@ -20,6 +20,8 @@ class Crawler(object):
         self.signals = SignalManager(self)
         self.stats = load_object(settings['STATS_CLASS'])(self)
 
+        self.scheduled = {}
+
     def install(self):
         import scrapy.project
         assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
@@ -47,7 +49,12 @@ class Crawler(object):
         spider.set_crawler(self)
         if requests is None:
             requests = spider.start_requests()
-        return self.engine.open_spider(spider, requests)
+
+        if self.configured and self.engine.running:
+            assert not self.scheduled
+            return self.engine.open_spider(spider, requests)
+        else:
+            self.scheduled.setdefault(spider, []).extend(requests)
 
     def _spider_closed(self, spider=None):
         if not self.engine.open_spiders:
@@ -56,6 +63,10 @@ class Crawler(object):
     @defer.inlineCallbacks
     def start(self):
         yield defer.maybeDeferred(self.configure)
+
+        for spider, requests in self.scheduled.iteritems():
+            yield self.engine.open_spider(spider, requests)
+
         yield defer.maybeDeferred(self.engine.start)
 
     @defer.inlineCallbacks
@@ -64,33 +75,27 @@ class Crawler(object):
         yield defer.maybeDeferred(self.engine.stop)
 
 
-class CrawlerProcess(Crawler):
-    """A class to run a single Scrapy crawler in a process. It provides
-    automatic control of the Twisted reactor and installs some convenient
-    signals for shutting down the crawl.
+class ProcessMixin(object):
+    """ Mixin which provides automatic control of the Twisted reactor and
+    installs some convenient signals for shutting it down
     """
 
     def __init__(self, *a, **kw):
-        super(CrawlerProcess, self).__init__(*a, **kw)
-        self.signals.connect(self.stop, signals.engine_stopped)
         install_shutdown_handlers(self._signal_shutdown)
 
     def start(self):
-        super(CrawlerProcess, self).start()
         if self.settings.getbool('DNSCACHE_ENABLED'):
             reactor.installResolver(CachingThreadedResolver(reactor))
         reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
-        reactor.run(installSignalHandlers=False) # blocking call
+        reactor.run(installSignalHandlers=False) # blocking call
 
     def stop(self):
-        d = super(CrawlerProcess, self).stop()
-        d.addBoth(self._stop_reactor)
-        return d
+        raise NotImplementedError
 
     def _stop_reactor(self, _=None):
        try:
            reactor.stop()
-       except RuntimeError: # raised if already stopped or in shutdown stage
+       except RuntimeError: # raised if already stopped or in shutdown stage
            pass
 
    def _signal_shutdown(self, signum, _):
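The new scheduled dict lets crawl() be called before the crawler is configured or its engine is running; the queued (spider, requests) pairs are then opened from start() itself. A minimal sketch of that ordering, assuming this patch and taking settings and spider as placeholders built elsewhere:

    # Sketch only; `settings` and `spider` are placeholders.
    from twisted.internet import reactor
    from scrapy.crawler import Crawler

    crawler = Crawler(settings)
    crawler.crawl(spider)  # engine not running yet, so the requests sit in crawler.scheduled
    crawler.start()        # configure(), open the scheduled spiders, then start the engine
    reactor.run()          # a bare Crawler does not manage the reactor; ProcessMixin does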
@@ -106,3 +111,68 @@ class CrawlerProcess(Crawler):
         log.msg(format='Received %(signame)s twice, forcing unclean shutdown',
                 level=log.INFO, signame=signame)
         reactor.callFromThread(self._stop_reactor)
+
+
+class CrawlerProcess(Crawler, ProcessMixin):
+    """ A class to run a single Scrapy crawler in a process
+    """
+
+    def __init__(self, *a, **kw):
+        Crawler.__init__(self, *a, **kw)
+        ProcessMixin.__init__(self, *a, **kw)
+        self.signals.connect(self.stop, signals.engine_stopped)
+
+    def start(self):
+        Crawler.start(self)
+        ProcessMixin.start(self)
+
+    def stop(self):
+        d = Crawler.stop(self)
+        d.addBoth(self._stop_reactor)
+        return d
+
+
+class MultiCrawlerProcess(ProcessMixin):
+    """ A class to run multiple scrapy crawlers in a process sequentially
+    """
+
+    def __init__(self, settings):
+        super(MultiCrawlerProcess, self).__init__(settings)
+
+        self.settings = settings
+        self.crawlers = {}
+        self.stopping = False
+
+    def create_crawler(self, name):
+        if name not in self.crawlers:
+            self.crawlers[name] = Crawler(self.settings)
+
+        return self.crawlers[name]
+
+    def start_crawler(self):
+        name, crawler = self.crawlers.popitem()
+
+        crawler.sflo = log.start_from_crawler(crawler)
+        crawler.signals.connect(crawler.sflo.stop, signals.engine_stopped)
+        crawler.signals.connect(self.check_done, signals.engine_stopped)
+        crawler.start()
+
+        return name, crawler
+
+    def check_done(self, **kwargs):
+        if self.crawlers and not self.stopping:
+            self.start_crawler()
+        else:
+            self._stop_reactor()
+
+    def start(self):
+        self.start_crawler()
+        super(MultiCrawlerProcess, self).start()
+
+    @defer.inlineCallbacks
+    def stop(self):
+        self.stopping = True
+
+        for crawler in self.crawlers.itervalues():
+            if crawler.configured:
+                yield crawler.stop()
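Driven directly rather than through cmdline, the new process class chains crawlers on engine_stopped: start() launches one crawler, check_done() starts the next when its engine stops, and the reactor is stopped after the last one. A minimal usage sketch, assuming this patch; settings and the spider instances are placeholders:

    # Sketch only; `settings`, `spider_a` and `spider_b` are placeholders.
    from scrapy.crawler import MultiCrawlerProcess

    process = MultiCrawlerProcess(settings)

    for spider in (spider_a, spider_b):
        crawler = process.create_crawler(spider.name)
        crawler.crawl(spider)  # queued on that crawler until it is started

    process.start()  # runs the crawlers sequentially and blocks until the reactor stops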
scrapy/log.py

@@ -1,4 +1,4 @@
-"""
+"""
 Scrapy logging facility
 
 See documentation in docs/topics/logging.rst
@@ -11,7 +11,7 @@ from twisted.python import log
 
 import scrapy
 from scrapy.utils.python import unicode_to_str
-
+
 # Logging levels
 DEBUG = logging.DEBUG
 INFO = logging.INFO
@@ -138,9 +138,12 @@ def start_from_crawler(crawler):
     if not settings.getbool('LOG_ENABLED'):
         return
 
-    start(settings['LOG_FILE'], settings['LOG_LEVEL'], settings['LOG_STDOUT'],
+    sflo = start(settings['LOG_FILE'], settings['LOG_LEVEL'], settings['LOG_STDOUT'],
         settings['LOG_ENCODING'], crawler)
+
     msg("Scrapy %s started (bot: %s)" % (scrapy.__version__, \
         settings['BOT_NAME']))
     msg("Optional features available: %s" % ", ".join(scrapy.optional_features),
         level=DEBUG)
+
+    return sflo