
Merge pull request #418 from nramirezuy/engine-multispider

engine multispider support removed
Pablo Hoffman 2013-10-10 10:31:13 -07:00
commit 1a2db089ff
2 changed files with 26 additions and 23 deletions
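
In effect, the engine's per-spider bookkeeping collapses from a dict of slots keyed by spider to a single slot/spider pair. A minimal illustrative sketch of the state change (a hypothetical class, not the real ExecutionEngine):

# Sketch: old vs. new engine state after this commit.
class EngineStateSketch(object):
    def __init__(self):
        # Before: self.slots = {}   (mapping spider -> Slot)
        self.slot = None     # the one Slot, or None when no spider is open
        self.spider = None   # the one open spider, or None

    @property
    def open_spiders(self):
        # A list of at most one element replaces self.slots.keys().
        return [self.spider] if self.spider else []

    def has_capacity(self):
        # Binary capacity replaces len(self.slots) < self._concurrent_spiders.
        return not bool(self.slot)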

scrapy/core/engine.py

@@ -55,7 +55,8 @@ class ExecutionEngine(object):
         self.settings = crawler.settings
         self.signals = crawler.signals
         self.logformatter = crawler.logformatter
-        self.slots = {}
+        self.slot = None
+        self.spider = None
         self.running = False
         self.paused = False
         self.scheduler_cls = load_object(self.settings['SCHEDULER'])
@@ -93,9 +94,8 @@ class ExecutionEngine(object):
         self.paused = False

     def _next_request(self, spider):
-        try:
-            slot = self.slots[spider]
-        except KeyError:
+        slot = self.slot
+        if not slot:
             return

         if self.paused:
@@ -121,14 +121,14 @@ class ExecutionEngine(object):
             self._spider_idle(spider)

     def _needs_backout(self, spider):
-        slot = self.slots[spider]
+        slot = self.slot
         return not self.running \
             or slot.closing \
             or self.downloader.needs_backout() \
             or self.scraper.slot.needs_backout()

     def _next_request_from_scheduler(self, spider):
-        slot = self.slots[spider]
+        slot = self.slot
         request = slot.scheduler.next_request()
         if not request:
             return
@@ -154,32 +154,32 @@ class ExecutionEngine(object):
     def spider_is_idle(self, spider):
         scraper_idle = self.scraper.slot.is_idle()
-        pending = self.slots[spider].scheduler.has_pending_requests()
+        pending = self.slot.scheduler.has_pending_requests()
         downloading = bool(self.downloader.active)
         idle = scraper_idle and not (pending or downloading)
         return idle

     @property
     def open_spiders(self):
-        return self.slots.keys()
+        return [self.spider] if self.spider else []

     def has_capacity(self):
         """Does the engine have capacity to handle more spiders"""
-        return len(self.slots) < self._concurrent_spiders
+        return not bool(self.slot)

     def crawl(self, request, spider):
         assert spider in self.open_spiders, \
             "Spider %r not opened when crawling: %s" % (spider.name, request)
         self.schedule(request, spider)
-        self.slots[spider].nextcall.schedule()
+        self.slot.nextcall.schedule()

     def schedule(self, request, spider):
         self.signals.send_catch_log(signal=signals.request_scheduled,
             request=request, spider=spider)
-        return self.slots[spider].scheduler.enqueue_request(request)
+        return self.slot.scheduler.enqueue_request(request)

     def download(self, request, spider):
-        slot = self.slots[spider]
+        slot = self.slot
         slot.add_request(request)
         d = self._download(request, spider)
         d.addBoth(self._downloaded, slot, request, spider)
@@ -191,7 +191,7 @@ class ExecutionEngine(object):
             if isinstance(response, Request) else response

     def _download(self, request, spider):
-        slot = self.slots[spider]
+        slot = self.slot
         slot.add_request(request)
         def _on_success(response):
             assert isinstance(response, (Response, Request))
@@ -214,14 +214,15 @@ class ExecutionEngine(object):
     @defer.inlineCallbacks
     def open_spider(self, spider, start_requests=(), close_if_idle=True):
-        assert self.has_capacity(), "No free spider slots when opening %r" % \
+        assert self.has_capacity(), "No free spider slot when opening %r" % \
             spider.name
         log.msg("Spider opened", spider=spider)
         nextcall = CallLaterOnce(self._next_request, spider)
         scheduler = self.scheduler_cls.from_crawler(self.crawler)
         start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
         slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
-        self.slots[spider] = slot
+        self.slot = slot
+        self.spider = spider
         yield scheduler.open(spider)
         yield self.scraper.open_spider(spider)
         self.crawler.stats.open_spider(spider)
@@ -240,7 +241,7 @@ class ExecutionEngine(object):
             spider=spider, dont_log=DontCloseSpider)
         if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                 for _, x in res):
-            self.slots[spider].nextcall.schedule(5)
+            self.slot.nextcall.schedule(5)
             return

         if self.spider_is_idle(spider):
@@ -249,7 +250,7 @@ class ExecutionEngine(object):
     def close_spider(self, spider, reason='cancelled'):
         """Close (cancel) spider and clear all its outstanding requests"""
-        slot = self.slots[spider]
+        slot = self.slot
         if slot.closing:
             return slot.closing
         log.msg(format="Closing spider (%(reason)s)", reason=reason, spider=spider)
@@ -276,7 +277,10 @@ class ExecutionEngine(object):
         dfd.addBoth(lambda _: log.msg(format="Spider closed (%(reason)s)", reason=reason, spider=spider))

-        dfd.addBoth(lambda _: self.slots.pop(spider))
+        dfd.addBoth(lambda _: setattr(self, 'slot', None))
+        dfd.addErrback(log.err, spider=spider)
+
+        dfd.addBoth(lambda _: setattr(self, 'spider', None))
         dfd.addErrback(log.err, spider=spider)

         dfd.addBoth(lambda _: self._spider_closed_callback(spider))
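
A practical consequence of the single slot, sketched with hypothetical engine and spider objects (since open_spider is an inlineCallbacks coroutine, the failed assertion surfaces as an errback on the returned Deferred rather than a synchronous raise):

# Hypothetical usage sketch of the new one-spider limit.
d1 = engine.open_spider(spider_a)    # occupies the engine's single slot
engine.has_capacity()                # -> False while spider_a is open
d2 = engine.open_spider(spider_b)    # errbacks with AssertionError:
                                     #   "No free spider slot when opening ..."
# Once close_spider's deferred chain completes, engine.slot and engine.spider
# are reset to None, so has_capacity() returns True and a new spider may open.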

scrapy/utils/engine.py

@@ -7,16 +7,15 @@ def get_engine_status(engine):
     global_tests = [
         "time()-engine.start_time",
         "engine.has_capacity()",
-        "len(engine.downloader.slots)",
         "len(engine.downloader.active)",
         "engine.scraper.is_idle()",
     ]
     spider_tests = [
         "engine.spider_is_idle(spider)",
-        "engine.slots[spider].closing",
-        "len(engine.slots[spider].inprogress)",
-        "len(engine.slots[spider].scheduler.dqs or [])",
-        "len(engine.slots[spider].scheduler.mqs)",
+        "engine.slot.closing",
+        "len(engine.slot.inprogress)",
+        "len(engine.slot.scheduler.dqs or [])",
+        "len(engine.slot.scheduler.mqs)",
         "len(engine.scraper.slot.queue)",
         "len(engine.scraper.slot.active)",
         "engine.scraper.slot.active_size",
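
For context, these strings are Python expressions that the surrounding get_engine_status helper (not shown in this hunk) evaluates against the live engine to build its report. A rough self-contained sketch of that pattern, with a simplified hypothetical name (evaluate_engine_tests is not the real function):

from time import time  # "time()-engine.start_time" expects time() in scope

def evaluate_engine_tests(engine, global_tests, spider_tests):
    # Evaluate each expression string with `engine` (and `spider`) visible as
    # locals, recording the exception type instead of a value when one raises.
    report = []
    for test in global_tests:
        try:
            report.append((test, eval(test)))
        except Exception as e:
            report.append((test, "%s (exception)" % type(e).__name__))
    for spider in engine.open_spiders:  # after this commit: at most one spider
        for test in spider_tests:
            try:
                report.append((test, eval(test)))
            except Exception as e:
                report.append((test, "%s (exception)" % type(e).__name__))
    return report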