mirror of https://github.com/scrapy/scrapy.git
scraper: renamed SpiderInfo to Slot, for consistency with engine names
commit d6b83fee3e
parent f19442425a
@@ -125,7 +125,7 @@ class ExecutionEngine(object):
             or slot.closing \
             or self.spider_is_closed(spider) \
             or self.downloader.sites[spider].needs_backout() \
-            or self.scraper.sites[spider].needs_backout()
+            or self.scraper.slots[spider].needs_backout()
 
     def _next_request(self, spider):
         request = self.scheduler.next_request(spider)
@@ -157,8 +157,8 @@ class ExecutionEngine(object):
         return d
 
     def spider_is_idle(self, spider):
-        scraper_idle = spider in self.scraper.sites \
-            and self.scraper.sites[spider].is_idle()
+        scraper_idle = spider in self.scraper.slots \
+            and self.scraper.slots[spider].is_idle()
         pending = self.scheduler.spider_has_pending_requests(spider)
         downloading = spider in self.downloader.sites \
             and self.downloader.sites[spider].active
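The hunk above cuts off before spider_is_idle's return statement. A hedged reconstruction of how the three flags presumably combine; the final line is an assumption, not part of this commit:

    def spider_is_idle(self, spider):
        scraper_idle = spider in self.scraper.slots \
            and self.scraper.slots[spider].is_idle()
        pending = self.scheduler.spider_has_pending_requests(spider)
        downloading = spider in self.downloader.sites \
            and self.downloader.sites[spider].active
        # assumed: idle only when the scraper slot is drained and nothing
        # is pending in the scheduler or active in the downloader
        return scraper_idle and not (pending or downloading)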
@@ -19,8 +19,8 @@ from scrapy import log
 from scrapy.stats import stats
 
 
-class SpiderInfo(object):
-    """Object for holding data of the responses being scraped"""
+class Slot(object):
+    """Scraper slot (one per running spider)"""
 
     MIN_RESPONSE_SIZE = 1024
 
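For reference, a minimal sketch of what the renamed Slot holds, inferred from the attributes and methods this diff touches (queue, active, active_size, itemproc_size, closing, needs_backout, is_idle). The constructor default and the method bodies are assumptions, not code from this commit:

    from twisted.internet import defer

    class Slot(object):
        """Scraper slot (one per running spider)"""

        MIN_RESPONSE_SIZE = 1024

        def __init__(self, max_active_size=5000000):  # threshold value assumed
            self.max_active_size = max_active_size
            self.queue = []          # (response, request, deferred) waiting to be scraped
            self.active = set()      # responses currently being scraped
            self.active_size = 0     # bytes of response data held in memory
            self.itemproc_size = 0   # items currently in the item processor
            self.closing = None      # Deferred set by Scraper.close_spider()

        def add_response_request(self, response, request):
            deferred = defer.Deferred()
            self.queue.append((response, request, deferred))
            # small responses are charged a minimum size; the real code
            # presumably also handles download failures without a body
            self.active_size += max(len(response.body), self.MIN_RESPONSE_SIZE)
            return deferred

        def next_response_request_deferred(self):
            response, request, deferred = self.queue.pop(0)
            self.active.add(response)
            return response, request, deferred

        def finish_response(self, response):
            self.active.remove(response)
            self.active_size -= max(len(response.body), self.MIN_RESPONSE_SIZE)

        def is_idle(self):
            return not (self.queue or self.active)

        def needs_backout(self):
            # consulted by the engine's backout check seen in the first hunk
            return self.active_size > self.max_active_size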
@@ -62,7 +62,7 @@ class SpiderInfo(object):
 class Scraper(object):
 
     def __init__(self, engine, settings):
-        self.sites = {}
+        self.slots = {}
         self.spidermw = SpiderMiddlewareManager.from_settings(settings)
         itemproc_cls = load_object(settings['ITEM_PROCESSOR'])
         self.itemproc = itemproc_cls.from_settings(settings)
@@ -72,47 +72,47 @@ class Scraper(object):
     @defer.inlineCallbacks
     def open_spider(self, spider):
         """Open the given spider for scraping and allocate resources for it"""
-        assert spider not in self.sites, "Spider already opened: %s" % spider
-        self.sites[spider] = SpiderInfo()
+        assert spider not in self.slots, "Spider already opened: %s" % spider
+        self.slots[spider] = Slot()
         yield self.itemproc.open_spider(spider)
 
     def close_spider(self, spider):
         """Close a spider being scraped and release its resources"""
-        assert spider in self.sites, "Spider not opened: %s" % spider
-        site = self.sites[spider]
-        site.closing = defer.Deferred()
-        site.closing.addCallback(self.itemproc.close_spider)
-        self._check_if_closing(spider, site)
-        return site.closing
+        assert spider in self.slots, "Spider not opened: %s" % spider
+        slot = self.slots[spider]
+        slot.closing = defer.Deferred()
+        slot.closing.addCallback(self.itemproc.close_spider)
+        self._check_if_closing(spider, slot)
+        return slot.closing
 
     def is_idle(self):
         """Return True if there isn't any more spiders to process"""
-        return not self.sites
+        return not self.slots
 
-    def _check_if_closing(self, spider, site):
-        if site.closing and site.is_idle():
-            del self.sites[spider]
-            site.closing.callback(spider)
+    def _check_if_closing(self, spider, slot):
+        if slot.closing and slot.is_idle():
+            del self.slots[spider]
+            slot.closing.callback(spider)
 
     def enqueue_scrape(self, response, request, spider):
-        site = self.sites.get(spider, None)
-        if site is None:
+        slot = self.slots.get(spider, None)
+        if slot is None:
             return
-        dfd = site.add_response_request(response, request)
+        dfd = slot.add_response_request(response, request)
         def finish_scraping(_):
-            site.finish_response(response)
-            self._check_if_closing(spider, site)
-            self._scrape_next(spider, site)
+            slot.finish_response(response)
+            self._check_if_closing(spider, slot)
+            self._scrape_next(spider, slot)
             return _
         dfd.addBoth(finish_scraping)
         dfd.addErrback(log.err, 'Scraper bug processing %s' % request, \
             spider=spider)
-        self._scrape_next(spider, site)
+        self._scrape_next(spider, slot)
         return dfd
 
-    def _scrape_next(self, spider, site):
-        while site.queue:
-            response, request, deferred = site.next_response_request_deferred()
+    def _scrape_next(self, spider, slot):
+        while slot.queue:
+            response, request, deferred = slot.next_response_request_deferred()
             self._scrape(response, request, spider).chainDeferred(deferred)
 
     def _scrape(self, response, request, spider):
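The hunk above also shows the shutdown pattern the rename makes clearer: close_spider() stores a Deferred on the slot and returns it, and _check_if_closing() fires it only once the slot has drained. A hedged usage sketch; the caller and callback are illustrative, not from this commit:

    d = scraper.close_spider(spider)
    # fires once slot.is_idle() becomes true; itemproc.close_spider runs
    # first in the callback chain, then any callbacks added here
    d.addCallback(lambda _: log.msg("scraper finished for %s" % spider))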
@@ -170,7 +170,7 @@ class Scraper(object):
                 spider=spider)
             self.engine.crawl(request=output, spider=spider)
         elif isinstance(output, BaseItem):
-            self.sites[spider].itemproc_size += 1
+            self.slots[spider].itemproc_size += 1
             dfd = self.itemproc.process_item(output, spider)
             dfd.addBoth(self._itemproc_finished, output, response, spider)
             return dfd
@@ -193,7 +193,7 @@ class Scraper(object):
     def _itemproc_finished(self, output, item, response, spider):
         """ItemProcessor finished for the given ``item`` and returned ``output``
         """
-        self.sites[spider].itemproc_size -= 1
+        self.slots[spider].itemproc_size -= 1
         if isinstance(output, Failure):
            ex = output.value
            if isinstance(ex, DropItem):
@@ -18,7 +18,7 @@ def get_engine_status(engine=None):
         "engine.downloader.is_idle()",
         "len(engine.downloader.sites)",
         "engine.scraper.is_idle()",
-        "len(engine.scraper.sites)",
+        "len(engine.scraper.slots)",
     ]
     spider_tests = [
         "engine.spider_is_idle(spider)",
@@ -28,11 +28,11 @@ def get_engine_status(engine=None):
         "len(engine.downloader.sites[spider].active)",
         "len(engine.downloader.sites[spider].transferring)",
         "engine.downloader.sites[spider].lastseen",
-        "len(engine.scraper.sites[spider].queue)",
-        "len(engine.scraper.sites[spider].active)",
-        "engine.scraper.sites[spider].active_size",
-        "engine.scraper.sites[spider].itemproc_size",
-        "engine.scraper.sites[spider].needs_backout()",
+        "len(engine.scraper.slots[spider].queue)",
+        "len(engine.scraper.slots[spider].active)",
+        "engine.scraper.slots[spider].active_size",
+        "engine.scraper.slots[spider].itemproc_size",
+        "engine.scraper.slots[spider].needs_backout()",
     ]
 
     status = {'global': [], 'spiders': {}}
@@ -41,7 +41,7 @@ def get_engine_status(engine=None):
             status['global'] += [(test, eval(test))]
         except Exception, e:
             status['global'] += [(test, "%s (exception)" % type(e).__name__)]
-    for spider in set(engine.downloader.sites.keys() + engine.scraper.sites.keys()):
+    for spider in set(engine.downloader.sites.keys() + engine.scraper.slots.keys()):
         x = []
         for test in spider_tests:
             try:
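get_engine_status() evaluates each test expression string against the live engine and collects (expression, result) pairs. The hunk is truncated before the per-spider loop finishes, so the structure of status['spiders'] below is inferred, and the import path is assumed from where these hunks appear to live:

    from scrapy.utils.engine import get_engine_status  # path assumed

    status = get_engine_status(engine)
    for test, result in status['global']:
        print("%-45s : %s" % (test, result))
    for spider, tests in status['spiders'].items():
        print("spider %s" % spider)
        for test, result in tests:
            print("  %-45s : %s" % (test, result))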