mirror of https://github.com/scrapy/scrapy.git synced 2025-02-26 08:03:54 +00:00

commit d6b83fee3e
parent f19442425a
Author: Pablo Hoffman
Date:   2011-07-22 02:01:05 -03:00

    scraper: renamed SpiderInfo to Slot, for consistency with engine names

3 changed files with 38 additions and 38 deletions

View File

@@ -125,7 +125,7 @@ class ExecutionEngine(object):
             or slot.closing \
             or self.spider_is_closed(spider) \
             or self.downloader.sites[spider].needs_backout() \
-            or self.scraper.sites[spider].needs_backout()
+            or self.scraper.slots[spider].needs_backout()
 
     def _next_request(self, spider):
         request = self.scheduler.next_request(spider)
@@ -157,8 +157,8 @@ class ExecutionEngine(object):
         return d
 
     def spider_is_idle(self, spider):
-        scraper_idle = spider in self.scraper.sites \
-            and self.scraper.sites[spider].is_idle()
+        scraper_idle = spider in self.scraper.slots \
+            and self.scraper.slots[spider].is_idle()
         pending = self.scheduler.spider_has_pending_requests(spider)
         downloading = spider in self.downloader.sites \
            and self.downloader.sites[spider].active
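
An aside on the idle check touched above: the engine treats a spider as idle only when the scraper slot has drained, the scheduler has nothing pending, and the downloader has nothing in flight. A minimal standalone sketch of that three-way test, using stand-in containers rather than Scrapy's actual engine attributes:

class FakeSlot(object):
    """Stand-in for the scraper slot: idle when nothing is queued or active."""
    def __init__(self, queue=(), active=()):
        self.queue = list(queue)
        self.active = set(active)

    def is_idle(self):
        return not (self.queue or self.active)

def spider_is_idle(spider, scraper_slots, pending_requests, downloader_active):
    # Mirrors the engine test: idle slot, no pending requests, no downloads.
    scraper_idle = spider in scraper_slots and scraper_slots[spider].is_idle()
    pending = bool(pending_requests.get(spider))
    downloading = bool(downloader_active.get(spider))
    return scraper_idle and not pending and not downloading

print(spider_is_idle('example', {'example': FakeSlot()}, {}, {}))  # True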

View File

@@ -19,8 +19,8 @@ from scrapy import log
 from scrapy.stats import stats
 
 
-class SpiderInfo(object):
-    """Object for holding data of the responses being scraped"""
+class Slot(object):
+    """Scraper slot (one per running spider)"""
 
     MIN_RESPONSE_SIZE = 1024
@@ -62,7 +62,7 @@ class SpiderInfo(object):
 class Scraper(object):
 
     def __init__(self, engine, settings):
-        self.sites = {}
+        self.slots = {}
         self.spidermw = SpiderMiddlewareManager.from_settings(settings)
         itemproc_cls = load_object(settings['ITEM_PROCESSOR'])
         self.itemproc = itemproc_cls.from_settings(settings)
@@ -72,47 +72,47 @@ class Scraper(object):
 
     @defer.inlineCallbacks
     def open_spider(self, spider):
         """Open the given spider for scraping and allocate resources for it"""
-        assert spider not in self.sites, "Spider already opened: %s" % spider
-        self.sites[spider] = SpiderInfo()
+        assert spider not in self.slots, "Spider already opened: %s" % spider
+        self.slots[spider] = Slot()
         yield self.itemproc.open_spider(spider)
 
     def close_spider(self, spider):
         """Close a spider being scraped and release its resources"""
-        assert spider in self.sites, "Spider not opened: %s" % spider
-        site = self.sites[spider]
-        site.closing = defer.Deferred()
-        site.closing.addCallback(self.itemproc.close_spider)
-        self._check_if_closing(spider, site)
-        return site.closing
+        assert spider in self.slots, "Spider not opened: %s" % spider
+        slot = self.slots[spider]
+        slot.closing = defer.Deferred()
+        slot.closing.addCallback(self.itemproc.close_spider)
+        self._check_if_closing(spider, slot)
+        return slot.closing
 
     def is_idle(self):
         """Return True if there isn't any more spiders to process"""
-        return not self.sites
+        return not self.slots
 
-    def _check_if_closing(self, spider, site):
-        if site.closing and site.is_idle():
-            del self.sites[spider]
-            site.closing.callback(spider)
+    def _check_if_closing(self, spider, slot):
+        if slot.closing and slot.is_idle():
+            del self.slots[spider]
+            slot.closing.callback(spider)
 
     def enqueue_scrape(self, response, request, spider):
-        site = self.sites.get(spider, None)
-        if site is None:
+        slot = self.slots.get(spider, None)
+        if slot is None:
             return
-        dfd = site.add_response_request(response, request)
+        dfd = slot.add_response_request(response, request)
         def finish_scraping(_):
-            site.finish_response(response)
-            self._check_if_closing(spider, site)
-            self._scrape_next(spider, site)
+            slot.finish_response(response)
+            self._check_if_closing(spider, slot)
+            self._scrape_next(spider, slot)
             return _
         dfd.addBoth(finish_scraping)
         dfd.addErrback(log.err, 'Scraper bug processing %s' % request, \
             spider=spider)
-        self._scrape_next(spider, site)
+        self._scrape_next(spider, slot)
         return dfd
 
-    def _scrape_next(self, spider, site):
-        while site.queue:
-            response, request, deferred = site.next_response_request_deferred()
+    def _scrape_next(self, spider, slot):
+        while slot.queue:
+            response, request, deferred = slot.next_response_request_deferred()
             self._scrape(response, request, spider).chainDeferred(deferred)
 
     def _scrape(self, response, request, spider):
@@ -170,7 +170,7 @@ class Scraper(object):
                 spider=spider)
             self.engine.crawl(request=output, spider=spider)
         elif isinstance(output, BaseItem):
-            self.sites[spider].itemproc_size += 1
+            self.slots[spider].itemproc_size += 1
             dfd = self.itemproc.process_item(output, spider)
             dfd.addBoth(self._itemproc_finished, output, response, spider)
             return dfd
@@ -193,7 +193,7 @@ class Scraper(object):
     def _itemproc_finished(self, output, item, response, spider):
         """ItemProcessor finished for the given ``item`` and returned ``output``
         """
-        self.sites[spider].itemproc_size -= 1
+        self.slots[spider].itemproc_size -= 1
         if isinstance(output, Failure):
             ex = output.value
             if isinstance(ex, DropItem):
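
One detail worth highlighting in the hunks above: close_spider is asynchronous by design. It parks a Deferred on the slot, and _check_if_closing fires that Deferred only once the slot has drained, a condition also re-checked after every scraped response. A reduced sketch of the pattern, with illustrative names (Owner, key) in place of Scrapy's real classes:

from twisted.internet import defer

class Slot(object):
    """Illustrative slot: in-flight work plus an optional pending close."""
    def __init__(self):
        self.active = set()
        self.closing = None  # becomes a Deferred once a close is requested

    def is_idle(self):
        return not self.active

class Owner(object):
    def __init__(self):
        self.slots = {}

    def close(self, key):
        slot = self.slots[key]
        slot.closing = defer.Deferred()
        self._check_if_closing(key, slot)  # fires at once if already idle
        return slot.closing                # caller waits on this Deferred

    def _check_if_closing(self, key, slot):
        # Release the slot only when a close was requested AND it drained;
        # the real code runs this same check after each finished response.
        if slot.closing and slot.is_idle():
            del self.slots[key]
            slot.closing.callback(key)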

View File

@@ -18,7 +18,7 @@ def get_engine_status(engine=None):
         "engine.downloader.is_idle()",
         "len(engine.downloader.sites)",
         "engine.scraper.is_idle()",
-        "len(engine.scraper.sites)",
+        "len(engine.scraper.slots)",
     ]
     spider_tests = [
         "engine.spider_is_idle(spider)",
@@ -28,11 +28,11 @@ def get_engine_status(engine=None):
         "len(engine.downloader.sites[spider].active)",
         "len(engine.downloader.sites[spider].transferring)",
         "engine.downloader.sites[spider].lastseen",
-        "len(engine.scraper.sites[spider].queue)",
-        "len(engine.scraper.sites[spider].active)",
-        "engine.scraper.sites[spider].active_size",
-        "engine.scraper.sites[spider].itemproc_size",
-        "engine.scraper.sites[spider].needs_backout()",
+        "len(engine.scraper.slots[spider].queue)",
+        "len(engine.scraper.slots[spider].active)",
+        "engine.scraper.slots[spider].active_size",
+        "engine.scraper.slots[spider].itemproc_size",
+        "engine.scraper.slots[spider].needs_backout()",
     ]
 
     status = {'global': [], 'spiders': {}}
@@ -41,7 +41,7 @@ def get_engine_status(engine=None):
             status['global'] += [(test, eval(test))]
         except Exception, e:
             status['global'] += [(test, "%s (exception)" % type(e).__name__)]
-    for spider in set(engine.downloader.sites.keys() + engine.scraper.sites.keys()):
+    for spider in set(engine.downloader.sites.keys() + engine.scraper.slots.keys()):
         x = []
         for test in spider_tests:
             try:
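
Finally, get_engine_status leans on a small introspection trick: each probe is kept as an expression string, eval-ed against the live engine, and a failing probe is recorded as its exception name instead of aborting the whole report. A self-contained sketch of that pattern (the names here are illustrative, not Scrapy's API):

def run_probes(tests, namespace):
    """Evaluate each expression string; record the exception name on failure."""
    results = []
    for test in tests:
        try:
            results.append((test, eval(test, {}, namespace)))
        except Exception as e:
            results.append((test, "%s (exception)" % type(e).__name__))
    return results

class FakeScraper(object):
    slots = {'example': object()}

probes = ["len(scraper.slots)", "scraper.missing_attr"]
print(run_probes(probes, {'scraper': FakeScraper()}))
# [('len(scraper.slots)', 1), ('scraper.missing_attr', 'AttributeError (exception)')]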