
rewrite RequestLimitMiddleware spidermw so it does not consume spider output at once

--HG--
rename : scrapy/contrib/spidermiddleware/limit.py => scrapy/contrib/spidermiddleware/requestlimit.py
Daniel Grana 2009-07-06 15:35:36 -03:00
parent 31b3d7ce1e
commit 2e52005847
3 changed files with 71 additions and 43 deletions
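
Why the rewrite matters: the old middleware drained the spider's entire result iterable into lists before returning anything, so a spider callback written as a generator was evaluated in full even when most of its requests would end up dropped. The new version wraps the iterable lazily (via itertools.imap, Python 2's lazy map), so each result is inspected only when the engine actually pulls it. A minimal, framework-free sketch of the difference; the spider_output generator and the print calls are hypothetical, for illustration only:

def spider_output():
    """Hypothetical stand-in for a spider callback that yields results lazily."""
    for i in range(3):
        print("producing %d" % i)
        yield i

def eager_middleware(result):
    # Old approach: consume everything up front, then hand back a list.
    return [r for r in result]

def lazy_middleware(result):
    # New approach: hand back a lazy wrapper; nothing runs until it is iterated.
    return (r for r in result)

print("eager:")
out = eager_middleware(spider_output())   # all three "producing" lines appear here
print("lazy:")
out = lazy_middleware(spider_output())    # nothing has been produced yet
next(out)                                 # produces only the first result, on demand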

Scrapy default settings (SPIDER_MIDDLEWARES_BASE entry):

@@ -175,7 +175,7 @@ SPIDER_MIDDLEWARES_BASE = {
     'scrapy.contrib.spidermiddleware.itempipeline.ItemPipelineMiddleware': 30,
     'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50,
     'scrapy.contrib.itemsampler.ItemSamplerMiddleware': 100,
-    'scrapy.contrib.spidermiddleware.limit.RequestLimitMiddleware': 200,
+    'scrapy.contrib.spidermiddleware.requestlimit.RequestLimitMiddleware': 200,
     'scrapy.contrib.spidermiddleware.restrict.RestrictMiddleware': 300,
     'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500,
     'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700,
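
For context, the middleware ordering above only registers the (renamed) component; the limit itself comes from the REQUESTS_QUEUE_SIZE setting or a per-spider requests_queue_size attribute, as described in the docstrings below. A hedged sketch of how that configuration might have looked in a project of this Scrapy 0.x era; the spider class, the values and the BaseSpider import path are assumptions, not part of this commit:

# settings.py (project level): cap the scheduler queue at 100 pending requests.
# A value of 0 (the default) leaves the middleware disabled (NotConfigured).
REQUESTS_QUEUE_SIZE = 100

# Per-spider override, read via getattr(spider, 'requests_queue_size', ...):
from scrapy.spider import BaseSpider    # assumed import path for this era

class ExampleSpider(BaseSpider):
    domain_name = 'example.com'          # spiders were keyed by domain_name at the time
    requests_queue_size = 50             # overrides REQUESTS_QUEUE_SIZE for this spider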

scrapy/contrib/spidermiddleware/limit.py (deleted):

@@ -1,42 +0,0 @@
"""
RequestLimitMiddleware: Limits the scheduler request queue size. When spiders
try to schedule more than the allowed amount of requests the new requests
(returned by the spider) will be dropped.
The limit can be set using the spider attribue `requests_queue_size` or the
setting "REQUESTS_QUEUE_SIZE". If not specified (or 0), no limit will be
applied.
"""
from scrapy.core.engine import scrapyengine
from scrapy.core.exceptions import NotConfigured
from scrapy.conf import settings
from scrapy.http import Request
from scrapy import log


class RequestLimitMiddleware(object):

    def __init__(self):
        self.max_queue_size = settings.getint("REQUESTS_QUEUE_SIZE")
        if not self.max_queue_size:
            raise NotConfigured

    def process_spider_output(self, response, result, spider):
        requests = []
        items = []
        for r in result:
            if isinstance(r, Request):
                requests.append(r)
            else:
                items.append(r)

        max_pending = getattr(spider, 'requests_queue_size', self.max_queue_size)
        if max_pending:
            pending_count = len(scrapyengine.scheduler.pending_requests.get(spider.domain_name, []))
            free_slots = max_pending - pending_count
            dropped_count = len(requests) - free_slots
            if dropped_count > 0:
                requests = requests[:free_slots]
                log.msg("Dropping %d request(s) because the maximum schedule size (%d) has been exceeded" % \
                        (dropped_count, max_pending), level=log.DEBUG, domain=spider.domain_name)

        return requests + items
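
The problem the commit message points at is visible here: process_spider_output exhausts result (typically a generator driven by the spider callback) into two lists before returning, so the whole callback runs and every request and item is held in memory before the engine sees any of it. The slicing is also questionable when the queue is already over the limit: with a negative free_slots, requests[:free_slots] drops only the last few requests instead of all of them, and the logged count is inflated. A quick check with made-up numbers:

requests = ['r0', 'r1', 'r2', 'r3', 'r4']
max_pending = 10
pending_count = 12                           # queue already over the limit
free_slots = max_pending - pending_count     # -2
print(requests[:free_slots])                 # ['r0', 'r1', 'r2'] -- three requests slip through
print(len(requests) - free_slots)            # 7 -- logged as dropped, though only two were cut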

scrapy/contrib/spidermiddleware/requestlimit.py (new file):

@@ -0,0 +1,70 @@
"""
RequestLimitMiddleware: Limits the scheduler request queue size. When spiders
try to schedule more than the allowed amount of requests the new requests
(returned by the spider) will be dropped.
The limit can be set using the spider attribue `requests_queue_size` or the
setting "REQUESTS_QUEUE_SIZE". If not specified (or 0), no limit will be
applied.
"""
from itertools import imap

from pydispatch import dispatcher

from scrapy.core import signals
from scrapy.core.engine import scrapyengine
from scrapy.core.exceptions import NotConfigured
from scrapy.conf import settings
from scrapy.http import Request
from scrapy import log


class RequestLimitMiddleware(object):

    def __init__(self):
        self.max_queue_size = settings.getint("REQUESTS_QUEUE_SIZE")
        if not self.max_queue_size:
            raise NotConfigured

        self.max_pending = {}
        self.dropped_count = {}

        dispatcher.connect(self.domain_opened, signal=signals.domain_opened)
        dispatcher.connect(self.domain_closed, signal=signals.domain_closed)

    def domain_opened(self, domain, spider):
        self.max_pending[domain] = getattr(spider, 'requests_queue_size', self.max_queue_size)
        self.dropped_count[domain] = 0

    def domain_closed(self, domain):
        dropped_count = self.dropped_count[domain]
        if dropped_count:
            max_pending = self.max_pending[domain]
            log.msg('Dropped %d request(s) because the scheduler queue size limit (%d requests) was exceeded' % \
                    (dropped_count, max_pending), level=log.DEBUG, domain=domain)
        del self.dropped_count[domain]
        del self.max_pending[domain]

    def process_spider_output(self, response, result, spider):
        domain = spider.domain_name
        max_pending = self.max_pending.get(domain, 0)
        if max_pending:
            return imap(lambda v: self._limit_requests(v, domain, max_pending), result)
        else:
            return result

    def _limit_requests(self, request_or_other, domain, max_pending):
        if isinstance(request_or_other, Request):
            free_slots = max_pending - self._pending_count(domain)
            if free_slots > 0:
                # Scheduler isn't saturated and it is fine to schedule more requests.
                return request_or_other
            else:
                # Skip the request and give engine time to handle other tasks.
                self.dropped_count[domain] += 1
                return None
        else:
            # Return others (non-requests) as is.
            return request_or_other

    def _pending_count(self, domain):
        pending = scrapyengine.scheduler.pending_requests.get(domain, [])
        return len(pending)
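
The rewritten middleware keeps its state per domain (set up in domain_opened, reported and discarded in domain_closed) and, instead of splitting the output into lists, maps each result lazily: requests that arrive while the queue is full come back as None, everything else passes through untouched. itertools.imap and pydispatch are Python 2-era dependencies; on Python 3 the same lazy filtering is usually written as a generator. A hedged, framework-free sketch of that pattern; the callables are illustrative stand-ins, not Scrapy API:

def limit_requests(result, is_request, has_free_slot, on_drop):
    """Yield results one by one, skipping requests while the queue is full.

    is_request, has_free_slot and on_drop stand in for isinstance(r, Request),
    the scheduler's free-slot check and the per-domain drop bookkeeping.
    """
    for r in result:
        if is_request(r) and not has_free_slot():
            on_drop(r)        # record the drop, like self.dropped_count[domain] += 1
            continue          # a generator can simply skip instead of yielding None
        yield r

# Toy usage with plain strings in place of Requests and items:
dropped = []
out = limit_requests(
    iter(["item-a", "req-1", "req-2", "item-b"]),
    is_request=lambda r: r.startswith("req"),
    has_free_slot=lambda: False,             # pretend the queue is already saturated
    on_drop=dropped.append,
)
print(list(out))    # ['item-a', 'item-b']
print(dropped)      # ['req-1', 'req-2']

The middleware itself returns None for dropped requests because imap is one-to-one; a hand-rolled generator, as above, can skip them outright.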