1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-27 02:24:09 +00:00

Don't discard a slot when it becomes empty; save it in another dict so it can be recycled if it is needed again.

This fix avoids continuously creating new slots in certain cases, a bug that prevents download_delay and max_concurrent_requests from working properly.

The problem arises when the slot for a given domain becomes empty but the spider has not yet created further requests for that domain. This is typical when the spider creates requests one by one, or when it makes requests to multiple domains and the requests for one or more of them are created at a rate slow enough that the slot is empty each time a response is fetched.

The effect is that a new slot is created for each request under such conditions, so download_delay and max_concurrent_requests do not take effect (because, in order to apply, they depend on an already existing slot for that domain).
This commit is contained in:
olveyra 2012-04-02 18:49:04 +00:00
parent 838e1dcce9
commit b39cb22d83

View File

@ -75,7 +75,7 @@ class Downloader(object):
self.domain_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
self.ip_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_IP')
self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
self.inactive_slots = {}
def fetch(self, request, spider):
key, slot = self._get_slot(request, spider)
@ -86,7 +86,7 @@ class Downloader(object):
self.active.remove(request)
slot.active.remove(request)
if not slot.active: # remove empty slots
del self.slots[key]
self.inactive_slots[key] = self.slots.pop(key)
return response
dlfunc = partial(self._enqueue_request, slot=slot)
@ -101,12 +101,15 @@ class Downloader(object):
if self.ip_concurrency:
key = dnscache.get(key, key)
if key not in self.slots:
if self.ip_concurrency:
concurrency = self.ip_concurrency
if key in self.inactive_slots:
self.slots[key] = self.inactive_slots.pop(key)
else:
concurrency = self.domain_concurrency
concurrency, delay = _get_concurrency_delay(concurrency, spider, self.settings)
self.slots[key] = Slot(concurrency, delay, self.settings)
if self.ip_concurrency:
concurrency = self.ip_concurrency
else:
concurrency = self.domain_concurrency
concurrency, delay = _get_concurrency_delay(concurrency, spider, self.settings)
self.slots[key] = Slot(concurrency, delay, self.settings)
return key, self.slots[key]
def _enqueue_request(self, request, spider, slot):