mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-27 02:24:09 +00:00
Don't discard a slot when it becomes empty; just save it in another dict so it can be recycled if needed again.
This fix avoids continuously creating new slots in certain cases, a bug that prevents download_delay and max_concurrent_requests from working properly. The problem arises when the slot for a given domain becomes empty, but further requests for that domain have not yet been created by the spider. This is typical when the spider creates requests one by one, or when it makes requests to multiple domains and the requests for one or more of them are created at a rate slow enough that the slot is empty each time a response is fetched. The effect is that a new slot is created for each request under such conditions, so download_delay and max_concurrent_requests do not take effect (because applying them depends on an already existing slot for that domain).
This commit is contained in:
parent
838e1dcce9
commit
b39cb22d83
@ -75,7 +75,7 @@ class Downloader(object):
|
||||
self.domain_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
|
||||
self.ip_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_IP')
|
||||
self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
|
||||
|
||||
self.inactive_slots = {}
|
||||
|
||||
def fetch(self, request, spider):
|
||||
key, slot = self._get_slot(request, spider)
|
||||
@ -86,7 +86,7 @@ class Downloader(object):
|
||||
self.active.remove(request)
|
||||
slot.active.remove(request)
|
||||
if not slot.active: # remove empty slots
|
||||
del self.slots[key]
|
||||
self.inactive_slots[key] = self.slots.pop(key)
|
||||
return response
|
||||
|
||||
dlfunc = partial(self._enqueue_request, slot=slot)
|
||||
@ -101,12 +101,15 @@ class Downloader(object):
|
||||
if self.ip_concurrency:
|
||||
key = dnscache.get(key, key)
|
||||
if key not in self.slots:
|
||||
if self.ip_concurrency:
|
||||
concurrency = self.ip_concurrency
|
||||
if key in self.inactive_slots:
|
||||
self.slots[key] = self.inactive_slots.pop(key)
|
||||
else:
|
||||
concurrency = self.domain_concurrency
|
||||
concurrency, delay = _get_concurrency_delay(concurrency, spider, self.settings)
|
||||
self.slots[key] = Slot(concurrency, delay, self.settings)
|
||||
if self.ip_concurrency:
|
||||
concurrency = self.ip_concurrency
|
||||
else:
|
||||
concurrency = self.domain_concurrency
|
||||
concurrency, delay = _get_concurrency_delay(concurrency, spider, self.settings)
|
||||
self.slots[key] = Slot(concurrency, delay, self.settings)
|
||||
return key, self.slots[key]
|
||||
|
||||
def _enqueue_request(self, request, spider, slot):
|
||||
|
Loading…
x
Reference in New Issue
Block a user