
Allow setting a minimal download delay for the AutoThrottle extension; also
floor the download delay at spider.download_delay, if given.
Martin Olveyra 2012-01-16 18:16:24 -02:00
parent fc52d8d5cf
commit 59cf9d9b1a


@@ -69,12 +69,15 @@ class AutoThrottle(object):
        self.CONCURRENCY_CHECK_PERIOD = settings.getint("AUTOTHROTTLE_CONCURRENCY_CHECK_PERIOD", 10)
        self.MAX_CONCURRENCY = settings.getint("AUTOTHROTTLE_MAX_CONCURRENCY", 8)
        self.DEBUG = settings.getint("AUTOTHROTTLE_DEBUG", False)
        self.MIN_DOWNLOAD_DELAY = settings.getint("AUTOTHROTTLE_MIN_DOWNLOAD_DELAY")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def spider_opened(self, spider):
        if hasattr(spider, "download_delay"):
            self.MIN_DOWNLOAD_DELAY = spider.download_delay
        spider.download_delay = self.START_DELAY
        if hasattr(spider, "max_concurrent_requests"):
            self.MAX_CONCURRENCY = spider.max_concurrent_requests
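
For context, a minimal sketch of how the new knob might be used. AUTOTHROTTLE_MIN_DOWNLOAD_DELAY comes from the hunk above; AUTOTHROTTLE_ENABLED, the file layout, and the concrete values are assumptions:

# settings.py -- hypothetical configuration exercising this patch
AUTOTHROTTLE_ENABLED = True          # assumed enable flag, not part of this diff
AUTOTHROTTLE_MIN_DOWNLOAD_DELAY = 2  # read via settings.getint() above; 0 if unset

# myspider.py -- per spider_opened() above, a spider-level download_delay
# overrides the setting and becomes the effective minimum delay
from scrapy.spider import BaseSpider

class MySpider(BaseSpider):
    name = "myspider"
    download_delay = 5  # AutoThrottle will never throttle below this delay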
@@ -82,7 +85,7 @@ class AutoThrottle(object):
            spider.max_concurrent_requests = 1
        self.last_latencies = [self.START_DELAY]
        self.last_lat = self.START_DELAY, 0.0

    def response_received(self, response, spider):
        slot = self._get_slot(response.request)
        latency = response.meta.get('download_latency')
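
As a side note, 'download_latency' is the standard Scrapy meta key recording how long the downloader took to fetch the response; a tiny illustration with made-up values:

# Illustration only: the downloader stamps each request's meta with the fetch
# time, and the extension reads it back here (values below are made up).
meta = {'download_latency': 0.37}        # seconds spent downloading the page
latency = meta.get('download_latency')   # None if the key is missing
if latency is not None:
    print("observed latency: %.2fs" % latency)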
@@ -124,6 +127,9 @@ class AutoThrottle(object):
        # If the latency is bigger than the old delay, use the latency instead
        # of the mean; this works better with problematic sites.
        new_delay = (slot.delay + latency) / 2.0 if latency < slot.delay else latency
        if new_delay < self.MIN_DOWNLOAD_DELAY:
            new_delay = self.MIN_DOWNLOAD_DELAY
        # Don't adjust the delay if the response status != 200 and the new delay
        # is smaller than the old one: error pages (and redirections) are usually
        # small and so tend to reduce latency, provoking a positive feedback loop
        # that keeps reducing the delay instead of increasing it.
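
A standalone sketch of the adjustment rule above, with made-up numbers showing both branches and the new floor:

# Sketch of the delay-update rule (not the extension's actual code).
MIN_DOWNLOAD_DELAY = 2.0

def adjust(delay, latency):
    # Average toward the latency when the site got faster; jump straight to
    # the latency when it got slower; then clamp to the configured minimum.
    new_delay = (delay + latency) / 2.0 if latency < delay else latency
    return max(new_delay, MIN_DOWNLOAD_DELAY)

print(adjust(4.0, 1.0))  # fast response: (4.0 + 1.0) / 2 = 2.5
print(adjust(4.0, 6.0))  # slow response: jumps straight to 6.0
print(adjust(2.5, 0.5))  # would be 1.5, clamped up to the 2.0 minimum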