mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-23 20:23:56 +00:00)

Added RANDOMIZE_DOWNLOAD_DELAY setting

This commit is contained in:
parent 23fcf48a89
commit c1f8198639
@@ -418,6 +418,15 @@ supported. Example::

     DOWNLOAD_DELAY = 0.25    # 250 ms of delay

+This setting is also affected by the :setting:`RANDOMIZE_DOWNLOAD_DELAY`
+setting (which is enabled by default). By default, Scrapy doesn't wait a fixed
+amount of time between requests, but uses a random interval between 0.5 and 1.5
+* :setting:`DOWNLOAD_DELAY`.
+
 Another way to change the download delay (per spider, instead of globally) is
 by using the ``download_delay`` spider attribute, which takes precedence over
 this setting.

 .. setting:: DOWNLOAD_TIMEOUT

 DOWNLOAD_TIMEOUT
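The per-spider attribute mentioned in the hunk above wins over the global
setting. A minimal standalone sketch of that precedence rule (plain Python,
not Scrapy's actual resolution code; names are illustrative):

    GLOBAL_DOWNLOAD_DELAY = 0.25  # stand-in for settings.getfloat('DOWNLOAD_DELAY')

    def effective_delay(spider):
        # A spider-level ``download_delay`` attribute, when present,
        # takes precedence over the global setting.
        return float(getattr(spider, 'download_delay', GLOBAL_DOWNLOAD_DELAY))

    class SlowSpider(object):
        download_delay = 2.0  # per-spider override

    assert effective_delay(SlowSpider()) == 2.0   # attribute wins
    assert effective_delay(object()) == 0.25      # falls back to the setting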
@@ -677,6 +686,27 @@ Example::

     NEWSPIDER_MODULE = 'mybot.spiders_dev'

+.. setting:: RANDOMIZE_DOWNLOAD_DELAY
+
+RANDOMIZE_DOWNLOAD_DELAY
+------------------------
+
+Default: ``True``
+
+If enabled, Scrapy will wait a random amount of time (between 0.5 and 1.5
+* :setting:`DOWNLOAD_DELAY`) while fetching requests from the same
+spider.
+
+This randomization decreases the chance of the crawler being detected (and
+subsequently blocked) by sites which analyze requests looking for statistically
+significant similarities in the time between their requests.
+
+The randomization policy is the same one used by the `wget`_ ``--random-wait``
+option.
+
+If :setting:`DOWNLOAD_DELAY` is zero (the default) this option has no effect.
+
+.. _wget: http://www.gnu.org/software/wget/manual/wget.html
+
 .. setting:: REDIRECT_MAX_TIMES

 REDIRECT_MAX_TIMES
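In other words, the documented policy is a uniform draw whose mean is the
configured delay, so the average crawl speed is unchanged. A minimal sketch
(plain Python, illustrative only):

    import random

    def randomized_delay(download_delay):
        # Same policy the docs describe (and wget --random-wait uses):
        # draw uniformly from [0.5, 1.5] * DOWNLOAD_DELAY.
        return random.uniform(0.5 * download_delay, 1.5 * download_delay)

    # With DOWNLOAD_DELAY = 2, every wait lands somewhere in [1.0, 3.0]
    # seconds, and the expected wait is still 2 seconds.
    sample = randomized_delay(2)
    assert 1.0 <= sample <= 3.0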
@@ -122,6 +122,8 @@ MYSQL_CONNECTION_SETTINGS = {}

 NEWSPIDER_MODULE = ''

+RANDOMIZE_DOWNLOAD_DELAY = True
+
 REDIRECT_MAX_METAREFRESH_DELAY = 100
 REDIRECT_MAX_TIMES = 20  # uses Firefox default setting
 REDIRECT_PRIORITY_ADJUST = +2
@@ -2,6 +2,7 @@
 Download web pages using asynchronous IO
 """

+import random
 from time import time

 from twisted.internet import reactor, defer
@@ -20,15 +21,21 @@ class SpiderInfo(object):

     def __init__(self, download_delay=None, max_concurrent_requests=None):
         if download_delay is None:
-            self.download_delay = settings.getfloat('DOWNLOAD_DELAY')
+            self._download_delay = settings.getfloat('DOWNLOAD_DELAY')
         else:
-            self.download_delay = download_delay
-        if self.download_delay:
+            self._download_delay = float(download_delay)
+        if self._download_delay:
             self.max_concurrent_requests = 1
         elif max_concurrent_requests is None:
             self.max_concurrent_requests = settings.getint('CONCURRENT_REQUESTS_PER_SPIDER')
         else:
             self.max_concurrent_requests = max_concurrent_requests
+        if self._download_delay and settings.getbool('RANDOMIZE_DOWNLOAD_DELAY'):
+            # same policy as wget --random-wait
+            self.random_delay_interval = (0.5 * self._download_delay,
+                                          1.5 * self._download_delay)
+        else:
+            self.random_delay_interval = None

         self.active = set()
         self.queue = []
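For reference, the interval computation the constructor performs can be
re-created in isolation (standalone re-implementation for illustration; the
real ``SpiderInfo`` reads module-level settings):

    def delay_interval(download_delay, randomize=True):
        # Mirrors the branch above: endpoints are fixed once at construction.
        if download_delay and randomize:
            # same policy as wget --random-wait
            return (0.5 * download_delay, 1.5 * download_delay)
        return None  # zero delay, or randomization disabled

    assert delay_interval(2.0) == (1.0, 3.0)
    assert delay_interval(0) is None                      # DOWNLOAD_DELAY = 0
    assert delay_interval(2.0, randomize=False) is None   # fixed delay is used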
@@ -44,6 +51,12 @@ class SpiderInfo(object):
         # use self.active to include requests in the downloader middleware
         return len(self.active) > 2 * self.max_concurrent_requests

+    def download_delay(self):
+        if self.random_delay_interval:
+            return random.uniform(*self.random_delay_interval)
+        else:
+            return self._download_delay
+
     def cancel_request_calls(self):
         for call in self.next_request_calls:
             call.cancel()
@@ -99,8 +112,9 @@ class Downloader(object):

         # Delay queue processing if a download_delay is configured
         now = time()
-        if site.download_delay:
-            penalty = site.download_delay - now + site.lastseen
+        delay = site.download_delay()
+        if delay:
+            penalty = delay - now + site.lastseen
             if penalty > 0:
                 d = defer.Deferred()
                 d.addCallback(self._process_queue)
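The penalty expression reads as "time still owed": the desired gap minus the
time already elapsed since the last fetch from that site. A standalone sketch
of the arithmetic (illustrative names, not Scrapy code):

    from time import time

    def remaining_penalty(delay, lastseen, now=None):
        # delay - now + lastseen  ==  delay - (now - lastseen):
        # desired gap minus time already elapsed since the last fetch.
        now = time() if now is None else now
        return delay - (now - lastseen)

    # 0.8 s of a 2 s delay already elapsed -> 1.2 s still owed:
    assert abs(remaining_penalty(2.0, lastseen=100.0, now=100.8) - 1.2) < 1e-9
    # More than the delay elapsed -> penalty <= 0, no extra wait is scheduled:
    assert remaining_penalty(2.0, lastseen=100.0, now=103.0) <= 0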