From c1f8198639eab1b8ac3adffd68e5881ad020e4da Mon Sep 17 00:00:00 2001
From: Pablo Hoffman
Date: Fri, 19 Feb 2010 21:53:18 -0200
Subject: [PATCH 1/3] Added RANDOMIZE_DOWNLOAD_DELAY setting

---
 docs/topics/settings.rst          | 30 ++++++++++++++++++++++++++++++
 scrapy/conf/default_settings.py   |  2 ++
 scrapy/core/downloader/manager.py | 24 +++++++++++++++++++-----
 3 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/docs/topics/settings.rst b/docs/topics/settings.rst
index 4b3dbc78f..d7a7d7fcf 100644
--- a/docs/topics/settings.rst
+++ b/docs/topics/settings.rst
@@ -418,6 +418,15 @@ supported. Example::
 
     DOWNLOAD_DELAY = 0.25    # 250 ms of delay
 
+This setting is also affected by the :setting:`RANDOMIZE_DOWNLOAD_DELAY`
+setting (which is enabled by default). By default, Scrapy doesn't wait a fixed
+amount of time between requests, but uses a random interval between 0.5 and 1.5
+* :setting:`DOWNLOAD_DELAY`.
+
+Another way to change the download delay (per spider, instead of globally) is
+by using the ``download_delay`` spider attribute, which takes precedence over
+this setting.
+
 .. setting:: DOWNLOAD_TIMEOUT
 
 DOWNLOAD_TIMEOUT
@@ -677,6 +686,27 @@ Example::
 
     NEWSPIDER_MODULE = 'mybot.spiders_dev'
 
+.. setting:: RANDOMIZE_DOWNLOAD_DELAY
+
+RANDOMIZE_DOWNLOAD_DELAY
+------------------------
+
+Default: ``True``
+
+If enabled, Scrapy will wait a random amount of time (between 0.5 and 1.5
+* :setting:`DOWNLOAD_DELAY`) while fetching requests from the same spider.
+
+This randomization decreases the chance of the crawler being detected (and
+subsequently blocked) by sites which analyze requests looking for statistically
+significant similarities in the time between their requests.
+
+The randomization policy is the same one used by the `wget`_ ``--random-wait``
+option.
+
+If :setting:`DOWNLOAD_DELAY` is zero (the default) this option has no effect.
+
+.. _wget: http://www.gnu.org/software/wget/manual/wget.html
+
 .. setting:: REDIRECT_MAX_TIMES
 
 REDIRECT_MAX_TIMES
diff --git a/scrapy/conf/default_settings.py b/scrapy/conf/default_settings.py
index 8892e41b4..76feb0c6f 100644
--- a/scrapy/conf/default_settings.py
+++ b/scrapy/conf/default_settings.py
@@ -122,6 +122,8 @@ MYSQL_CONNECTION_SETTINGS = {}
 
 NEWSPIDER_MODULE = ''
 
+RANDOMIZE_DOWNLOAD_DELAY = True
+
 REDIRECT_MAX_METAREFRESH_DELAY = 100
 REDIRECT_MAX_TIMES = 20 # uses Firefox default setting
 REDIRECT_PRIORITY_ADJUST = +2
diff --git a/scrapy/core/downloader/manager.py b/scrapy/core/downloader/manager.py
index b53db5811..aec17b05e 100644
--- a/scrapy/core/downloader/manager.py
+++ b/scrapy/core/downloader/manager.py
@@ -2,6 +2,7 @@
 Download web pages using asynchronous IO
 """
 
+import random
 from time import time
 
 from twisted.internet import reactor, defer
@@ -20,15 +21,21 @@ class SpiderInfo(object):
 
     def __init__(self, download_delay=None, max_concurrent_requests=None):
         if download_delay is None:
-            self.download_delay = settings.getfloat('DOWNLOAD_DELAY')
+            self._download_delay = settings.getfloat('DOWNLOAD_DELAY')
         else:
-            self.download_delay = download_delay
-        if self.download_delay:
+            self._download_delay = float(download_delay)
+        if self._download_delay:
             self.max_concurrent_requests = 1
         elif max_concurrent_requests is None:
             self.max_concurrent_requests = settings.getint('CONCURRENT_REQUESTS_PER_SPIDER')
         else:
             self.max_concurrent_requests = max_concurrent_requests
+        if self._download_delay and settings.getbool('RANDOMIZE_DOWNLOAD_DELAY'):
+            # same policy as wget --random-wait
+            self.random_delay_interval = (0.5*self._download_delay,
+                                          1.5*self._download_delay)
+        else:
+            self.random_delay_interval = None
 
         self.active = set()
         self.queue = []
@@ -44,6 +51,12 @@ class SpiderInfo(object):
         # use self.active to include requests in the downloader middleware
         return len(self.active) > 2 * self.max_concurrent_requests
 
+    def download_delay(self):
+        if self.random_delay_interval:
+            return random.uniform(*self.random_delay_interval)
+        else:
+            return self._download_delay
+
     def cancel_request_calls(self):
         for call in self.next_request_calls:
             call.cancel()
@@ -99,8 +112,9 @@ class Downloader(object):
 
         # Delay queue processing if a download_delay is configured
         now = time()
-        if site.download_delay:
-            penalty = site.download_delay - now + site.lastseen
+        delay = site.download_delay()
+        if delay:
+            penalty = delay - now + site.lastseen
             if penalty > 0:
                 d = defer.Deferred()
                 d.addCallback(self._process_queue)
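
For reference, the delay policy this patch adds can be tried in isolation. The
following is a minimal, self-contained sketch of the wget ``--random-wait``
behaviour that ``SpiderInfo.download_delay()`` implements above; the helper
name ``randomized_delay`` and the sample value are illustrative only, not part
of the Scrapy API::

    import random

    def randomized_delay(download_delay, randomize=True):
        # wget --random-wait policy: draw uniformly from
        # [0.5 * delay, 1.5 * delay], so waits average the configured delay
        if randomize and download_delay:
            return random.uniform(0.5 * download_delay, 1.5 * download_delay)
        return download_delay

    # with DOWNLOAD_DELAY = 0.25, each wait falls between 0.125s and 0.375s
    for _ in range(3):
        print(randomized_delay(0.25))

Because the draw is uniform and centered on the configured delay, the average
crawl rate stays the same while the request timings stop being periodic.
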
From 60961e54994a14f527e069b761d7e0f59568fb94 Mon Sep 17 00:00:00 2001
From: Pablo Hoffman
Date: Fri, 19 Feb 2010 23:09:48 -0200
Subject: [PATCH 2/3] minor documentation fix (refs #135)

---
 docs/topics/exporters.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/topics/exporters.rst b/docs/topics/exporters.rst
index 966724f89..07abbe9df 100644
--- a/docs/topics/exporters.rst
+++ b/docs/topics/exporters.rst
@@ -41,6 +41,7 @@ Here you can see an :doc:`Item Pipeline <topics/item-pipeline>` which uses an
 Item Exporter to export scraped items to different files, one per spider::
 
     from scrapy.xlib.pydispatch import dispatcher
+    from scrapy.core import signals
     from scrapy.contrib.exporter import XmlItemExporter
 
     class XmlExportPipeline(object):
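
The documented example this one-line fix belongs to is only partially visible
in the hunk above. For context, a fuller sketch of such a per-spider exporter
pipeline is shown below; everything past the three imports (the signal names,
``spider.domain_name``, and the ``process_item`` signature) is an assumption
based on Scrapy conventions of this era and may differ from the actual
documented example::

    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.core import signals
    from scrapy.contrib.exporter import XmlItemExporter

    class XmlExportPipeline(object):

        def __init__(self):
            # keep one open file and one exporter per running spider
            dispatcher.connect(self.spider_opened, signals.spider_opened)
            dispatcher.connect(self.spider_closed, signals.spider_closed)
            self.files = {}
            self.exporters = {}

        def spider_opened(self, spider):
            file = open('%s_items.xml' % spider.domain_name, 'w+b')
            self.files[spider] = file
            self.exporters[spider] = XmlItemExporter(file)
            self.exporters[spider].start_exporting()

        def spider_closed(self, spider):
            self.exporters.pop(spider).finish_exporting()
            self.files.pop(spider).close()

        def process_item(self, spider, item):
            self.exporters[spider].export_item(item)
            return item
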
From cb99edd153e7b90fda1058c13178b96f9dfee179 Mon Sep 17 00:00:00 2001
From: Pablo Hoffman
Date: Fri, 19 Feb 2010 23:16:55 -0200
Subject: [PATCH 3/3] simplified and improved AUTHORS file

---
 AUTHORS | 39 ++++++++++++++++++---------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index a0fbe722f..1392aa71f 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,28 +1,25 @@
 Scrapy was brought to life by Shane Evans while hacking a scraping framework
 prototype for Mydeco (mydeco.com). It soon became maintained, extended and
-improved by Insophia (insophia.com), with the sponsorship of By Design (the
-company behind Mydeco).
+improved by Insophia (insophia.com), with the initial sponsorship of Mydeco to
+bootstrap the project.
 
-Here is the list of the primary authors & contributors, along with their user
-name (in Scrapy trac/subversion). Emails are intentionally left out to avoid
-spam.
+Here is the list of the primary authors & contributors:
 
- * Pablo Hoffman (pablo)
- * Daniel Graña (daniel)
- * Martin Olveyra (olveyra)
- * Gabriel García (elpolilla)
- * Michael Cetrulo (samus_)
- * Artem Bogomyagkov (artem)
- * Damian Canabal (calarval)
- * Andres Moreira (andres)
- * Ismael Carnales (ismael)
- * Matías Aguirre (omab)
- * German Hoffman (german)
- * Anibal Pacheco (anibal)
+ * Pablo Hoffman
+ * Daniel Graña
+ * Martin Olveyra
+ * Gabriel García
+ * Michael Cetrulo
+ * Artem Bogomyagkov
+ * Damian Canabal
+ * Andres Moreira
+ * Ismael Carnales
+ * Matías Aguirre
+ * German Hoffmann
+ * Anibal Pacheco
  * Bruno Deferrari
  * Shane Evans
-
-And here is the list of people who have helped to put the Scrapy homepage live:
-
- * Ezequiel Rivero (ezequiel)
+ * Ezequiel Rivero
+ * Patrick Mezard
+ * Rolando Espinoza