From c1f8198639eab1b8ac3adffd68e5881ad020e4da Mon Sep 17 00:00:00 2001
From: Pablo Hoffman
Date: Fri, 19 Feb 2010 21:53:18 -0200
Subject: [PATCH 1/3] Added RANDOMIZE_DOWNLOAD_DELAY setting

---
 docs/topics/settings.rst          | 30 ++++++++++++++++++++++++++++++
 scrapy/conf/default_settings.py   |  2 ++
 scrapy/core/downloader/manager.py | 24 +++++++++++++++++++-----
 3 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/docs/topics/settings.rst b/docs/topics/settings.rst
index 4b3dbc78f..d7a7d7fcf 100644
--- a/docs/topics/settings.rst
+++ b/docs/topics/settings.rst
@@ -418,6 +418,15 @@ supported. Example::
 
     DOWNLOAD_DELAY = 0.25    # 250 ms of delay
 
+This setting is also affected by the :setting:`RANDOMIZE_DOWNLOAD_DELAY`
+setting (which is enabled by default). By default, Scrapy doesn't wait a fixed
+amount of time between requests, but uses a random interval between 0.5 and 1.5
+* :setting:`DOWNLOAD_DELAY`.
+
+Another way to change the download delay (per spider, instead of globally) is
+by using the ``download_delay`` spider attribute, which takes precedence over
+this setting.
+
 .. setting:: DOWNLOAD_TIMEOUT
 
 DOWNLOAD_TIMEOUT
@@ -677,6 +686,27 @@ Example::
 
     NEWSPIDER_MODULE = 'mybot.spiders_dev'
 
+.. setting:: RANDOMIZE_DOWNLOAD_DELAY
+
+RANDOMIZE_DOWNLOAD_DELAY
+------------------------
+
+Default: ``True``
+
+If enabled, Scrapy will wait a random amount of time (between 0.5 and 1.5
+* :setting:`DOWNLOAD_DELAY`) while fetching requests from the same spider.
+
+This randomization decreases the chance of the crawler being detected (and
+subsequently blocked) by sites which analyze requests looking for statistically
+significant similarities in the time between their requests.
+
+The randomization policy is the same one used by the `wget`_ ``--random-wait``
+option.
+
+If :setting:`DOWNLOAD_DELAY` is zero (the default) this option has no effect.
+
+.. _wget: http://www.gnu.org/software/wget/manual/wget.html
+
 .. setting:: REDIRECT_MAX_TIMES
 
 REDIRECT_MAX_TIMES
diff --git a/scrapy/conf/default_settings.py b/scrapy/conf/default_settings.py
index 8892e41b4..76feb0c6f 100644
--- a/scrapy/conf/default_settings.py
+++ b/scrapy/conf/default_settings.py
@@ -122,6 +122,8 @@ MYSQL_CONNECTION_SETTINGS = {}
 
 NEWSPIDER_MODULE = ''
 
+RANDOMIZE_DOWNLOAD_DELAY = True
+
 REDIRECT_MAX_METAREFRESH_DELAY = 100
 REDIRECT_MAX_TIMES = 20 # uses Firefox default setting
 REDIRECT_PRIORITY_ADJUST = +2
diff --git a/scrapy/core/downloader/manager.py b/scrapy/core/downloader/manager.py
index b53db5811..aec17b05e 100644
--- a/scrapy/core/downloader/manager.py
+++ b/scrapy/core/downloader/manager.py
@@ -2,6 +2,7 @@
 Download web pages using asynchronous IO
 """
 
+import random
 from time import time
 
 from twisted.internet import reactor, defer
@@ -20,15 +21,21 @@ class SpiderInfo(object):
 
     def __init__(self, download_delay=None, max_concurrent_requests=None):
         if download_delay is None:
-            self.download_delay = settings.getfloat('DOWNLOAD_DELAY')
+            self._download_delay = settings.getfloat('DOWNLOAD_DELAY')
         else:
-            self.download_delay = download_delay
-        if self.download_delay:
+            self._download_delay = float(download_delay)
+        if self._download_delay:
             self.max_concurrent_requests = 1
         elif max_concurrent_requests is None:
             self.max_concurrent_requests = settings.getint('CONCURRENT_REQUESTS_PER_SPIDER')
         else:
             self.max_concurrent_requests = max_concurrent_requests
+        if self._download_delay and settings.getbool('RANDOMIZE_DOWNLOAD_DELAY'):
+            # same policy as wget --random-wait
+            self.random_delay_interval = (0.5*self._download_delay,
+                                          1.5*self._download_delay)
+        else:
+            self.random_delay_interval = None
 
         self.active = set()
         self.queue = []
@@ -44,6 +51,12 @@ class SpiderInfo(object):
         # use self.active to include requests in the downloader middleware
         return len(self.active) > 2 * self.max_concurrent_requests
 
+    def download_delay(self):
+        if self.random_delay_interval:
+            return random.uniform(*self.random_delay_interval)
+        else:
+            return self._download_delay
+
     def cancel_request_calls(self):
         for call in self.next_request_calls:
             call.cancel()
@@ -99,8 +112,9 @@ class Downloader(object):
 
         # Delay queue processing if a download_delay is configured
         now = time()
-        if site.download_delay:
-            penalty = site.download_delay - now + site.lastseen
+        delay = site.download_delay()
+        if delay:
+            penalty = delay - now + site.lastseen
             if penalty > 0:
                 d = defer.Deferred()
                 d.addCallback(self._process_queue)
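
For reference, the delay policy this patch adds can be tried in isolation. The
following is a minimal, self-contained sketch of the wget ``--random-wait``
behaviour that ``SpiderInfo.download_delay()`` implements above; the helper
name ``randomized_delay`` and the sample value are illustrative only, not part
of the Scrapy API::

    import random

    def randomized_delay(download_delay, randomize=True):
        # wget --random-wait policy: draw uniformly from
        # [0.5 * delay, 1.5 * delay], so waits average the configured delay
        if randomize and download_delay:
            return random.uniform(0.5 * download_delay, 1.5 * download_delay)
        return download_delay

    # with DOWNLOAD_DELAY = 0.25, each wait falls between 0.125s and 0.375s
    for _ in range(3):
        print(randomized_delay(0.25))

Because the draw is uniform and centered on the configured delay, the average
crawl rate stays the same while the request timings stop being periodic.
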
From 60961e54994a14f527e069b761d7e0f59568fb94 Mon Sep 17 00:00:00 2001
From: Pablo Hoffman
Date: Fri, 19 Feb 2010 23:09:48 -0200
Subject: [PATCH 2/3] minor documentation fix (refs #135)

---
 docs/topics/exporters.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/topics/exporters.rst b/docs/topics/exporters.rst
index 966724f89..07abbe9df 100644
--- a/docs/topics/exporters.rst
+++ b/docs/topics/exporters.rst
@@ -41,6 +41,7 @@ Here you can see an :doc:`Item Pipeline <topics/item-pipeline>` which uses an
 Item Exporter to export scraped items to different files, one per spider::
 
     from scrapy.xlib.pydispatch import dispatcher
+    from scrapy.core import signals
     from scrapy.contrib.exporter import XmlItemExporter
 
     class XmlExportPipeline(object):
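
The documented example this one-line fix belongs to is only partially visible
in the hunk above. For context, a fuller sketch of such a per-spider exporter
pipeline is shown below; everything past the three imports (the signal names,
``spider.domain_name``, and the ``process_item`` signature) is an assumption
based on Scrapy conventions of this era and may differ from the actual
documented example::

    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.core import signals
    from scrapy.contrib.exporter import XmlItemExporter

    class XmlExportPipeline(object):

        def __init__(self):
            # keep one open file and one exporter per running spider
            dispatcher.connect(self.spider_opened, signals.spider_opened)
            dispatcher.connect(self.spider_closed, signals.spider_closed)
            self.files = {}
            self.exporters = {}

        def spider_opened(self, spider):
            file = open('%s_items.xml' % spider.domain_name, 'w+b')
            self.files[spider] = file
            self.exporters[spider] = XmlItemExporter(file)
            self.exporters[spider].start_exporting()

        def spider_closed(self, spider):
            self.exporters.pop(spider).finish_exporting()
            self.files.pop(spider).close()

        def process_item(self, spider, item):
            self.exporters[spider].export_item(item)
            return item
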
From cb99edd153e7b90fda1058c13178b96f9dfee179 Mon Sep 17 00:00:00 2001
From: Pablo Hoffman
Date: Fri, 19 Feb 2010 23:16:55 -0200
Subject: [PATCH 3/3] simplified and improved AUTHORS file

---
 AUTHORS | 39 ++++++++++++++++++---------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index a0fbe722f..1392aa71f 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,28 +1,25 @@
 Scrapy was brought to life by Shane Evans while hacking a scraping framework
 prototype for Mydeco (mydeco.com). It soon became maintained, extended and
-improved by Insophia (insophia.com), with the sponsorship of By Design (the
-company behind Mydeco).
+improved by Insophia (insophia.com), with the initial sponsorship of Mydeco to
+bootstrap the project.
 
-Here is the list of the primary authors & contributors, along with their user
-name (in Scrapy trac/subversion). Emails are intentionally left out to avoid
-spam.
+Here is the list of the primary authors & contributors:
 
- * Pablo Hoffman (pablo)
- * Daniel Graña (daniel)
- * Martin Olveyra (olveyra)
- * Gabriel García (elpolilla)
- * Michael Cetrulo (samus_)
- * Artem Bogomyagkov (artem)
- * Damian Canabal (calarval)
- * Andres Moreira (andres)
- * Ismael Carnales (ismael)
- * Matías Aguirre (omab)
- * German Hoffman (german)
- * Anibal Pacheco (anibal)
+ * Pablo Hoffman
+ * Daniel Graña
+ * Martin Olveyra
+ * Gabriel García
+ * Michael Cetrulo
+ * Artem Bogomyagkov
+ * Damian Canabal
+ * Andres Moreira
+ * Ismael Carnales
+ * Matías Aguirre
+ * German Hoffmann
+ * Anibal Pacheco
  * Bruno Deferrari
  * Shane Evans
-
-And here is the list of people who have helped to put the Scrapy homepage live:
-
- * Ezequiel Rivero (ezequiel)
+ * Ezequiel Rivero
+ * Patrick Mezard
+ * Rolando Espinoza