commit bbef0fe870
Author: Pablo Hoffman
Date:   2010-02-20 11:12:37 -02:00

5 changed files with 70 additions and 26 deletions

AUTHORS

@@ -1,28 +1,25 @@
 Scrapy was brought to life by Shane Evans while hacking a scraping framework
 prototype for Mydeco (mydeco.com). It soon became maintained, extended and
-improved by Insophia (insophia.com), with the sponsorship of By Design (the
-company behind Mydeco).
+improved by Insophia (insophia.com), with the initial sponsorship of Mydeco to
+bootstrap the project.
 
-Here is the list of the primary authors & contributors, along with their user
-name (in Scrapy trac/subversion). Emails are intentionally left out to avoid
-spam.
+Here is the list of the primary authors & contributors:
 
-* Pablo Hoffman (pablo)
-* Daniel Graña (daniel)
-* Martin Olveyra (olveyra)
-* Gabriel García (elpolilla)
-* Michael Cetrulo (samus_)
-* Artem Bogomyagkov (artem)
-* Damian Canabal (calarval)
-* Andres Moreira (andres)
-* Ismael Carnales (ismael)
-* Matías Aguirre (omab)
-* German Hoffman (german)
-* Anibal Pacheco (anibal)
+* Pablo Hoffman
+* Daniel Graña
+* Martin Olveyra
+* Gabriel García
+* Michael Cetrulo
+* Artem Bogomyagkov
+* Damian Canabal
+* Andres Moreira
+* Ismael Carnales
+* Matías Aguirre
+* German Hoffmann
+* Anibal Pacheco
+* Bruno Deferrari
+* Shane Evans
 
 And here is the list of people who have helped to put the Scrapy homepage live:
 
-* Ezequiel Rivero (ezequiel)
+* Ezequiel Rivero
 * Patrick Mezard
 * Rolando Espinoza


@@ -41,6 +41,7 @@ Here you can see an :doc:`Item Pipeline <item-pipeline>` which uses an Item
 Exporter to export scraped items to different files, one per spider::
 
     from scrapy.xlib.pydispatch import dispatcher
+    from scrapy.core import signals
     from scrapy.contrib.exporter import XmlItemExporter
 
     class XmlExportPipeline(object):
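
For readers landing on this hunk without the surrounding doc page, here is a
minimal sketch of how such a per-spider exporter pipeline is typically
completed. Everything below the imports is illustrative, not part of this
commit; in particular the ``spider_opened``/``spider_closed`` signal names,
the ``domain_name`` attribute, and the ``process_item(self, spider, item)``
argument order all varied across early Scrapy versions::

    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.core import signals
    from scrapy.contrib.exporter import XmlItemExporter

    class XmlExportPipeline(object):

        def __init__(self):
            # one output file and one exporter per running spider
            dispatcher.connect(self.spider_opened, signals.spider_opened)
            dispatcher.connect(self.spider_closed, signals.spider_closed)
            self.files = {}
            self.exporters = {}

        def spider_opened(self, spider):
            file = open('%s_items.xml' % spider.domain_name, 'w+b')
            self.files[spider] = file
            self.exporters[spider] = XmlItemExporter(file)
            self.exporters[spider].start_exporting()

        def spider_closed(self, spider):
            self.exporters.pop(spider).finish_exporting()
            self.files.pop(spider).close()

        def process_item(self, spider, item):
            self.exporters[spider].export_item(item)
            return item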


@@ -418,6 +418,15 @@ supported. Example::
 
     DOWNLOAD_DELAY = 0.25    # 250 ms of delay
 
+This setting is also affected by the :setting:`RANDOMIZE_DOWNLOAD_DELAY`
+setting (which is enabled by default). By default, Scrapy doesn't wait a fixed
+amount of time between requests, but uses a random interval between 0.5 and
+1.5 * :setting:`DOWNLOAD_DELAY`.
+
 Another way to change the download delay (per spider, instead of globally) is
 by using the ``download_delay`` spider attribute, which takes precedence
 over this setting.
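
To make the per-spider override mentioned above concrete, a minimal sketch;
the class name and domain are made up, and ``BaseSpider``/``domain_name``
reflect the Scrapy 0.x API of this era::

    from scrapy.spider import BaseSpider

    class MySpider(BaseSpider):
        domain_name = 'example.com'
        # overrides the project-wide DOWNLOAD_DELAY for this spider only
        download_delay = 2.0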
 .. setting:: DOWNLOAD_TIMEOUT
 
 DOWNLOAD_TIMEOUT
@@ -677,6 +686,27 @@ Example::
 
     NEWSPIDER_MODULE = 'mybot.spiders_dev'
 
+.. setting:: RANDOMIZE_DOWNLOAD_DELAY
+
+RANDOMIZE_DOWNLOAD_DELAY
+------------------------
+
+Default: ``True``
+
+If enabled, Scrapy will wait a random amount of time (between 0.5 and
+1.5 * :setting:`DOWNLOAD_DELAY`) while fetching requests from the same
+spider.
+
+This randomization decreases the chance of the crawler being detected (and
+subsequently blocked) by sites which analyze requests looking for statistically
+significant similarities in the time between their requests.
+
+The randomization policy is the same one used by the `wget`_ ``--random-wait``
+option.
+
+If :setting:`DOWNLOAD_DELAY` is zero (the default) this option has no effect.
+
+.. _wget: http://www.gnu.org/software/wget/manual/wget.html
+
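The documented policy reduces to one line of arithmetic. A standalone sketch
(the ``effective_delay`` helper is hypothetical, mirroring the downloader
change further below)::

    import random

    def effective_delay(download_delay, randomize=True):
        # wget --random-wait policy: draw uniformly from
        # [0.5 * delay, 1.5 * delay]; the mean stays at the configured delay
        if randomize and download_delay:
            return random.uniform(0.5 * download_delay, 1.5 * download_delay)
        return download_delay

Averaged over many requests the spacing is unchanged, but it no longer forms
the fixed pattern that request-timing analysis looks for.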
 .. setting:: REDIRECT_MAX_TIMES
 
 REDIRECT_MAX_TIMES


@@ -122,6 +122,8 @@ MYSQL_CONNECTION_SETTINGS = {}
 
 NEWSPIDER_MODULE = ''
 
+RANDOMIZE_DOWNLOAD_DELAY = True
+
 REDIRECT_MAX_METAREFRESH_DELAY = 100
 REDIRECT_MAX_TIMES = 20  # uses Firefox default setting
 REDIRECT_PRIORITY_ADJUST = +2
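
As a usage note: a project that wants a delay but reproducible timing (say,
while debugging locally) would override the new default in its own settings
module. The values here are illustrative::

    # myproject/settings.py (hypothetical project)
    DOWNLOAD_DELAY = 2.0              # base delay between requests, in seconds
    RANDOMIZE_DOWNLOAD_DELAY = False  # opt out of the wget-style random wait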


@@ -2,6 +2,7 @@
 Download web pages using asynchronous IO
 """
+import random
 from time import time
 
 from twisted.internet import reactor, defer
@@ -20,15 +21,21 @@ class SpiderInfo(object):
 
     def __init__(self, download_delay=None, max_concurrent_requests=None):
         if download_delay is None:
-            self.download_delay = settings.getfloat('DOWNLOAD_DELAY')
+            self._download_delay = settings.getfloat('DOWNLOAD_DELAY')
         else:
-            self.download_delay = download_delay
-        if self.download_delay:
+            self._download_delay = float(download_delay)
+        if self._download_delay:
             self.max_concurrent_requests = 1
         elif max_concurrent_requests is None:
             self.max_concurrent_requests = settings.getint('CONCURRENT_REQUESTS_PER_SPIDER')
         else:
             self.max_concurrent_requests = max_concurrent_requests
+        if self._download_delay and settings.getbool('RANDOMIZE_DOWNLOAD_DELAY'):
+            # same policy as wget --random-wait
+            self.random_delay_interval = (0.5 * self._download_delay,
+                                          1.5 * self._download_delay)
+        else:
+            self.random_delay_interval = None
 
         self.active = set()
         self.queue = []
@@ -44,6 +51,12 @@ class SpiderInfo(object):
         # use self.active to include requests in the downloader middleware
         return len(self.active) > 2 * self.max_concurrent_requests
 
+    def download_delay(self):
+        if self.random_delay_interval:
+            return random.uniform(*self.random_delay_interval)
+        else:
+            return self._download_delay
+
     def cancel_request_calls(self):
         for call in self.next_request_calls:
             call.cancel()
@@ -99,8 +112,9 @@ class Downloader(object):
 
         # Delay queue processing if a download_delay is configured
         now = time()
-        if site.download_delay:
-            penalty = site.download_delay - now + site.lastseen
+        delay = site.download_delay()
+        if delay:
+            penalty = delay - now + site.lastseen
             if penalty > 0:
                 d = defer.Deferred()
                 d.addCallback(self._process_queue)
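
To make the scheduling arithmetic in this last hunk explicit: the penalty is
the required gap minus the time already elapsed since the site was last seen.
A standalone restatement (the ``queue_penalty`` name and signature are
hypothetical)::

    from time import time

    def queue_penalty(delay, lastseen, now=None):
        # seconds to keep the site's queue idle; <= 0 means download now
        now = time() if now is None else now
        return delay - (now - lastseen)

For example, with ``delay = 1.5`` and a site last seen 0.4 seconds ago, the
queue is re-processed after roughly 1.1 seconds.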