commit bbef0fe870
Author: Pablo Hoffman
Date:   2010-02-20 11:12:37 -02:00

5 changed files with 70 additions and 26 deletions

AUTHORS

@@ -1,28 +1,25 @@
 Scrapy was brought to life by Shane Evans while hacking a scraping framework
 prototype for Mydeco (mydeco.com). It soon became maintained, extended and
-improved by Insophia (insophia.com), with the sponsorship of By Design (the
-company behind Mydeco).
+improved by Insophia (insophia.com), with the initial sponsorship of Mydeco to
+bootstrap the project.
 
-Here is the list of the primary authors & contributors, along with their user
-name (in Scrapy trac/subversion). Emails are intentionally left out to avoid
-spam.
+Here is the list of the primary authors & contributors:
 
-* Pablo Hoffman (pablo)
-* Daniel Graña (daniel)
-* Martin Olveyra (olveyra)
-* Gabriel García (elpolilla)
-* Michael Cetrulo (samus_)
-* Artem Bogomyagkov (artem)
-* Damian Canabal (calarval)
-* Andres Moreira (andres)
-* Ismael Carnales (ismael)
-* Matías Aguirre (omab)
-* German Hoffman (german)
-* Anibal Pacheco (anibal)
+* Pablo Hoffman
+* Daniel Graña
+* Martin Olveyra
+* Gabriel García
+* Michael Cetrulo
+* Artem Bogomyagkov
+* Damian Canabal
+* Andres Moreira
+* Ismael Carnales
+* Matías Aguirre
+* German Hoffmann
+* Anibal Pacheco
+* Bruno Deferrari
+* Shane Evans
 
 And here is the list of people who have helped to put the Scrapy homepage live:
 
-* Ezequiel Rivero (ezequiel)
+* Ezequiel Rivero
 * Patrick Mezard
 * Rolando Espinoza


@@ -41,6 +41,7 @@ Here you can see an :doc:`Item Pipeline <item-pipeline>` which uses an Item
 Exporter to export scraped items to different files, one per spider::
 
     from scrapy.xlib.pydispatch import dispatcher
+    from scrapy.core import signals
     from scrapy.contrib.exporter import XmlItemExporter
 
     class XmlExportPipeline(object):
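
For readers landing on this hunk without the surrounding doc page, here is a
minimal sketch of how such a per-spider exporter pipeline is typically
completed. Everything below the imports is illustrative, not part of this
commit; in particular the ``spider_opened``/``spider_closed`` signal names,
the ``domain_name`` attribute, and the ``process_item(self, spider, item)``
argument order all varied across early Scrapy versions::

    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.core import signals
    from scrapy.contrib.exporter import XmlItemExporter

    class XmlExportPipeline(object):

        def __init__(self):
            # one output file and one exporter per running spider
            dispatcher.connect(self.spider_opened, signals.spider_opened)
            dispatcher.connect(self.spider_closed, signals.spider_closed)
            self.files = {}
            self.exporters = {}

        def spider_opened(self, spider):
            file = open('%s_items.xml' % spider.domain_name, 'w+b')
            self.files[spider] = file
            self.exporters[spider] = XmlItemExporter(file)
            self.exporters[spider].start_exporting()

        def spider_closed(self, spider):
            self.exporters.pop(spider).finish_exporting()
            self.files.pop(spider).close()

        def process_item(self, spider, item):
            self.exporters[spider].export_item(item)
            return item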


@@ -418,6 +418,15 @@ supported. Example::
 
     DOWNLOAD_DELAY = 0.25    # 250 ms of delay
 
+This setting is also affected by the :setting:`RANDOMIZE_DOWNLOAD_DELAY`
+setting (which is enabled by default). By default, Scrapy doesn't wait a fixed
+amount of time between requests, but uses a random interval between 0.5 and
+1.5 * :setting:`DOWNLOAD_DELAY`.
+
 Another way to change the download delay (per spider, instead of globally) is
 by using the ``download_delay`` spider attribute, which takes precedence
 over this setting.
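
To make the per-spider override mentioned above concrete, a minimal sketch;
the class name and domain are made up, and ``BaseSpider``/``domain_name``
reflect the Scrapy 0.x API of this era::

    from scrapy.spider import BaseSpider

    class MySpider(BaseSpider):
        domain_name = 'example.com'
        # overrides the project-wide DOWNLOAD_DELAY for this spider only
        download_delay = 2.0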
 .. setting:: DOWNLOAD_TIMEOUT
 
 DOWNLOAD_TIMEOUT
@@ -677,6 +686,27 @@ Example::
 
     NEWSPIDER_MODULE = 'mybot.spiders_dev'
 
+.. setting:: RANDOMIZE_DOWNLOAD_DELAY
+
+RANDOMIZE_DOWNLOAD_DELAY
+------------------------
+
+Default: ``True``
+
+If enabled, Scrapy will wait a random amount of time (between 0.5 and
+1.5 * :setting:`DOWNLOAD_DELAY`) while fetching requests from the same
+spider.
+
+This randomization decreases the chance of the crawler being detected (and
+subsequently blocked) by sites which analyze requests looking for statistically
+significant similarities in the time between their requests.
+
+The randomization policy is the same one used by the `wget`_ ``--random-wait``
+option.
+
+If :setting:`DOWNLOAD_DELAY` is zero (the default) this option has no effect.
+
+.. _wget: http://www.gnu.org/software/wget/manual/wget.html
+
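The documented policy reduces to one line of arithmetic. A standalone sketch
(the ``effective_delay`` helper is hypothetical, mirroring the downloader
change further below)::

    import random

    def effective_delay(download_delay, randomize=True):
        # wget --random-wait policy: draw uniformly from
        # [0.5 * delay, 1.5 * delay]; the mean stays at the configured delay
        if randomize and download_delay:
            return random.uniform(0.5 * download_delay, 1.5 * download_delay)
        return download_delay

Averaged over many requests the spacing is unchanged, but it no longer forms
the fixed pattern that request-timing analysis looks for.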
 .. setting:: REDIRECT_MAX_TIMES
 
 REDIRECT_MAX_TIMES


@@ -122,6 +122,8 @@ MYSQL_CONNECTION_SETTINGS = {}
 
 NEWSPIDER_MODULE = ''
 
+RANDOMIZE_DOWNLOAD_DELAY = True
+
 REDIRECT_MAX_METAREFRESH_DELAY = 100
 REDIRECT_MAX_TIMES = 20  # uses Firefox default setting
 REDIRECT_PRIORITY_ADJUST = +2
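
As a usage note: a project that wants a delay but reproducible timing (say,
while debugging locally) would override the new default in its own settings
module. The values here are illustrative::

    # myproject/settings.py (hypothetical project)
    DOWNLOAD_DELAY = 2.0              # base delay between requests, in seconds
    RANDOMIZE_DOWNLOAD_DELAY = False  # opt out of the wget-style random wait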


@@ -2,6 +2,7 @@
 Download web pages using asynchronous IO
 """
+import random
 from time import time
 
 from twisted.internet import reactor, defer
@@ -20,15 +21,21 @@ class SpiderInfo(object):
 
     def __init__(self, download_delay=None, max_concurrent_requests=None):
         if download_delay is None:
-            self.download_delay = settings.getfloat('DOWNLOAD_DELAY')
+            self._download_delay = settings.getfloat('DOWNLOAD_DELAY')
         else:
-            self.download_delay = download_delay
-        if self.download_delay:
+            self._download_delay = float(download_delay)
+        if self._download_delay:
             self.max_concurrent_requests = 1
         elif max_concurrent_requests is None:
             self.max_concurrent_requests = settings.getint('CONCURRENT_REQUESTS_PER_SPIDER')
         else:
             self.max_concurrent_requests = max_concurrent_requests
+        if self._download_delay and settings.getbool('RANDOMIZE_DOWNLOAD_DELAY'):
+            # same policy as wget --random-wait
+            self.random_delay_interval = (0.5 * self._download_delay,
+                                          1.5 * self._download_delay)
+        else:
+            self.random_delay_interval = None
 
         self.active = set()
         self.queue = []
@@ -44,6 +51,12 @@ class SpiderInfo(object):
         # use self.active to include requests in the downloader middleware
         return len(self.active) > 2 * self.max_concurrent_requests
 
+    def download_delay(self):
+        if self.random_delay_interval:
+            return random.uniform(*self.random_delay_interval)
+        else:
+            return self._download_delay
+
     def cancel_request_calls(self):
         for call in self.next_request_calls:
             call.cancel()
@@ -99,8 +112,9 @@ class Downloader(object):
 
         # Delay queue processing if a download_delay is configured
         now = time()
-        if site.download_delay:
-            penalty = site.download_delay - now + site.lastseen
+        delay = site.download_delay()
+        if delay:
+            penalty = delay - now + site.lastseen
             if penalty > 0:
                 d = defer.Deferred()
                 d.addCallback(self._process_queue)
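
To make the scheduling arithmetic in this last hunk explicit: the penalty is
the required gap minus the time already elapsed since the site was last seen.
A standalone restatement (the ``queue_penalty`` name and signature are
hypothetical)::

    from time import time

    def queue_penalty(delay, lastseen, now=None):
        # seconds to keep the site's queue idle; <= 0 means download now
        now = time() if now is None else now
        return delay - (now - lastseen)

For example, with ``delay = 1.5`` and a site last seen 0.4 seconds ago, the
queue is re-processed after roughly 1.1 seconds.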