Mirror of https://github.com/scrapy/scrapy.git, synced 2025-02-24 21:24:19 +00:00
Automated merge with http://hg.scrapy.org/users/rolando/scrapy/
Commit bbef0fe870

AUTHORS (39 changed lines)
@@ -1,28 +1,25 @@
 Scrapy was brought to life by Shane Evans while hacking a scraping framework
 prototype for Mydeco (mydeco.com). It soon became maintained, extended and
-improved by Insophia (insophia.com), with the sponsorship of By Design (the
-company behind Mydeco).
+improved by Insophia (insophia.com), with the initial sponsorship of Mydeco to
+bootstrap the project.

-Here is the list of the primary authors & contributors, along with their user
-name (in Scrapy trac/subversion). Emails are intentionally left out to avoid
-spam.
+Here is the list of the primary authors & contributors:

-* Pablo Hoffman (pablo)
-* Daniel Graña (daniel)
-* Martin Olveyra (olveyra)
-* Gabriel García (elpolilla)
-* Michael Cetrulo (samus_)
-* Artem Bogomyagkov (artem)
-* Damian Canabal (calarval)
-* Andres Moreira (andres)
-* Ismael Carnales (ismael)
-* Matías Aguirre (omab)
-* German Hoffman (german)
-* Anibal Pacheco (anibal)
+* Pablo Hoffman
+* Daniel Graña
+* Martin Olveyra
+* Gabriel García
+* Michael Cetrulo
+* Artem Bogomyagkov
+* Damian Canabal
+* Andres Moreira
+* Ismael Carnales
+* Matías Aguirre
+* German Hoffmann
+* Anibal Pacheco
+* Bruno Deferrari
+* Shane Evans

 And here is the list of people who have helped to put the Scrapy homepage live:

-* Ezequiel Rivero (ezequiel)
+* Ezequiel Rivero
+* Patrick Mezard
+* Rolando Espinoza
@@ -41,6 +41,7 @@ Here you can see an :doc:`Item Pipeline <item-pipeline>` which uses an Item
Exporter to export scraped items to different files, one per spider::

    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.core import signals
    from scrapy.contrib.exporter import XmlItemExporter

    class XmlExportPipeline(object):
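
The hunk is truncated at the class definition. For reference, a minimal sketch of how such a pipeline is typically completed (the ``spider_opened``/``spider_closed`` signal names, the ``domain_name`` attribute, the ``process_item`` argument order and the file naming are assumptions based on the Scrapy API of this period, not part of this diff)::

    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.core import signals
    from scrapy.contrib.exporter import XmlItemExporter

    class XmlExportPipeline(object):
        """Export each spider's scraped items to its own XML file."""

        def __init__(self):
            # open/close one exporter per spider on the spider lifecycle signals
            dispatcher.connect(self.spider_opened, signals.spider_opened)
            dispatcher.connect(self.spider_closed, signals.spider_closed)
            self.files = {}
            self.exporters = {}

        def spider_opened(self, spider):
            f = open('%s_items.xml' % spider.domain_name, 'w+b')
            self.files[spider] = f
            self.exporters[spider] = XmlItemExporter(f)
            self.exporters[spider].start_exporting()

        def spider_closed(self, spider):
            self.exporters[spider].finish_exporting()
            self.files.pop(spider).close()

        def process_item(self, spider, item):
            # note: later Scrapy versions use process_item(self, item, spider)
            self.exporters[spider].export_item(item)
            return item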
@@ -418,6 +418,15 @@ supported. Example::

    DOWNLOAD_DELAY = 0.25    # 250 ms of delay

This setting is also affected by the :setting:`RANDOMIZE_DOWNLOAD_DELAY`
setting (which is enabled by default). When it is enabled, Scrapy doesn't wait
a fixed amount of time between requests, but uses a random interval between
0.5 and 1.5 * :setting:`DOWNLOAD_DELAY`.

Another way to change the download delay (per spider, instead of globally) is
by using the ``download_delay`` spider attribute, which takes precedence over
this setting.
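
For illustration, a per-spider override would look roughly like this (the ``BaseSpider`` base class and ``domain_name`` attribute reflect the spider API of this Scrapy generation, and ``MySpider`` / ``example.com`` are hypothetical, not part of this change)::

    from scrapy.spider import BaseSpider

    class MySpider(BaseSpider):
        domain_name = 'example.com'
        start_urls = ['http://example.com/']
        download_delay = 2   # seconds; overrides DOWNLOAD_DELAY for this spider only

        def parse(self, response):
            pass
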
.. setting:: DOWNLOAD_TIMEOUT

DOWNLOAD_TIMEOUT
@@ -677,6 +686,27 @@ Example::

    NEWSPIDER_MODULE = 'mybot.spiders_dev'

.. setting:: RANDOMIZE_DOWNLOAD_DELAY

RANDOMIZE_DOWNLOAD_DELAY
------------------------

Default: ``True``

If enabled, Scrapy will wait a random amount of time (between 0.5 and
1.5 * :setting:`DOWNLOAD_DELAY`) while fetching requests from the same
spider.

This randomization decreases the chance of the crawler being detected (and
subsequently blocked) by sites which analyze requests looking for statistically
significant similarities in the time between their requests.

The randomization policy is the same one used by the `wget`_ ``--random-wait``
option.

If :setting:`DOWNLOAD_DELAY` is zero (the default), this option has no effect.
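
In other words, when :setting:`DOWNLOAD_DELAY` is non-zero the delay applied to each request is drawn uniformly from ``[0.5 * DOWNLOAD_DELAY, 1.5 * DOWNLOAD_DELAY]``. A standalone sketch of the computation, mirroring the downloader change later in this commit (function name and signature are illustrative only)::

    import random

    def effective_delay(download_delay, randomize=True):
        # wget --random-wait policy: uniform in [0.5*d, 1.5*d]
        if randomize and download_delay:
            return random.uniform(0.5 * download_delay, 1.5 * download_delay)
        return download_delay

    # e.g. DOWNLOAD_DELAY = 0.25 gives a per-request wait of 0.125 to 0.375 seconds
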
.. _wget: http://www.gnu.org/software/wget/manual/wget.html

.. setting:: REDIRECT_MAX_TIMES

REDIRECT_MAX_TIMES
@@ -122,6 +122,8 @@ MYSQL_CONNECTION_SETTINGS = {}

 NEWSPIDER_MODULE = ''

+RANDOMIZE_DOWNLOAD_DELAY = True
+
 REDIRECT_MAX_METAREFRESH_DELAY = 100
 REDIRECT_MAX_TIMES = 20 # uses Firefox default setting
 REDIRECT_PRIORITY_ADJUST = +2
@@ -2,6 +2,7 @@
 Download web pages using asynchronous IO
 """

+import random
 from time import time

 from twisted.internet import reactor, defer
@@ -20,15 +21,21 @@ class SpiderInfo(object):

     def __init__(self, download_delay=None, max_concurrent_requests=None):
         if download_delay is None:
-            self.download_delay = settings.getfloat('DOWNLOAD_DELAY')
+            self._download_delay = settings.getfloat('DOWNLOAD_DELAY')
         else:
-            self.download_delay = download_delay
-        if self.download_delay:
+            self._download_delay = float(download_delay)
+        if self._download_delay:
             self.max_concurrent_requests = 1
         elif max_concurrent_requests is None:
             self.max_concurrent_requests = settings.getint('CONCURRENT_REQUESTS_PER_SPIDER')
         else:
             self.max_concurrent_requests = max_concurrent_requests
+        if self._download_delay and settings.getbool('RANDOMIZE_DOWNLOAD_DELAY'):
+            # same policy as wget --random-wait
+            self.random_delay_interval = (0.5*self._download_delay, \
+                1.5*self._download_delay)
+        else:
+            self.random_delay_interval = None

         self.active = set()
         self.queue = []
@@ -44,6 +51,12 @@ class SpiderInfo(object):
         # use self.active to include requests in the downloader middleware
         return len(self.active) > 2 * self.max_concurrent_requests

+    def download_delay(self):
+        if self.random_delay_interval:
+            return random.uniform(*self.random_delay_interval)
+        else:
+            return self._download_delay
+
     def cancel_request_calls(self):
         for call in self.next_request_calls:
             call.cancel()
@@ -99,8 +112,9 @@ class Downloader(object):

         # Delay queue processing if a download_delay is configured
         now = time()
-        if site.download_delay:
-            penalty = site.download_delay - now + site.lastseen
+        delay = site.download_delay()
+        if delay:
+            penalty = delay - now + site.lastseen
             if penalty > 0:
                 d = defer.Deferred()
                 d.addCallback(self._process_queue)
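
For intuition, the ``penalty`` computed in the last hunk is simply the portion of the (possibly randomized) delay that has not yet elapsed since the site's last request. A standalone sketch, with illustrative names rather than the Downloader's real attributes:

    from time import time

    def remaining_penalty(delay, lastseen):
        # seconds still to wait before the next request; 0 if the delay already passed
        penalty = delay - (time() - lastseen)
        return max(penalty, 0.0)

    # e.g. a drawn delay of 1.6s with the last request made 0.9s ago -> wait ~0.7s more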