Documented AutoThrottle extension and added it to the extensions available by
default. Also deprecated its concurrency and delay settings in favour of the
standard Scrapy ones.
commit b46b5a6ef0 (parent c1cbc5de3f)
@@ -135,6 +135,7 @@ Solving specific problems

   topics/images
   topics/ubuntu
   topics/scrapyd
   topics/autothrottle
   topics/jobs
   topics/djangoitem
@@ -162,6 +163,9 @@ Solving specific problems

:doc:`topics/scrapyd`
    Deploying your Scrapy project in production.

:doc:`topics/autothrottle`
    Adjust crawl rate dynamically based on load.

:doc:`topics/jobs`
    Learn how to pause and resume crawls for large spiders.
@@ -6,6 +6,7 @@ Release notes

Scrapy changes:

- documented :doc:`topics/autothrottle` and added it to the extensions installed by default. You still need to enable it with :setting:`AUTOTHROTTLE_ENABLED`
- major Stats Collection refactoring: removed the separation of global and per-spider stats, and removed the stats-related signals (``stats_spider_opened``, etc). Stats are much simpler now; backwards compatibility is kept on the Stats Collector API and signals.
- added :meth:`~scrapy.contrib.spidermiddleware.SpiderMiddleware.process_start_requests` method to spider middlewares (see the sketch below)
- dropped the Signals singleton. Signals should now be accessed through the Crawler.signals attribute. See the signals documentation for more info.
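
As a rough illustration of the new spider middleware hook mentioned above, a
middleware can annotate or rewrite start requests before they are scheduled.
This is a minimal sketch: the middleware name and meta key are hypothetical,
and only the ``process_start_requests(start_requests, spider)`` signature
comes from the release note::

    class StartRequestTaggerMiddleware(object):
        """Hypothetical middleware: tag every start request."""

        def process_start_requests(self, start_requests, spider):
            # Must return an iterable of Requests; here each start
            # request is annotated and passed through unchanged.
            for request in start_requests:
                request.meta['is_start_request'] = True
                yield request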

docs/topics/autothrottle.rst (new file, 102 lines)

@@ -0,0 +1,102 @@

======================
AutoThrottle extension
======================

This is an extension for automatically throttling crawling speed based on the
load of both the Scrapy server and the website you are crawling.

Design goals
============

1. be nicer to sites instead of using the default download delay of zero
2. automatically adjust Scrapy to the optimum crawling speed, so the user
   doesn't have to tune download delays and concurrent requests to find the
   optimum. The user only needs to specify the maximum concurrent requests
   allowed, and the extension does the rest.

How it works
============

In Scrapy, the download latency is measured as the time elapsed between
establishing the TCP connection and receiving the HTTP headers.

Note that these latencies are very hard to measure accurately in a cooperative
multitasking environment because Scrapy may be busy processing a spider
callback, for example, and unable to attend to downloads. However, these
latencies should still give a reasonable estimate of how busy Scrapy (and,
ultimately, the server) is, and this extension builds on that premise.
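
As a rough illustration of where this measurement surfaces, a spider callback
can read the latency Scrapy records for each response (a minimal sketch,
assuming the ``download_latency`` meta key is populated by the downloader)::

    def parse(self, response):
        # Seconds between establishing the TCP connection and receiving
        # the HTTP headers, as recorded for this response.
        latency = response.meta.get('download_latency')
        self.log("download latency for %s: %s" % (response.url, latency))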

.. _autothrottle-algorithm:

Throttling algorithm
====================

The algorithm adjusts download delays and concurrency based on the following
rules (a sketch of the delay update follows the note below):

1. spiders always start with one concurrent request and a download delay of
   :setting:`AUTOTHROTTLE_START_DELAY`
2. when a response is received, the download delay is adjusted to the average
   of the previous download delay and the latency of the response
3. after :setting:`AUTOTHROTTLE_CONCURRENCY_CHECK_PERIOD` responses have
   passed, the average latency of this period is checked against the previous
   one and:

   * if the latency remained constant (within standard deviation limits), the
     concurrency is increased
   * if the latency has increased (beyond standard deviation limits) and the
     concurrency is higher than 1, the concurrency is decreased

.. note:: The AutoThrottle extension honours the standard Scrapy settings for
   concurrency and delay. This means that it will never set a download delay
   lower than :setting:`DOWNLOAD_DELAY` or a concurrency higher than
   :setting:`CONCURRENT_REQUESTS_PER_DOMAIN` (or
   :setting:`CONCURRENT_REQUESTS_PER_IP`, depending on which one you use).
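
The delay update in rule 2, combined with the lower bound described in the
note, can be sketched as follows (an illustrative reading of the rules above,
not the extension's exact code)::

    def adjust_delay(prev_delay, latency, min_delay):
        # Rule 2: move the delay towards the observed latency by averaging.
        new_delay = (prev_delay + latency) / 2.0
        # Honour the standard settings: never go below DOWNLOAD_DELAY.
        return max(new_delay, min_delay)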

Settings
========

The settings used to control the AutoThrottle extension are:

* :setting:`AUTOTHROTTLE_ENABLED`
* :setting:`AUTOTHROTTLE_START_DELAY`
* :setting:`AUTOTHROTTLE_CONCURRENCY_CHECK_PERIOD`
* :setting:`AUTOTHROTTLE_DEBUG`
* :setting:`DOWNLOAD_DELAY`
* :setting:`CONCURRENT_REQUESTS_PER_DOMAIN`
* :setting:`CONCURRENT_REQUESTS_PER_IP`

For more information see :ref:`autothrottle-algorithm`.
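
For example, a minimal configuration enabling the extension could look like
this (illustrative values only; every setting used here is documented below)::

    # settings.py
    AUTOTHROTTLE_ENABLED = True             # the extension is off by default
    AUTOTHROTTLE_START_DELAY = 5.0          # initial download delay, in seconds
    AUTOTHROTTLE_CONCURRENCY_CHECK_PERIOD = 10
    DOWNLOAD_DELAY = 0.5                    # lower bound for the throttled delay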

.. setting:: AUTOTHROTTLE_ENABLED

AUTOTHROTTLE_ENABLED
~~~~~~~~~~~~~~~~~~~~

Default: ``False``

Enables the AutoThrottle extension.

.. setting:: AUTOTHROTTLE_START_DELAY

AUTOTHROTTLE_START_DELAY
~~~~~~~~~~~~~~~~~~~~~~~~

Default: ``5.0``

The initial download delay (in seconds).

.. setting:: AUTOTHROTTLE_CONCURRENCY_CHECK_PERIOD

AUTOTHROTTLE_CONCURRENCY_CHECK_PERIOD
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Default: ``10``

How many responses should pass between concurrency adjustments.

.. setting:: AUTOTHROTTLE_DEBUG

AUTOTHROTTLE_DEBUG
~~~~~~~~~~~~~~~~~~

Default: ``False``

Enable AutoThrottle debug mode, which will display stats on every response
received, so you can see how the throttling parameters are being adjusted in
real time.

@@ -435,9 +435,9 @@ Default::

        'scrapy.contrib.memdebug.MemoryDebugger': 0,
        'scrapy.contrib.closespider.CloseSpider': 0,
        'scrapy.contrib.feedexport.FeedExporter': 0,
        'scrapy.contrib.spidercontext.SpiderContext': 0,
        'scrapy.contrib.logstats.LogStats': 0,
        'scrapy.contrib.spiderstate.SpiderState': 0,
        'scrapy.contrib.throttle.AutoThrottle': 0,
    }

The list of available extensions. Keep in mind that some of them need to

@@ -4,58 +4,6 @@ from scrapy.utils.httpobj import urlparse_cached

from scrapy.resolver import dnscache

class AutoThrottle(object):
    """
    ============
    AutoThrottle
    ============

    This is an extension for automatically throttling crawling speed based on
    load.

    Design goals
    ============

    1. be nicer to sites instead of using default download delay of zero

    2. automatically adjust scrapy to the optimum crawling speed, so the user
       doesn't have to tune the download delays and concurrent requests to find
       the optimum one. the user only needs to specify the maximum concurrent
       requests it allows, and the extension does the rest.

    Download latencies
    ==================

    In Scrapy, the download latency is the (real) time elapsed between
    establishing the TCP connection and receiving the HTTP headers.

    Note that these latencies are very hard to measure accurately in a
    cooperative multitasking environment because Scrapy may be busy processing
    a spider callback, for example, and unable to attend downloads. However,
    the latencies should give a reasonable estimate of how busy Scrapy (and
    ultimately, the server) is. This extension builds on that premise.

    Throttling rules
    ================

    This adjusts download delays and concurrency based on the following rules:

    1. spiders always start with one concurrent request and a download delay of
       START_DELAY

    2. when a response is received, the download delay is adjusted to the
       average of previous download delay and the latency of the response.

    3. after CONCURRENCY_CHECK_PERIOD responses have passed, the average
       latency of this period is checked against the previous one and:

    3.1. if the latency remained constant (within standard deviation limits)
         and the concurrency is lower than MAX_CONCURRENCY, the concurrency is
         increased

    3.2. if the latency has increased (beyond standard deviation limits) and
         the concurrency is higher than 1, the concurrency is decreased

    """

    def __init__(self, crawler):
        settings = crawler.settings

@@ -66,12 +14,27 @@ class AutoThrottle(object):

        crawler.signals.connect(self.response_received, signal=signals.response_received)
        self.START_DELAY = settings.getfloat("AUTOTHROTTLE_START_DELAY", 5.0)
        self.CONCURRENCY_CHECK_PERIOD = settings.getint("AUTOTHROTTLE_CONCURRENCY_CHECK_PERIOD", 10)
        self.MAX_CONCURRENCY = settings.getint("AUTOTHROTTLE_MAX_CONCURRENCY", 8)
        self.MAX_CONCURRENCY = self._max_concurrency(settings)
        self.MIN_DOWNLOAD_DELAY = self._min_download_delay(settings)
        self.DEBUG = settings.getbool("AUTOTHROTTLE_DEBUG")
        self.MIN_DOWNLOAD_DELAY = settings.getint("AUTOTHROTTLE_MIN_DOWNLOAD_DELAY")
        self.last_latencies = [self.START_DELAY]
        self.last_lat = self.START_DELAY, 0.0

    def _min_download_delay(self, settings):
        return max(settings.getint("AUTOTHROTTLE_MIN_DOWNLOAD_DELAY"),
                   settings.getint("DOWNLOAD_DELAY"))

    def _max_concurrency(self, settings):
        delay = self._min_download_delay(settings)
        if delay == 0:
            candidates = ["AUTOTHROTTLE_MAX_CONCURRENCY",
                          "CONCURRENT_REQUESTS_PER_DOMAIN",
                          "CONCURRENT_REQUESTS_PER_IP"]
            candidates = [settings.getint(x) for x in candidates]
            candidates = [x for x in candidates if x > 0]
            if candidates:
                return min(candidates)
        return 1
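    # Illustrative note (not part of the commit): with DOWNLOAD_DELAY = 0 and
    # CONCURRENT_REQUESTS_PER_DOMAIN = 8, the cap above resolves to 8; once a
    # non-zero minimum delay is configured, requests are spaced out serially,
    # so the cap falls back to 1.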

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

@@ -108,6 +108,7 @@ EXTENSIONS_BASE = {
    'scrapy.contrib.feedexport.FeedExporter': 0,
    'scrapy.contrib.logstats.LogStats': 0,
    'scrapy.contrib.spiderstate.SpiderState': 0,
    'scrapy.contrib.throttle.AutoThrottle': 0,
}

FEED_URI = None

@@ -10,6 +10,8 @@ DEPRECATED_SETTINGS = [
    ('STATS_ENABLED', 'no longer supported (change STATS_CLASS instead)'),
    ('SQLITE_DB', 'no longer supported'),
    ('SELECTORS_BACKEND', 'use SCRAPY_SELECTORS_BACKEND environment variable instead'),
    ('AUTOTHROTTLE_MIN_DOWNLOAD_DELAY', 'use DOWNLOAD_DELAY instead'),
    ('AUTOTHROTTLE_MAX_CONCURRENCY', 'use CONCURRENT_REQUESTS_PER_DOMAIN instead'),
]

def check_deprecated_settings(settings):
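
A function like this would typically walk the list and warn about any
deprecated setting the user still has set. The following is a hedged sketch of
plausible behaviour given the data structure shown above, not the file's
actual body:

    import warnings

    def check_deprecated_settings(settings):
        # Collect deprecated settings that the user has actually set.
        deprecated = [(name, msg) for name, msg in DEPRECATED_SETTINGS
                      if settings[name] is not None]
        for name, msg in deprecated:
            warnings.warn("Deprecated setting %s: %s" % (name, msg))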