mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-24 08:43:55 +00:00
make download_timeout configurable by request. closes #229
--HG-- extra : rebase_source : e57dfd4aeb98d48b04fc4d0c6469e9a85e4b33a8
This commit is contained in:
parent
d0081290f2
commit
12b04b068f
@ -181,6 +181,19 @@ DefaultHeadersMiddleware
|
||||
``default_request_headers`` attribute. Spider headers has precedence over
|
||||
global headers.
|
||||
|
||||
DownloadTimeoutMiddleware
|
||||
-------------------------
|
||||
|
||||
.. module:: scrapy.contrib.downloadermiddleware.downloadtimeout
|
||||
:synopsis: Download timeout middleware
|
||||
|
||||
.. class:: DownloadTimeoutMiddleware
|
||||
|
||||
This middleware sets download timeout for requests based on
|
||||
`download_timeout` spider attribute. It doesn't override timeout if
|
||||
`download_timeout` is already set in request meta. Otherwise,
|
||||
:setting:`DOWNLOAD_TIMEOUT` setting is used as default download timeout.
|
||||
|
||||
HttpAuthMiddleware
|
||||
------------------
|
||||
|
||||
|
@ -63,6 +63,7 @@ DOWNLOADER_MIDDLEWARES_BASE = {
|
||||
# Engine side
|
||||
'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100,
|
||||
'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300,
|
||||
'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350,
|
||||
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
|
||||
'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
|
||||
'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
|
||||
|
20
scrapy/contrib/downloadermiddleware/downloadtimeout.py
Normal file
20
scrapy/contrib/downloadermiddleware/downloadtimeout.py
Normal file
@ -0,0 +1,20 @@
|
||||
"""
|
||||
Download timeout middleware
|
||||
|
||||
See documentation in docs/topics/downloader-middleware.rst
|
||||
"""
|
||||
from scrapy.utils.python import WeakKeyCache
|
||||
|
||||
|
||||
class DownloadTimeoutMiddleware(object):
    """Downloader middleware applying a per-spider download timeout.

    Reads the spider's ``download_timeout`` attribute (looked up once per
    spider via a weak-key cache) and records it in
    ``request.meta['download_timeout']`` unless the request already
    carries its own timeout value.
    """

    def __init__(self):
        # WeakKeyCache avoids keeping spider objects alive just to
        # remember their timeout value.
        self._cache = WeakKeyCache(self._download_timeout)

    def _download_timeout(self, spider):
        """Return the spider-level timeout, or None when not declared."""
        return getattr(spider, "download_timeout", None)

    def process_request(self, request, spider):
        """Inject the spider timeout into request.meta when absent."""
        spider_timeout = self._cache[spider]
        if not spider_timeout:
            return
        request.meta.setdefault('download_timeout', spider_timeout)
|
@ -12,26 +12,19 @@ if ssl_supported:
|
||||
from twisted.internet.ssl import ClientContextFactory
|
||||
|
||||
HTTPClientFactory = load_object(settings['DOWNLOADER_HTTPCLIENTFACTORY'])
|
||||
DOWNLOAD_TIMEOUT = settings.getint('DOWNLOAD_TIMEOUT')
|
||||
|
||||
|
||||
class HttpDownloadHandler(object):
|
||||
|
||||
def __init__(self, httpclientfactory=HTTPClientFactory, \
|
||||
download_timeout=DOWNLOAD_TIMEOUT):
|
||||
def __init__(self, httpclientfactory=HTTPClientFactory):
|
||||
self.httpclientfactory = httpclientfactory
|
||||
self.download_timeout = download_timeout
|
||||
|
||||
def download_request(self, request, spider):
|
||||
"""Return a deferred for the HTTP download"""
|
||||
factory = self._create_factory(request, spider)
|
||||
factory = self.httpclientfactory(request)
|
||||
self._connect(factory)
|
||||
return factory.deferred
|
||||
|
||||
def _create_factory(self, request, spider):
|
||||
timeout = getattr(spider, "download_timeout", None) or self.download_timeout
|
||||
return self.httpclientfactory(request, timeout)
|
||||
|
||||
def _connect(self, factory):
|
||||
host, port = factory.host, factory.port
|
||||
if factory.scheme == 'https':
|
||||
|
@ -8,6 +8,10 @@ from twisted.internet import defer
|
||||
from scrapy.http import Headers
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.core.downloader.responsetypes import responsetypes
|
||||
from scrapy.conf import settings
|
||||
|
||||
|
||||
DOWNLOAD_TIMEOUT = settings.getint('DOWNLOAD_TIMEOUT')
|
||||
|
||||
|
||||
def _parsed_url_args(parsed):
|
||||
@ -85,13 +89,13 @@ class ScrapyHTTPClientFactory(HTTPClientFactory):
|
||||
followRedirect = False
|
||||
afterFoundGet = False
|
||||
|
||||
def __init__(self, request, timeout=0):
|
||||
def __init__(self, request, timeout=DOWNLOAD_TIMEOUT):
|
||||
self.url = urldefrag(request.url)[0]
|
||||
self.method = request.method
|
||||
self.body = request.body or None
|
||||
self.headers = Headers(request.headers)
|
||||
self.response_headers = None
|
||||
self.timeout = timeout
|
||||
self.timeout = request.meta.get('download_timeout') or timeout
|
||||
self.deferred = defer.Deferred().addCallback(self._build_response)
|
||||
|
||||
self._set_connection_attributes(request)
|
||||
|
@ -98,10 +98,8 @@ class HttpTestCase(unittest.TestCase):
|
||||
return d
|
||||
|
||||
def test_timeout_download_from_spider(self):
|
||||
spider = BaseSpider('foo')
|
||||
spider.download_timeout = 0.000001
|
||||
request = Request(self.getURL('wait'))
|
||||
d = self.download_request(request, spider)
|
||||
request = Request(self.getURL('wait'), meta=dict(download_timeout=0.000001))
|
||||
d = self.download_request(request, BaseSpider('foo'))
|
||||
return self.assertFailure(d, defer.TimeoutError)
|
||||
|
||||
def test_host_header_not_in_request_headers(self):
|
||||
|
33
scrapy/tests/test_downloadermiddleware_downloadtimeout.py
Normal file
33
scrapy/tests/test_downloadermiddleware_downloadtimeout.py
Normal file
@ -0,0 +1,33 @@
|
||||
import unittest
|
||||
|
||||
from scrapy.contrib.downloadermiddleware.downloadtimeout import DownloadTimeoutMiddleware
|
||||
from scrapy.spider import BaseSpider
|
||||
from scrapy.http import Request
|
||||
|
||||
|
||||
class DownloadTimeoutMiddlewareTest(unittest.TestCase):
    """Tests for DownloadTimeoutMiddleware's request.meta handling."""

    def setUp(self):
        # Fresh middleware, spider and request per test case.
        self.mw = DownloadTimeoutMiddleware()
        self.spider = BaseSpider('foo')
        self.req = Request('http://scrapytest.org/')

    def tearDown(self):
        del self.mw
        del self.spider
        del self.req

    def test_spider_has_no_download_timeout(self):
        # Without a spider attribute the middleware leaves meta untouched.
        assert self.mw.process_request(self.req, self.spider) is None
        assert 'download_timeout' not in self.req.meta

    def test_spider_has_download_timeout(self):
        # The spider attribute is copied into request.meta.
        self.spider.download_timeout = 2
        assert self.mw.process_request(self.req, self.spider) is None
        # assertEqual, not the deprecated assertEquals alias.
        self.assertEqual(self.req.meta.get('download_timeout'), 2)

    def test_request_has_download_timeout(self):
        # A timeout already set in request.meta takes precedence over
        # the spider-level attribute.
        self.spider.download_timeout = 2
        self.req.meta['download_timeout'] = 1
        assert self.mw.process_request(self.req, self.spider) is None
        self.assertEqual(self.req.meta.get('download_timeout'), 1)
|
Loading…
x
Reference in New Issue
Block a user