From 12b04b068f1513da935d14ed58ecc1cf5eb3d263 Mon Sep 17 00:00:00 2001 From: Daniel Grana Date: Tue, 7 Sep 2010 13:01:40 -0300 Subject: [PATCH] make download_timeout configurable by request. closes #229 --HG-- extra : rebase_source : e57dfd4aeb98d48b04fc4d0c6469e9a85e4b33a8 --- docs/topics/downloader-middleware.rst | 13 ++++++++ scrapy/conf/default_settings.py | 1 + .../downloadermiddleware/downloadtimeout.py | 20 +++++++++++ scrapy/core/downloader/handlers/http.py | 11 ++----- scrapy/core/downloader/webclient.py | 8 +++-- scrapy/tests/test_downloader_handlers.py | 6 ++-- ...st_downloadermiddleware_downloadtimeout.py | 33 +++++++++++++++++++ 7 files changed, 77 insertions(+), 15 deletions(-) create mode 100644 scrapy/contrib/downloadermiddleware/downloadtimeout.py create mode 100644 scrapy/tests/test_downloadermiddleware_downloadtimeout.py diff --git a/docs/topics/downloader-middleware.rst b/docs/topics/downloader-middleware.rst index 4377117c0..86eda9450 100644 --- a/docs/topics/downloader-middleware.rst +++ b/docs/topics/downloader-middleware.rst @@ -181,6 +181,19 @@ DefaultHeadersMiddleware ``default_request_headers`` attribute. Spider headers has precedence over global headers. +DownloadTimeoutMiddleware +------------------------- + +.. module:: scrapy.contrib.downloadermiddleware.downloadtimeout + :synopsis: Download timeout middleware + +.. class:: DownloadTimeoutMiddleware + + This middleware sets download timeout for requests based on + `download_timeout` spider attribute. It doesn't override timeout if + `download_timeout` is already set in request meta. Otherwise, + :setting:`DOWNLOAD_TIMEOUT` setting is used as default download timeout. + HttpAuthMiddleware ------------------ diff --git a/scrapy/conf/default_settings.py b/scrapy/conf/default_settings.py index 985995414..b86f4a2ae 100644 --- a/scrapy/conf/default_settings.py +++ b/scrapy/conf/default_settings.py @@ -63,6 +63,7 @@ DOWNLOADER_MIDDLEWARES_BASE = { # Engine side 'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100, 'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300, + 'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350, 'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400, 'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500, 'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550, diff --git a/scrapy/contrib/downloadermiddleware/downloadtimeout.py b/scrapy/contrib/downloadermiddleware/downloadtimeout.py new file mode 100644 index 000000000..0c250d4c4 --- /dev/null +++ b/scrapy/contrib/downloadermiddleware/downloadtimeout.py @@ -0,0 +1,20 @@ +""" +Download timeout middleware + +See documentation in docs/topics/downloader-middleware.rst +""" +from scrapy.utils.python import WeakKeyCache + + +class DownloadTimeoutMiddleware(object): + + def __init__(self): + self._cache = WeakKeyCache(self._download_timeout) + + def _download_timeout(self, spider): + return getattr(spider, "download_timeout", None) + + def process_request(self, request, spider): + timeout = self._cache[spider] + if timeout: + request.meta.setdefault('download_timeout', timeout) diff --git a/scrapy/core/downloader/handlers/http.py b/scrapy/core/downloader/handlers/http.py index 6025a1011..61391c626 100644 --- a/scrapy/core/downloader/handlers/http.py +++ b/scrapy/core/downloader/handlers/http.py @@ -12,26 +12,19 @@ if ssl_supported: from twisted.internet.ssl import ClientContextFactory HTTPClientFactory = load_object(settings['DOWNLOADER_HTTPCLIENTFACTORY']) -DOWNLOAD_TIMEOUT = settings.getint('DOWNLOAD_TIMEOUT') class HttpDownloadHandler(object): - def __init__(self, httpclientfactory=HTTPClientFactory, \ - download_timeout=DOWNLOAD_TIMEOUT): + def __init__(self, httpclientfactory=HTTPClientFactory): self.httpclientfactory = httpclientfactory - self.download_timeout = download_timeout def download_request(self, request, spider): """Return a deferred for the HTTP download""" - factory = self._create_factory(request, spider) + factory = self.httpclientfactory(request) self._connect(factory) return factory.deferred - def _create_factory(self, request, spider): - timeout = getattr(spider, "download_timeout", None) or self.download_timeout - return self.httpclientfactory(request, timeout) - def _connect(self, factory): host, port = factory.host, factory.port if factory.scheme == 'https': diff --git a/scrapy/core/downloader/webclient.py b/scrapy/core/downloader/webclient.py index dc93c2ab1..0aeb861e0 100644 --- a/scrapy/core/downloader/webclient.py +++ b/scrapy/core/downloader/webclient.py @@ -8,6 +8,10 @@ from twisted.internet import defer from scrapy.http import Headers from scrapy.utils.httpobj import urlparse_cached from scrapy.core.downloader.responsetypes import responsetypes +from scrapy.conf import settings + + +DOWNLOAD_TIMEOUT = settings.getint('DOWNLOAD_TIMEOUT') def _parsed_url_args(parsed): @@ -85,13 +89,13 @@ class ScrapyHTTPClientFactory(HTTPClientFactory): followRedirect = False afterFoundGet = False - def __init__(self, request, timeout=0): + def __init__(self, request, timeout=DOWNLOAD_TIMEOUT): self.url = urldefrag(request.url)[0] self.method = request.method self.body = request.body or None self.headers = Headers(request.headers) self.response_headers = None - self.timeout = timeout + self.timeout = request.meta.get('download_timeout') or timeout self.deferred = defer.Deferred().addCallback(self._build_response) self._set_connection_attributes(request) diff --git a/scrapy/tests/test_downloader_handlers.py b/scrapy/tests/test_downloader_handlers.py index 27111bd6d..446947255 100644 --- a/scrapy/tests/test_downloader_handlers.py +++ b/scrapy/tests/test_downloader_handlers.py @@ -98,10 +98,8 @@ class HttpTestCase(unittest.TestCase): return d def test_timeout_download_from_spider(self): - spider = BaseSpider('foo') - spider.download_timeout = 0.000001 - request = Request(self.getURL('wait')) - d = self.download_request(request, spider) + request = Request(self.getURL('wait'), meta=dict(download_timeout=0.000001)) + d = self.download_request(request, BaseSpider('foo')) return self.assertFailure(d, defer.TimeoutError) def test_host_header_not_in_request_headers(self): diff --git a/scrapy/tests/test_downloadermiddleware_downloadtimeout.py b/scrapy/tests/test_downloadermiddleware_downloadtimeout.py new file mode 100644 index 000000000..fd60bee9e --- /dev/null +++ b/scrapy/tests/test_downloadermiddleware_downloadtimeout.py @@ -0,0 +1,33 @@ +import unittest + +from scrapy.contrib.downloadermiddleware.downloadtimeout import DownloadTimeoutMiddleware +from scrapy.spider import BaseSpider +from scrapy.http import Request + + +class DownloadTimeoutMiddlewareTest(unittest.TestCase): + + def setUp(self): + self.mw = DownloadTimeoutMiddleware() + self.spider = BaseSpider('foo') + self.req = Request('http://scrapytest.org/') + + def tearDown(self): + del self.mw + del self.spider + del self.req + + def test_spider_has_no_download_timeout(self): + assert self.mw.process_request(self.req, self.spider) is None + assert 'download_timeout' not in self.req.meta + + def test_spider_has_download_timeout(self): + self.spider.download_timeout = 2 + assert self.mw.process_request(self.req, self.spider) is None + self.assertEquals(self.req.meta.get('download_timeout'), 2) + + def test_request_has_download_timeout(self): + self.spider.download_timeout = 2 + self.req.meta['download_timeout'] = 1 + assert self.mw.process_request(self.req, self.spider) is None + self.assertEquals(self.req.meta.get('download_timeout'), 1)