
make download_timeout configurable by request. closes #229

--HG--
extra : rebase_source : e57dfd4aeb98d48b04fc4d0c6469e9a85e4b33a8
Daniel Grana 2010-09-07 13:01:40 -03:00
parent d0081290f2
commit 12b04b068f
7 changed files with 77 additions and 15 deletions

View File

@@ -181,6 +181,19 @@ DefaultHeadersMiddleware
 ``default_request_headers`` attribute. Spider headers have precedence over
 global headers.
 
+DownloadTimeoutMiddleware
+-------------------------
+
+.. module:: scrapy.contrib.downloadermiddleware.downloadtimeout
+   :synopsis: Download timeout middleware
+
+.. class:: DownloadTimeoutMiddleware
+
+   This middleware sets the download timeout for requests, based on the
+   `download_timeout` spider attribute. It doesn't override the timeout if
+   `download_timeout` is already set in request meta; otherwise, the
+   :setting:`DOWNLOAD_TIMEOUT` setting is used as the default download timeout.
+
 HttpAuthMiddleware
 ------------------
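For illustration, a minimal usage sketch of the two configuration points this middleware ties together. The spider name and URL are hypothetical; the `BaseSpider`/`Request` API is the one used elsewhere in this diff:

from scrapy.http import Request
from scrapy.spider import BaseSpider

class SlowSiteSpider(BaseSpider):
    name = 'slowsite'        # hypothetical spider name
    download_timeout = 60    # middleware copies this into each request's meta

    def start_requests(self):
        # An explicit per-request value is left untouched by the middleware,
        # so this one request times out after 10 seconds instead of 60.
        yield Request('http://www.example.com/slow',
                      meta={'download_timeout': 10})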

View File

@@ -63,6 +63,7 @@ DOWNLOADER_MIDDLEWARES_BASE = {
     # Engine side
     'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100,
     'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300,
+    'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350,
     'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
     'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
     'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
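The value 350 places the middleware between HttpAuthMiddleware (300) and UserAgentMiddleware (400), so it runs before the request reaches the downloader. A project that doesn't want it can switch it off in its own settings; a sketch assuming the usual Scrapy convention of mapping a middleware to None to disable it:

# settings.py (sketch; disabling-via-None is an assumption about this version)
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': None,
}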

View File

@@ -0,0 +1,20 @@
+"""
+Download timeout middleware
+
+See documentation in docs/topics/downloader-middleware.rst
+"""
+from scrapy.utils.python import WeakKeyCache
+
+
+class DownloadTimeoutMiddleware(object):
+
+    def __init__(self):
+        self._cache = WeakKeyCache(self._download_timeout)
+
+    def _download_timeout(self, spider):
+        return getattr(spider, "download_timeout", None)
+
+    def process_request(self, request, spider):
+        timeout = self._cache[spider]
+        if timeout:
+            request.meta.setdefault('download_timeout', timeout)
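The only non-obvious piece above is WeakKeyCache. A rough sketch of the behavior assumed here (memoize the spider's `download_timeout` per spider without keeping finished spiders alive); this is an illustration, not the actual scrapy.utils.python source:

import weakref

class WeakKeyCacheSketch(object):
    """Illustrative stand-in for scrapy.utils.python.WeakKeyCache (assumption)."""
    def __init__(self, default_factory):
        self.default_factory = default_factory
        self._weakdict = weakref.WeakKeyDictionary()

    def __getitem__(self, key):
        # Compute once per key; the weak reference lets the spider be
        # garbage-collected when the crawl finishes.
        if key not in self._weakdict:
            self._weakdict[key] = self.default_factory(key)
        return self._weakdict[key]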

View File

@@ -12,26 +12,19 @@ if ssl_supported:
     from twisted.internet.ssl import ClientContextFactory
 
 HTTPClientFactory = load_object(settings['DOWNLOADER_HTTPCLIENTFACTORY'])
-DOWNLOAD_TIMEOUT = settings.getint('DOWNLOAD_TIMEOUT')
 
 
 class HttpDownloadHandler(object):
 
-    def __init__(self, httpclientfactory=HTTPClientFactory, \
-            download_timeout=DOWNLOAD_TIMEOUT):
+    def __init__(self, httpclientfactory=HTTPClientFactory):
         self.httpclientfactory = httpclientfactory
-        self.download_timeout = download_timeout
 
     def download_request(self, request, spider):
         """Return a deferred for the HTTP download"""
-        factory = self._create_factory(request, spider)
+        factory = self.httpclientfactory(request)
         self._connect(factory)
         return factory.deferred
 
-    def _create_factory(self, request, spider):
-        timeout = getattr(spider, "download_timeout", None) or self.download_timeout
-        return self.httpclientfactory(request, timeout)
-
     def _connect(self, factory):
         host, port = factory.host, factory.port
         if factory.scheme == 'https':

View File

@@ -8,6 +8,10 @@ from twisted.internet import defer
 from scrapy.http import Headers
 from scrapy.utils.httpobj import urlparse_cached
 from scrapy.core.downloader.responsetypes import responsetypes
+from scrapy.conf import settings
+
+DOWNLOAD_TIMEOUT = settings.getint('DOWNLOAD_TIMEOUT')
+
 
 
 def _parsed_url_args(parsed):
@@ -85,13 +89,13 @@ class ScrapyHTTPClientFactory(HTTPClientFactory):
     followRedirect = False
     afterFoundGet = False
 
-    def __init__(self, request, timeout=0):
+    def __init__(self, request, timeout=DOWNLOAD_TIMEOUT):
         self.url = urldefrag(request.url)[0]
         self.method = request.method
         self.body = request.body or None
         self.headers = Headers(request.headers)
         self.response_headers = None
-        self.timeout = timeout
+        self.timeout = request.meta.get('download_timeout') or timeout
         self.deferred = defer.Deferred().addCallback(self._build_response)
         self._set_connection_attributes(request)
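Taken together with the middleware above, the timeout now resolves in a fixed order. A sketch of the effective lookup (the function name is illustrative; `settings` is the scrapy.conf singleton used in this diff):

def effective_timeout(request, spider, settings):
    # 1. explicit request.meta['download_timeout'] wins (set by the user
    #    or copied from the spider by DownloadTimeoutMiddleware)
    timeout = request.meta.get('download_timeout')
    if timeout:
        return timeout
    # 2. spider attribute, applied by the middleware via setdefault()
    timeout = getattr(spider, 'download_timeout', None)
    if timeout:
        return timeout
    # 3. project-wide default baked into the factory's keyword argument
    return settings.getint('DOWNLOAD_TIMEOUT')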

View File

@@ -98,10 +98,8 @@ class HttpTestCase(unittest.TestCase):
         return d
 
     def test_timeout_download_from_spider(self):
-        spider = BaseSpider('foo')
-        spider.download_timeout = 0.000001
-        request = Request(self.getURL('wait'))
-        d = self.download_request(request, spider)
+        request = Request(self.getURL('wait'), meta=dict(download_timeout=0.000001))
+        d = self.download_request(request, BaseSpider('foo'))
         return self.assertFailure(d, defer.TimeoutError)
 
     def test_host_header_not_in_request_headers(self):

View File

@@ -0,0 +1,33 @@
+import unittest
+
+from scrapy.contrib.downloadermiddleware.downloadtimeout import DownloadTimeoutMiddleware
+from scrapy.spider import BaseSpider
+from scrapy.http import Request
+
+
+class DownloadTimeoutMiddlewareTest(unittest.TestCase):
+
+    def setUp(self):
+        self.mw = DownloadTimeoutMiddleware()
+        self.spider = BaseSpider('foo')
+        self.req = Request('http://scrapytest.org/')
+
+    def tearDown(self):
+        del self.mw
+        del self.spider
+        del self.req
+
+    def test_spider_has_no_download_timeout(self):
+        assert self.mw.process_request(self.req, self.spider) is None
+        assert 'download_timeout' not in self.req.meta
+
+    def test_spider_has_download_timeout(self):
+        self.spider.download_timeout = 2
+        assert self.mw.process_request(self.req, self.spider) is None
+        self.assertEquals(self.req.meta.get('download_timeout'), 2)
+
+    def test_request_has_download_timeout(self):
+        self.spider.download_timeout = 2
+        self.req.meta['download_timeout'] = 1
+        assert self.mw.process_request(self.req, self.spider) is None
+        self.assertEquals(self.req.meta.get('download_timeout'), 1)