Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-25 15:44:33 +00:00)
implement download timeouts based on deferred cancellation
parent 495acba223
commit 7729f939e9
@@ -12,6 +12,7 @@ from twisted.web.http_headers import Headers as TxHeaders
 from twisted.web.http import PotentialDataLoss
 from twisted.web.iweb import IBodyProducer
 from twisted.internet.endpoints import TCP4ClientEndpoint
+from twisted.internet.error import TimeoutError
 
 from scrapy.http import Headers
 from scrapy.responsetypes import responsetypes
@@ -49,20 +50,27 @@ class ScrapyAgent(object):
         self._pool = pool
 
     def download_request(self, request):
+        timeout = request.meta.get('download_timeout') or self._connectTimeout
         url = urldefrag(request.url)[0]
         method = request.method
         headers = TxHeaders(request.headers)
         bodyproducer = _RequestBodyProducer(request.body) if request.body else None
-        agent = self._get_agent(request)
+        agent = self._get_agent(request, timeout)
         start_time = time()
         d = agent.request(method, url, headers, bodyproducer)
-        d.addBoth(self._download_latency, request, start_time)
-        d.addCallback(self._agentrequest_downloaded, request)
-        d.addErrback(self._agentrequest_failed, request)
+        d.addBoth(self._both_cb, request, start_time, url, timeout)
+        d.addCallback(self._downloaded, request)
+        self._timeout_cl = reactor.callLater(timeout, d.cancel)
         return d
 
-    def _get_agent(self, request):
-        timeout = request.meta.get('download_timeout') or self._connectTimeout
+    def _both_cb(self, result, request, start_time, url, timeout):
+        request.meta['download_latency'] = time() - start_time
+        if self._timeout_cl.active():
+            self._timeout_cl.cancel()
+            return result
+        raise TimeoutError("Getting %s took longer than %s seconds." % (url, timeout))
+
+    def _get_agent(self, request, timeout):
         bindaddress = request.meta.get('bindaddress') or self._bindAddress
         proxy = request.meta.get('proxy')
         if proxy:
@@ -74,11 +82,7 @@ class ScrapyAgent(object):
         return self._Agent(reactor, contextFactory=self._contextFactory,
             connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)
 
-    def _download_latency(self, any_, request, start_time):
-        request.meta['download_latency'] = time() - start_time
-        return any_
-
-    def _agentrequest_downloaded(self, txresponse, request):
+    def _downloaded(self, txresponse, request):
         if txresponse.length == 0:
             return self._build_response(('', None), txresponse, request)
         finished = defer.Deferred()
@@ -95,11 +99,6 @@ class ScrapyAgent(object):
         respcls = responsetypes.from_args(headers=headers, url=url)
         return respcls(url=url, status=status, headers=headers, body=body)
 
-    def _agentrequest_failed(self, failure, request):
-        # be clear it is an HTTP failure with new downloader
-        log.err(failure, 'HTTP11 failure: %s' % request)
-        return failure
-
 
 class _RequestBodyProducer(object):
     implements(IBodyProducer)
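The mechanism the handler changes above introduce can be summarized in isolation: arm reactor.callLater(timeout, d.cancel) alongside the request Deferred, then let a single addBoth callback decide whether the Deferred fired on its own (timer still pending, so disarm it) or was cancelled by the timer (timer already fired, so raise TimeoutError). A minimal, standalone sketch of that pattern follows; with_timeout is a hypothetical helper for illustration, not part of Scrapy:

    from twisted.internet import reactor
    from twisted.internet.error import TimeoutError

    def with_timeout(d, timeout, clock=reactor):
        # Arm a timer that cancels the Deferred if it has not fired in time.
        timeout_call = clock.callLater(timeout, d.cancel)

        def _both(result):
            if timeout_call.active():
                # The Deferred fired first: disarm the timer and pass the
                # result (or failure) through unchanged.
                timeout_call.cancel()
                return result
            # The timer fired first: d.cancel() turned the pending result
            # into a CancelledError failure, which we translate into a
            # TimeoutError, mirroring _both_cb above.
            raise TimeoutError("took longer than %s seconds" % timeout)

        d.addBoth(_both)
        return d

Usage would look like with_timeout(agent.request(method, url, headers, bodyproducer), timeout). Storing the IDelayedCall and checking active() is what lets one callback serve both the success and the timeout path.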
@@ -105,7 +105,7 @@ class HttpTestCase(unittest.TestCase):
         return d
 
     def test_timeout_download_from_spider(self):
-        request = Request(self.getURL('wait'), meta=dict(download_timeout=0.000001))
+        request = Request(self.getURL('wait'), meta=dict(download_timeout=0.1))
         d = self.download_request(request, BaseSpider('foo'))
         return self.assertFailure(d, defer.TimeoutError, error.TimeoutError)
 
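The test leans on trial's assertFailure, which returns a Deferred that succeeds only if the wrapped Deferred errbacks with one of the listed exception types; listing both defer.TimeoutError and twisted.internet.error.TimeoutError keeps the assertion agnostic about which downloader handler produced the timeout. A self-contained sketch of that usage (a hypothetical test, not from the Scrapy suite):

    from twisted.trial import unittest
    from twisted.internet import defer, error

    class TimeoutFailureExample(unittest.TestCase):
        def test_assert_failure_accepts_either_timeout(self):
            # Simulate a download Deferred failing with the cancellation-based
            # timeout introduced in this commit.
            d = defer.Deferred()
            d.errback(error.TimeoutError("simulated"))
            # assertFailure succeeds if d fails with any of the given types.
            return self.assertFailure(d, defer.TimeoutError, error.TimeoutError)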