
stop raising HttpException at download handlers and adapt download middlewares

This commit is contained in:
Daniel Grana 2009-05-27 16:51:36 -03:00
parent c8827552b6
commit 7eaa3ed24d
13 changed files with 125 additions and 127 deletions
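In short, download handlers no longer raise HttpException for non-2XX responses; downloader middlewares now inspect response.status in process_response instead of unwrapping exception.response in process_exception. A minimal sketch of the new pattern (the middleware itself is illustrative and not part of this commit):

from scrapy import log

class StatusLoggingMiddleware(object):
    """Hypothetical downloader middleware written against the new API:
    non-2XX responses arrive as plain Response objects and are handled in
    process_response rather than in process_exception."""

    def process_response(self, request, response, spider):
        if not 200 <= response.status < 300:
            log.msg("Got %s for %s" % (response.status, request.url),
                    domain=spider.domain_name, level=log.DEBUG)
        return response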

View File

@ -655,6 +655,27 @@ Default: ``Not Defined``
The name of the current project. It matches the project module name as created
by the ``startproject`` command, and is only defined by the project settings file.
.. setting:: REDIRECT_MAX_TIMES
REDIRECT_MAX_TIMES
------------------
Default: ``20``
Defines the maximum number of times a request can be redirected. After this
maximum the request's response is returned as is. The default value is the one
used by Firefox for the same task.
.. setting:: REDIRECT_MAX_METAREFRESH_DELAY
REDIRECT_MAX_METAREFRESH_DELAY
------------------------------
Default: ``100``
Some sites use meta refresh for redirecting to a session-expired page, so
automatic redirection is restricted to a maximum delay (in seconds).
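Both settings can be overridden from the project settings module like any other
Scrapy setting; a minimal sketch (the overridden values are illustrative)::

    # hypothetical overrides in a project's settings.py
    REDIRECT_MAX_TIMES = 10              # give up after 10 redirects
    REDIRECT_MAX_METAREFRESH_DELAY = 30  # only follow meta refreshes under 30s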
.. setting:: REQUEST_HEADER_ACCEPT
REQUEST_HEADER_ACCEPT

View File

@ -136,7 +136,8 @@ NEWSPIDER_MODULE = ''
PRIORITIZER = 'scrapy.core.prioritizers.RandomPrioritizer'
REDIRECTMIDDLEWARE_MAX_TIMES = 20 # uses Firefox default setting
REDIRECT_MAX_METAREFRESH_DELAY = 100
REDIRECT_MAX_TIMES = 20 # uses Firefox default setting
REQUEST_HEADER_ACCEPT = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
REQUEST_HEADER_ACCEPT_LANGUAGE = 'en'
@ -146,7 +147,7 @@ REQUESTS_PER_DOMAIN = 8 # max simultaneous requests per domain
# contrib.middleware.retry.RetryMiddleware default settings
RETRY_TIMES = 2 # initial response + 2 retries = 3 requests
RETRY_HTTP_CODES = ['500', '503', '504', '400', '408', '200']
RETRY_HTTP_CODES = ['500', '503', '504', '400', '408']
ROBOTSTXT_OBEY = False

View File

@ -6,7 +6,6 @@ from pydispatch import dispatcher
from scrapy.core import signals
from scrapy.http import Response
from scrapy.http.cookies import CookieJar
from scrapy.core.exceptions import HttpException
from scrapy.conf import settings
from scrapy import log
@ -44,11 +43,6 @@ class CookiesMiddleware(object):
return response
# cookies should be set on non-200 responses too
def process_exception(self, request, exception, spider):
if isinstance(exception, HttpException):
self.process_response(request, exception.response, spider)
def domain_closed(self, domain):
self.jars.pop(domain, None)

View File

@ -1,13 +1,17 @@
from scrapy.core.exceptions import HttpException
from scrapy.utils.response import response_status_message
class ErrorPagesMiddleware(object):
"""This middleware allows the spiders to receive error (non-200) responses,
the same way they receive normal responses"""
"""This middleware filters out responses whose status code is not 2XX and not
listed in the spider's handle_httpstatus_list attribute.
def process_exception(self, request, exception, spider):
if isinstance(exception, HttpException):
statuses = getattr(spider, 'handle_httpstatus_list', None)
httpstatus = exception.response.status
if statuses and httpstatus in statuses:
return exception.response
TODO: move this mw to spidermiddleware and remove me
"""
def process_response(self, request, response, spider):
status = response.status
if 200 <= status < 300 or status in getattr(spider, 'handle_httpstatus_list', []):
return response
else:
raise HttpException(status, None, response)
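For illustration, the handle_httpstatus_list attribute checked above is declared on the spider; a minimal sketch (the spider and status codes are illustrative, not from this commit):

class ErrorTolerantSpider(object):
    # Hypothetical spider-like object: the middleware only reads this
    # attribute, via getattr(spider, 'handle_httpstatus_list', []).
    domain_name = 'example.com'
    # let 404 and 500 responses reach the spider callbacks instead of being
    # filtered out as error pages
    handle_httpstatus_list = [404, 500]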

View File

@ -10,12 +10,13 @@ from pydispatch import dispatcher
from scrapy.core import signals
from scrapy import log
from scrapy.http import Response, Headers
from scrapy.core.exceptions import NotConfigured, HttpException, IgnoreRequest
from scrapy.core.exceptions import NotConfigured, IgnoreRequest
from scrapy.core.downloader.responsetypes import responsetypes
from scrapy.utils.request import request_fingerprint
from scrapy.utils.http import headers_dict_to_raw, headers_raw_to_dict
from scrapy.conf import settings
class HttpCacheMiddleware(object):
def __init__(self):
if not settings['HTTPCACHE_DIR']:
@ -33,37 +34,25 @@ class HttpCacheMiddleware(object):
key = request_fingerprint(request)
domain = spider.domain_name
try:
response = self.cache.retrieve_response(domain, key)
except:
log.msg("Corrupt cache for %s" % request.url, log.WARNING)
response = False
if response:
if not 200 <= int(response.status) < 300:
raise HttpException(response.status, None, response)
return response
elif self.ignore_missing:
raise IgnoreRequest("Ignored request not in cache: %s" % request)
def process_response(self, request, response, spider):
if not is_cacheable(request):
return response
if isinstance(response, Response) and not response.meta.get('cached'):
if is_cacheable(request):
key = request_fingerprint(request)
domain = spider.domain_name
self.cache.store(domain, key, request, response)
self.cache.store(spider.domain_name, key, request, response)
return response
def process_exception(self, request, exception, spider):
if not is_cacheable(request):
return
if isinstance(exception, HttpException) and isinstance(exception.response, Response):
key = request_fingerprint(request)
domain = spider.domain_name
self.cache.store(domain, key, request, exception.response)
def is_cacheable(request):
return request.url.scheme in ['http', 'https']
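As a reminder, the middleware above stays disabled unless a cache directory is configured; a hedged sketch of enabling it from project settings (the path is illustrative):

# hypothetical project settings: a non-empty HTTPCACHE_DIR turns the cache
# middleware on (an empty value leaves it unconfigured, per the __init__ check)
HTTPCACHE_DIR = '/tmp/scrapy-httpcache'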

View File

@ -1,41 +1,33 @@
from scrapy import log
from scrapy.core.exceptions import HttpException
from scrapy.utils.url import urljoin_rfc as urljoin
from scrapy.utils.response import get_meta_refresh
from scrapy.conf import settings
# some sites use meta-refresh for redirecting to a session expired page, so we
# restrict automatic redirection to a maximum delay (in number of seconds)
META_REFRESH_MAXSEC = 100
MAX_REDIRECT_LOOP = 10
class RedirectMiddleware(object):
"""Handle redirection of requests based on response status and meta-refresh html tag"""
def __init__(self):
self.max_redirect_times = settings.getint('REDIRECTMIDDLEWARE_MAX_TIMES')
def process_exception(self, request, exception, spider):
if not isinstance(exception, HttpException):
return
self.max_metarefresh_delay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY')
self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
def process_response(self, request, response, spider):
domain = spider.domain_name
status = exception.status
response = exception.response
if status in [302, 303] and 'Location' in response.headers:
if response.status in [302, 303] and 'Location' in response.headers:
redirected_url = urljoin(request.url, response.headers['location'])
redirected = request.replace(url=redirected_url, method='GET', body='')
redirected.headers.pop('Content-Type', None)
redirected.headers.pop('Content-Length', None)
return self._redirect(redirected, request, spider, status)
return self._redirect(redirected, request, spider, response.status)
if status in [301, 307] and 'Location' in response.headers:
if response.status in [301, 307] and 'Location' in response.headers:
redirected_url = urljoin(request.url, response.headers['location'])
redirected = request.replace(url=redirected_url)
return self._redirect(redirected, request, spider, status)
return self._redirect(redirected, request, spider, response.status)
def process_response(self, request, response, spider):
interval, url = get_meta_refresh(response)
if url and int(interval) < META_REFRESH_MAXSEC:
if url and int(interval) < self.max_metarefresh_delay:
redirected = request.replace(url=urljoin(request.url, url))
return self._redirect(redirected, request, spider, 'meta refresh')

View File

@ -19,46 +19,50 @@ About HTTP errors to consider:
- You may want to remove 400 from RETRY_HTTP_CODES, if you stick to the HTTP
protocol. It's included by default because it's a common code used to
indicate server overload, which would be something we want to retry
- 200 is included by default (and shouldn't be removed) to check for partial
download errors, which mean the TCP connection has broken in the middle of
an HTTP download
"""
from twisted.internet.error import TimeoutError as ServerTimeoutError, DNSLookupError, \
ConnectionRefusedError, ConnectionDone, ConnectError, \
ConnectionLost
from twisted.internet.defer import TimeoutError as UserTimeoutError
from twisted.web.client import PartialDownloadError
from scrapy import log
from scrapy.core.exceptions import HttpException
from scrapy.utils.request import request_fingerprint
from scrapy.utils.response import response_status_message
from scrapy.conf import settings
class RetryMiddleware(object):
EXCEPTIONS_TO_RETRY = (ServerTimeoutError, UserTimeoutError, DNSLookupError,
ConnectionRefusedError, ConnectionDone, ConnectError,
ConnectionLost)
ConnectionLost, PartialDownloadError)
def __init__(self):
self.max_retry_times = settings.getint('RETRY_TIMES')
self.retry_http_codes = map(int, settings.getlist('RETRY_HTTP_CODES'))
def process_response(self, request, response, spider):
if response.status in self.retry_http_codes:
reason = response_status_message(response.status)
return self._retry(request, reason, spider) or response
return response
def process_exception(self, request, exception, spider):
if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
or (isinstance(exception, HttpException) \
and (int(exception.status) in self.retry_http_codes)):
if isinstance(exception, self.EXCEPTIONS_TO_RETRY):
return self._retry(request, exception, spider)
retries = request.meta.get('retry_times', 0) + 1
def _retry(self, request, reason, spider):
retries = request.meta.get('retry_times', 0) + 1
if retries <= self.max_retry_times:
log.msg("Retrying %s (failed %d times): %s" % (request, retries, exception),
domain=spider.domain_name, level=log.DEBUG)
retryreq = request.copy()
retryreq.meta['retry_times'] = retries
retryreq.dont_filter = True
return retryreq
else:
log.msg("Discarding %s (failed %d times): %s" % (request, retries, exception),
domain=spider.domain_name, level=log.DEBUG)
if retries <= self.max_retry_times:
log.msg("Retrying %s (failed %d times): %s" % (request, retries, reason),
domain=spider.domain_name, level=log.DEBUG)
retryreq = request.copy()
retryreq.meta['retry_times'] = retries
retryreq.dont_filter = True
return retryreq
else:
log.msg("Discarding %s (failed %d times): %s" % (request, retries, reason),
domain=spider.domain_name, level=log.DEBUG)
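For reference, the retry limits read in __init__ come from settings; a minimal sketch of overriding them in a project (the values are illustrative):

# hypothetical project settings overrides; note '200' is no longer listed,
# since broken downloads are now retried through PartialDownloadError
RETRY_TIMES = 4          # initial response + 4 retries = 5 requests
RETRY_HTTP_CODES = ['500', '502', '503', '504', '400', '408']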

View File

@ -1,4 +1,4 @@
from scrapy.core.exceptions import HttpException, NotConfigured
from scrapy.core.exceptions import NotConfigured
from scrapy.stats import stats
from scrapy.conf import settings
@ -18,7 +18,7 @@ class DownloaderStats(object):
def __init__(self):
if not settings.getbool('DOWNLOADER_STATS'):
raise NotConfigured
def process_request(self, request, spider):
stats.incpath('_global/downloader/request_count')
stats.incpath('%s/downloader/request_count' % spider.domain_name)
@ -30,14 +30,12 @@ class DownloaderStats(object):
def process_response(self, request, response, spider):
self._inc_response_count(response, spider.domain_name)
return response
def process_exception(self, request, exception, spider):
ex_class = "%s.%s" % (exception.__class__.__module__, exception.__class__.__name__)
stats.incpath('_global/downloader/exception_count')
stats.incpath('%s/downloader/exception_count' % spider.domain_name)
stats.incpath('%s/downloader/exception_type_count/%s' % (spider.domain_name, ex_class))
if isinstance(exception, HttpException):
self._inc_response_count(exception.response, spider.domain_name)
def _inc_response_count(self, response, domain):
stats.incpath('_global/downloader/response_count')

View File

@ -16,7 +16,7 @@ except ImportError:
from scrapy import optional_features
from scrapy.core import signals
from scrapy.http import Headers
from scrapy.core.exceptions import HttpException, NotSupported
from scrapy.core.exceptions import NotSupported
from scrapy.utils.defer import defer_succeed
from scrapy.conf import settings
@ -65,13 +65,7 @@ def create_factory(request, spider):
signals.send_catch_log(signal=signals.response_downloaded, sender='download_http', response=r, spider=spider)
return r
def _on_success(body):
response = _create_response(body)
if response.status not in (200, 201, 202):
raise HttpException(response.status, None, response)
return response
factory.deferred.addCallbacks(_on_success)
factory.deferred.addCallbacks(_create_response)
return factory
def download_http(request, spider):

View File

@ -3,7 +3,6 @@ from __future__ import with_statement
from unittest import TestCase
from scrapy.spider import spiders
from scrapy.core.exceptions import HttpException
from scrapy.http import Response, Request
from scrapy.contrib.downloadermiddleware.cookies import CookiesMiddleware
@ -35,21 +34,6 @@ class CookiesMiddlewareTest(TestCase):
assert self.mw.process_request(req2, self.spider) is None
self.assertEquals(req2.headers.get('Cookie'), "C1=value1")
def test_http_exception(self):
req = Request('http://scrapytest.org/')
assert self.mw.process_request(req, self.spider) is None
assert 'Cookie' not in req.headers
headers = {'Set-Cookie': 'C1=value1; path=/'}
res = Response('http://scrapytest.org/', headers=headers)
exc = HttpException(302, 'Redirect', res)
assert self.mw.process_exception(req, exc, self.spider) is None
#assert exc.response.cookies
req2 = Request('http://scrapytest.org/sub1/')
assert self.mw.process_request(req2, self.spider) is None
self.assertEquals(req2.headers.get('Cookie'), "C1=value1")
def test_dont_merge_cookies(self):
# merge some cookies into jar
headers = {'Set-Cookie': 'C1=value1; path=/'}

View File

@ -1,7 +1,6 @@
import unittest
from scrapy.contrib.downloadermiddleware.redirect import RedirectMiddleware
from scrapy.core.exceptions import HttpException
from scrapy.dupefilter import dupefilter
from scrapy.spider import spiders
from scrapy.http import Request, Response, Headers
@ -22,28 +21,24 @@ class RedirectMiddlewareTest(unittest.TestCase):
url = 'http://www.example.com/301'
url2 = 'http://www.example.com/redirected'
req = Request(url)
hdr = Headers({'Location': [url2]})
rsp = Response(url, headers=hdr)
exc = HttpException('301', None, rsp)
rsp = Response(url, headers={'Location': url2}, status=301)
req2 = self.mw.process_exception(req, exc, self.spider)
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, url2)
# a response without a Location header but with a 3XX status code should be ignored
del rsp.headers['Location']
assert self.mw.process_exception(req, exc, self.spider) is None
assert self.mw.process_response(req, rsp, self.spider) is rsp
def test_redirect_302(self):
url = 'http://www.example.com/302'
url2 = 'http://www.example.com/redirected2'
req = Request(url, method='POST', body='test',
headers={'Content-Type': 'text/plain', 'Content-length': '4'})
hdr = Headers({'Location': [url2]})
rsp = Response(url, headers=hdr)
exc = HttpException('302', None, rsp)
rsp = Response(url, headers={'Location': url2}, status=302)
req2 = self.mw.process_exception(req, exc, self.spider)
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, url2)
self.assertEqual(req2.method, 'GET')
@ -56,7 +51,7 @@ class RedirectMiddlewareTest(unittest.TestCase):
# a response without a Location header but with a 3XX status code should be ignored
del rsp.headers['Location']
assert self.mw.process_exception(req, exc, self.spider) is None
assert self.mw.process_response(req, rsp, self.spider) is rsp
def test_meta_refresh(self):
body = """<html>
@ -82,24 +77,24 @@ class RedirectMiddlewareTest(unittest.TestCase):
def test_max_redirect_times(self):
self.mw.max_redirect_times = 1
req = Request('http://scrapytest.org/302')
exc = HttpException('302', None, Response('http://www.scrapytest.org/302', headers={'Location': '/redirected'}))
rsp = Response('http://scrapytest.org/302', headers={'Location': '/redirected'}, status=302)
req = self.mw.process_exception(req, exc, self.spider)
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
assert 'redirect_times' in req.meta
self.assertEqual(req.meta['redirect_times'], 1)
req = self.mw.process_exception(req, exc, self.spider)
req = self.mw.process_response(req, rsp, self.spider)
self.assertEqual(req, None)
def test_ttl(self):
self.mw.max_redirect_times = 100
req = Request('http://scrapytest.org/302', meta={'redirect_ttl': 1})
exc = HttpException('302', None, Response('http://www.scrapytest.org/302', headers={'Location': '/redirected'}))
rsp = Response('http://www.scrapytest.org/302', headers={'Location': '/redirected'}, status=302)
req = self.mw.process_exception(req, exc, self.spider)
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
req = self.mw.process_exception(req, exc, self.spider)
req = self.mw.process_response(req, rsp, self.spider)
self.assertEqual(req, None)
if __name__ == "__main__":

View File

@ -5,7 +5,6 @@ from twisted.internet.error import TimeoutError as ServerTimeoutError, DNSLookup
ConnectionLost
from scrapy.contrib.downloadermiddleware.retry import RetryMiddleware
from scrapy.core.exceptions import HttpException
from scrapy.spider import spiders
from scrapy.http import Request, Response
@ -17,20 +16,31 @@ class RetryTest(unittest.TestCase):
self.mw = RetryMiddleware()
self.mw.max_retry_times = 2
def test_process_exception_404(self):
req404 = Request('http://www.scrapytest.org/404')
exc404 = HttpException('404', None, Response('http://www.scrapytest.org/404', body=''))
def test_404(self):
req = Request('http://www.scrapytest.org/404')
rsp = Response('http://www.scrapytest.org/404', body='', status=404)
# dont retry 404s
req = self.mw.process_exception(req404, exc404, self.spider)
self.assertTrue(req is None)
assert self.mw.process_response(req, rsp, self.spider) is rsp
def test_process_exception_503(self):
req503 = Request('http://www.scrapytest.org/503')
exc503 = HttpException('503', None, Response('http://www.scrapytest.org/503', body=''))
self._test_retry_exception(req503, exc503)
def test_503(self):
req = Request('http://www.scrapytest.org/503')
rsp = Response('http://www.scrapytest.org/503', body='', status=503)
def test_process_exception_twistederrors(self):
# first retry
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
self.assertEqual(req.meta['retry_times'], 1)
# second retry
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
self.assertEqual(req.meta['retry_times'], 2)
# discard it
assert self.mw.process_response(req, rsp, self.spider) is rsp
def test_twistederrors(self):
for exc in (ServerTimeoutError, DNSLookupError, ConnectionRefusedError, ConnectionDone, ConnectError, ConnectionLost):
req = Request('http://www.scrapytest.org/%s' % exc.__name__)
self._test_retry_exception(req, exc())

View File

@ -4,6 +4,7 @@ scrapy.http.Response objects
"""
import re
from twisted.web import http
from scrapy.http.response import Response
def body_or_str(obj, unicode=True):
@ -36,3 +37,14 @@ def get_meta_refresh(response):
match = META_REFRESH_RE.search(response.body[0:4096])
response.cache['meta_refresh_url'] = match.groups() if match else (None, None)
return response.cache['meta_refresh_url']
def response_status_message(status):
"""Return the status code plus its descriptive status text message
>>> response_status_message(200)
'200 OK'
>>> response_status_message(404)
'404 Not Found'
"""
return '%s %s' % (status, http.responses.get(int(status)))