
stop raising HttpException at download handlers and adapt download middlewares

This commit is contained in:
Daniel Grana 2009-05-27 16:51:36 -03:00
parent c8827552b6
commit 7eaa3ed24d
13 changed files with 125 additions and 127 deletions
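In short, download handlers no longer raise HttpException for non-2XX responses; downloader middlewares now inspect response.status in process_response instead of unwrapping exception.response in process_exception. A minimal sketch of the new pattern (the middleware itself is illustrative and not part of this commit):

from scrapy import log

class StatusLoggingMiddleware(object):
    """Hypothetical downloader middleware written against the new API:
    non-2XX responses arrive as plain Response objects and are handled in
    process_response rather than in process_exception."""

    def process_response(self, request, response, spider):
        if not 200 <= response.status < 300:
            log.msg("Got %s for %s" % (response.status, request.url),
                    domain=spider.domain_name, level=log.DEBUG)
        return response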

View File

@ -655,6 +655,27 @@ Default: ``Not Defined``
The name of the current project. It matches the project module name as created
by the ``startproject`` command, and is only defined by the project settings file.
.. setting:: REDIRECT_MAX_TIMES
REDIRECT_MAX_TIMES
------------------
Default: ``20``
Defines the maximum number of times a request can be redirected. After this
maximum the request's response is returned as is. The default value is the one
used by Firefox for the same task.
.. setting:: REDIRECT_MAX_METAREFRESH_DELAY
REDIRECT_MAX_METAREFRESH_DELAY
------------------------------
Default: ``100``
Some sites use meta refresh for redirecting to a session-expired page, so
automatic redirection is restricted to a maximum delay (in seconds).
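Both settings can be overridden from the project settings module like any other
Scrapy setting; a minimal sketch (the overridden values are illustrative)::

    # hypothetical overrides in a project's settings.py
    REDIRECT_MAX_TIMES = 10              # give up after 10 redirects
    REDIRECT_MAX_METAREFRESH_DELAY = 30  # only follow meta refreshes under 30s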
.. setting:: REQUEST_HEADER_ACCEPT
REQUEST_HEADER_ACCEPT

View File

@ -136,7 +136,8 @@ NEWSPIDER_MODULE = ''
PRIORITIZER = 'scrapy.core.prioritizers.RandomPrioritizer'
REDIRECTMIDDLEWARE_MAX_TIMES = 20 # uses Firefox default setting
REDIRECT_MAX_METAREFRESH_DELAY = 100
REDIRECT_MAX_TIMES = 20 # uses Firefox default setting
REQUEST_HEADER_ACCEPT = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
REQUEST_HEADER_ACCEPT_LANGUAGE = 'en'
@ -146,7 +147,7 @@ REQUESTS_PER_DOMAIN = 8 # max simultaneous requests per domain
# contrib.middleware.retry.RetryMiddleware default settings
RETRY_TIMES = 2 # initial response + 2 retries = 3 requests
RETRY_HTTP_CODES = ['500', '503', '504', '400', '408', '200']
RETRY_HTTP_CODES = ['500', '503', '504', '400', '408']
ROBOTSTXT_OBEY = False

View File

@ -6,7 +6,6 @@ from pydispatch import dispatcher
from scrapy.core import signals
from scrapy.http import Response
from scrapy.http.cookies import CookieJar
from scrapy.core.exceptions import HttpException
from scrapy.conf import settings
from scrapy import log
@ -44,11 +43,6 @@ class CookiesMiddleware(object):
return response
# cookies should be set on non-200 responses too
def process_exception(self, request, exception, spider):
if isinstance(exception, HttpException):
self.process_response(request, exception.response, spider)
def domain_closed(self, domain):
self.jars.pop(domain, None)

View File

@ -1,13 +1,17 @@
from scrapy.core.exceptions import HttpException
from scrapy.utils.response import response_status_message
class ErrorPagesMiddleware(object):
"""This middleware allows the spiders to receive error (non-200) responses,
the same way they receive normal responses"""
"""This middleware filters out responses whose status code is not 2XX and not
listed in the spider's handle_httpstatus_list attribute.
def process_exception(self, request, exception, spider):
if isinstance(exception, HttpException):
statuses = getattr(spider, 'handle_httpstatus_list', None)
httpstatus = exception.response.status
if statuses and httpstatus in statuses:
return exception.response
TODO: move this mw to spidermiddleware and remove me
"""
def process_response(self, request, response, spider):
status = response.status
if 200 <= status < 300 or status in getattr(spider, 'handle_httpstatus_list', []):
return response
else:
raise HttpException(status, None, response)
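For illustration, the handle_httpstatus_list attribute checked above is declared on the spider; a minimal sketch (the spider and status codes are illustrative, not from this commit):

class ErrorTolerantSpider(object):
    # Hypothetical spider-like object: the middleware only reads this
    # attribute, via getattr(spider, 'handle_httpstatus_list', []).
    domain_name = 'example.com'
    # let 404 and 500 responses reach the spider callbacks instead of being
    # filtered out as error pages
    handle_httpstatus_list = [404, 500]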

View File

@ -10,12 +10,13 @@ from pydispatch import dispatcher
from scrapy.core import signals
from scrapy import log
from scrapy.http import Response, Headers
from scrapy.core.exceptions import NotConfigured, HttpException, IgnoreRequest
from scrapy.core.exceptions import NotConfigured, IgnoreRequest
from scrapy.core.downloader.responsetypes import responsetypes
from scrapy.utils.request import request_fingerprint
from scrapy.utils.http import headers_dict_to_raw, headers_raw_to_dict
from scrapy.conf import settings
class HttpCacheMiddleware(object):
def __init__(self):
if not settings['HTTPCACHE_DIR']:
@ -33,37 +34,25 @@ class HttpCacheMiddleware(object):
key = request_fingerprint(request)
domain = spider.domain_name
try:
response = self.cache.retrieve_response(domain, key)
except:
log.msg("Corrupt cache for %s" % request.url, log.WARNING)
response = False
if response:
if not 200 <= int(response.status) < 300:
raise HttpException(response.status, None, response)
return response
elif self.ignore_missing:
raise IgnoreRequest("Ignored request not in cache: %s" % request)
def process_response(self, request, response, spider):
if not is_cacheable(request):
return response
if isinstance(response, Response) and not response.meta.get('cached'):
if is_cacheable(request):
key = request_fingerprint(request)
domain = spider.domain_name
self.cache.store(domain, key, request, response)
self.cache.store(spider.domain_name, key, request, response)
return response
def process_exception(self, request, exception, spider):
if not is_cacheable(request):
return
if isinstance(exception, HttpException) and isinstance(exception.response, Response):
key = request_fingerprint(request)
domain = spider.domain_name
self.cache.store(domain, key, request, exception.response)
def is_cacheable(request):
return request.url.scheme in ['http', 'https']
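As a reminder, the middleware above stays disabled unless a cache directory is configured; a hedged sketch of enabling it from project settings (the path is illustrative):

# hypothetical project settings: a non-empty HTTPCACHE_DIR turns the cache
# middleware on (an empty value leaves it unconfigured, per the __init__ check)
HTTPCACHE_DIR = '/tmp/scrapy-httpcache'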

View File

@ -1,41 +1,33 @@
from scrapy import log
from scrapy.core.exceptions import HttpException
from scrapy.utils.url import urljoin_rfc as urljoin
from scrapy.utils.response import get_meta_refresh
from scrapy.conf import settings
# some sites use meta-refresh for redirecting to a session expired page, so we
# restrict automatic redirection to a maximum delay (in number of seconds)
META_REFRESH_MAXSEC = 100
MAX_REDIRECT_LOOP = 10
class RedirectMiddleware(object):
"""Handle redirection of requests based on response status and meta-refresh html tag"""
def __init__(self):
self.max_redirect_times = settings.getint('REDIRECTMIDDLEWARE_MAX_TIMES')
def process_exception(self, request, exception, spider):
if not isinstance(exception, HttpException):
return
self.max_metarefresh_delay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY')
self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
def process_response(self, request, response, spider):
domain = spider.domain_name
status = exception.status
response = exception.response
if status in [302, 303] and 'Location' in response.headers:
if response.status in [302, 303] and 'Location' in response.headers:
redirected_url = urljoin(request.url, response.headers['location'])
redirected = request.replace(url=redirected_url, method='GET', body='')
redirected.headers.pop('Content-Type', None)
redirected.headers.pop('Content-Length', None)
return self._redirect(redirected, request, spider, status)
return self._redirect(redirected, request, spider, response.status)
if status in [301, 307] and 'Location' in response.headers:
if response.status in [301, 307] and 'Location' in response.headers:
redirected_url = urljoin(request.url, response.headers['location'])
redirected = request.replace(url=redirected_url)
return self._redirect(redirected, request, spider, status)
return self._redirect(redirected, request, spider, response.status)
def process_response(self, request, response, spider):
interval, url = get_meta_refresh(response)
if url and int(interval) < META_REFRESH_MAXSEC:
if url and int(interval) < self.max_metarefresh_delay:
redirected = request.replace(url=urljoin(request.url, url))
return self._redirect(redirected, request, spider, 'meta refresh')

View File

@ -19,46 +19,50 @@ About HTTP errors to consider:
- You may want to remove 400 from RETRY_HTTP_CODES, if you stick to the HTTP
protocol. It's included by default because it's a common code used to
indicate server overload, which would be something we want to retry
- 200 is included by default (and shouldn't be removed) to check for partial
download errors, which mean the TCP connection has broken in the middle of
an HTTP download
"""
from twisted.internet.error import TimeoutError as ServerTimeoutError, DNSLookupError, \
ConnectionRefusedError, ConnectionDone, ConnectError, \
ConnectionLost
from twisted.internet.defer import TimeoutError as UserTimeoutError
from twisted.web.client import PartialDownloadError
from scrapy import log
from scrapy.core.exceptions import HttpException
from scrapy.utils.request import request_fingerprint
from scrapy.utils.response import response_status_message
from scrapy.conf import settings
class RetryMiddleware(object):
EXCEPTIONS_TO_RETRY = (ServerTimeoutError, UserTimeoutError, DNSLookupError,
ConnectionRefusedError, ConnectionDone, ConnectError,
ConnectionLost)
ConnectionLost, PartialDownloadError)
def __init__(self):
self.max_retry_times = settings.getint('RETRY_TIMES')
self.retry_http_codes = map(int, settings.getlist('RETRY_HTTP_CODES'))
def process_response(self, request, response, spider):
if response.status in self.retry_http_codes:
reason = response_status_message(response.status)
return self._retry(request, reason, spider) or response
return response
def process_exception(self, request, exception, spider):
if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
or (isinstance(exception, HttpException) \
and (int(exception.status) in self.retry_http_codes)):
if isinstance(exception, self.EXCEPTIONS_TO_RETRY):
return self._retry(request, exception, spider)
retries = request.meta.get('retry_times', 0) + 1
def _retry(self, request, reason, spider):
retries = request.meta.get('retry_times', 0) + 1
if retries <= self.max_retry_times:
log.msg("Retrying %s (failed %d times): %s" % (request, retries, exception),
domain=spider.domain_name, level=log.DEBUG)
retryreq = request.copy()
retryreq.meta['retry_times'] = retries
retryreq.dont_filter = True
return retryreq
else:
log.msg("Discarding %s (failed %d times): %s" % (request, retries, exception),
domain=spider.domain_name, level=log.DEBUG)
if retries <= self.max_retry_times:
log.msg("Retrying %s (failed %d times): %s" % (request, retries, reason),
domain=spider.domain_name, level=log.DEBUG)
retryreq = request.copy()
retryreq.meta['retry_times'] = retries
retryreq.dont_filter = True
return retryreq
else:
log.msg("Discarding %s (failed %d times): %s" % (request, retries, reason),
domain=spider.domain_name, level=log.DEBUG)
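For reference, the retry limits read in __init__ come from settings; a minimal sketch of overriding them in a project (the values are illustrative):

# hypothetical project settings overrides; note '200' is no longer listed,
# since broken downloads are now retried through PartialDownloadError
RETRY_TIMES = 4          # initial response + 4 retries = 5 requests
RETRY_HTTP_CODES = ['500', '502', '503', '504', '400', '408']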

View File

@ -1,4 +1,4 @@
from scrapy.core.exceptions import HttpException, NotConfigured
from scrapy.core.exceptions import NotConfigured
from scrapy.stats import stats
from scrapy.conf import settings
@ -18,7 +18,7 @@ class DownloaderStats(object):
def __init__(self):
if not settings.getbool('DOWNLOADER_STATS'):
raise NotConfigured
def process_request(self, request, spider):
stats.incpath('_global/downloader/request_count')
stats.incpath('%s/downloader/request_count' % spider.domain_name)
@ -30,14 +30,12 @@ class DownloaderStats(object):
def process_response(self, request, response, spider):
self._inc_response_count(response, spider.domain_name)
return response
def process_exception(self, request, exception, spider):
ex_class = "%s.%s" % (exception.__class__.__module__, exception.__class__.__name__)
stats.incpath('_global/downloader/exception_count')
stats.incpath('%s/downloader/exception_count' % spider.domain_name)
stats.incpath('%s/downloader/exception_type_count/%s' % (spider.domain_name, ex_class))
if isinstance(exception, HttpException):
self._inc_response_count(exception.response, spider.domain_name)
def _inc_response_count(self, response, domain):
stats.incpath('_global/downloader/response_count')

View File

@ -16,7 +16,7 @@ except ImportError:
from scrapy import optional_features
from scrapy.core import signals
from scrapy.http import Headers
from scrapy.core.exceptions import HttpException, NotSupported
from scrapy.core.exceptions import NotSupported
from scrapy.utils.defer import defer_succeed
from scrapy.conf import settings
@ -65,13 +65,7 @@ def create_factory(request, spider):
signals.send_catch_log(signal=signals.response_downloaded, sender='download_http', response=r, spider=spider)
return r
def _on_success(body):
response = _create_response(body)
if response.status not in (200, 201, 202):
raise HttpException(response.status, None, response)
return response
factory.deferred.addCallbacks(_on_success)
factory.deferred.addCallbacks(_create_response)
return factory
def download_http(request, spider):

View File

@ -3,7 +3,6 @@ from __future__ import with_statement
from unittest import TestCase
from scrapy.spider import spiders
from scrapy.core.exceptions import HttpException
from scrapy.http import Response, Request
from scrapy.contrib.downloadermiddleware.cookies import CookiesMiddleware
@ -35,21 +34,6 @@ class CookiesMiddlewareTest(TestCase):
assert self.mw.process_request(req2, self.spider) is None
self.assertEquals(req2.headers.get('Cookie'), "C1=value1")
def test_http_exception(self):
req = Request('http://scrapytest.org/')
assert self.mw.process_request(req, self.spider) is None
assert 'Cookie' not in req.headers
headers = {'Set-Cookie': 'C1=value1; path=/'}
res = Response('http://scrapytest.org/', headers=headers)
exc = HttpException(302, 'Redirect', res)
assert self.mw.process_exception(req, exc, self.spider) is None
#assert exc.response.cookies
req2 = Request('http://scrapytest.org/sub1/')
assert self.mw.process_request(req2, self.spider) is None
self.assertEquals(req2.headers.get('Cookie'), "C1=value1")
def test_dont_merge_cookies(self):
# merge some cookies into jar
headers = {'Set-Cookie': 'C1=value1; path=/'}

View File

@ -1,7 +1,6 @@
import unittest
from scrapy.contrib.downloadermiddleware.redirect import RedirectMiddleware
from scrapy.core.exceptions import HttpException
from scrapy.dupefilter import dupefilter
from scrapy.spider import spiders
from scrapy.http import Request, Response, Headers
@ -22,28 +21,24 @@ class RedirectMiddlewareTest(unittest.TestCase):
url = 'http://www.example.com/301'
url2 = 'http://www.example.com/redirected'
req = Request(url)
hdr = Headers({'Location': [url2]})
rsp = Response(url, headers=hdr)
exc = HttpException('301', None, rsp)
rsp = Response(url, headers={'Location': url2}, status=301)
req2 = self.mw.process_exception(req, exc, self.spider)
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, url2)
# a response without a Location header but with a 3XX status code should be ignored
del rsp.headers['Location']
assert self.mw.process_exception(req, exc, self.spider) is None
assert self.mw.process_response(req, rsp, self.spider) is rsp
def test_redirect_302(self):
url = 'http://www.example.com/302'
url2 = 'http://www.example.com/redirected2'
req = Request(url, method='POST', body='test',
headers={'Content-Type': 'text/plain', 'Content-length': '4'})
hdr = Headers({'Location': [url2]})
rsp = Response(url, headers=hdr)
exc = HttpException('302', None, rsp)
rsp = Response(url, headers={'Location': url2}, status=302)
req2 = self.mw.process_exception(req, exc, self.spider)
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, url2)
self.assertEqual(req2.method, 'GET')
@ -56,7 +51,7 @@ class RedirectMiddlewareTest(unittest.TestCase):
# a response without a Location header but with a 3XX status code should be ignored
del rsp.headers['Location']
assert self.mw.process_exception(req, exc, self.spider) is None
assert self.mw.process_response(req, rsp, self.spider) is rsp
def test_meta_refresh(self):
body = """<html>
@ -82,24 +77,24 @@ class RedirectMiddlewareTest(unittest.TestCase):
def test_max_redirect_times(self):
self.mw.max_redirect_times = 1
req = Request('http://scrapytest.org/302')
exc = HttpException('302', None, Response('http://www.scrapytest.org/302', headers={'Location': '/redirected'}))
rsp = Response('http://scrapytest.org/302', headers={'Location': '/redirected'}, status=302)
req = self.mw.process_exception(req, exc, self.spider)
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
assert 'redirect_times' in req.meta
self.assertEqual(req.meta['redirect_times'], 1)
req = self.mw.process_exception(req, exc, self.spider)
req = self.mw.process_response(req, rsp, self.spider)
self.assertEqual(req, None)
def test_ttl(self):
self.mw.max_redirect_times = 100
req = Request('http://scrapytest.org/302', meta={'redirect_ttl': 1})
exc = HttpException('302', None, Response('http://www.scrapytest.org/302', headers={'Location': '/redirected'}))
rsp = Response('http://www.scrapytest.org/302', headers={'Location': '/redirected'}, status=302)
req = self.mw.process_exception(req, exc, self.spider)
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
req = self.mw.process_exception(req, exc, self.spider)
req = self.mw.process_response(req, rsp, self.spider)
self.assertEqual(req, None)
if __name__ == "__main__":

View File

@ -5,7 +5,6 @@ from twisted.internet.error import TimeoutError as ServerTimeoutError, DNSLookup
ConnectionLost
from scrapy.contrib.downloadermiddleware.retry import RetryMiddleware
from scrapy.core.exceptions import HttpException
from scrapy.spider import spiders
from scrapy.http import Request, Response
@ -17,20 +16,31 @@ class RetryTest(unittest.TestCase):
self.mw = RetryMiddleware()
self.mw.max_retry_times = 2
def test_process_exception_404(self):
req404 = Request('http://www.scrapytest.org/404')
exc404 = HttpException('404', None, Response('http://www.scrapytest.org/404', body=''))
def test_404(self):
req = Request('http://www.scrapytest.org/404')
rsp = Response('http://www.scrapytest.org/404', body='', status=404)
# dont retry 404s
req = self.mw.process_exception(req404, exc404, self.spider)
self.assertTrue(req is None)
assert self.mw.process_response(req, rsp, self.spider) is rsp
def test_process_exception_503(self):
req503 = Request('http://www.scrapytest.org/503')
exc503 = HttpException('503', None, Response('http://www.scrapytest.org/503', body=''))
self._test_retry_exception(req503, exc503)
def test_503(self):
req = Request('http://www.scrapytest.org/503')
rsp = Response('http://www.scrapytest.org/503', body='', status=503)
def test_process_exception_twistederrors(self):
# first retry
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
self.assertEqual(req.meta['retry_times'], 1)
# second retry
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
self.assertEqual(req.meta['retry_times'], 2)
# discard it
assert self.mw.process_response(req, rsp, self.spider) is rsp
def test_twistederrors(self):
for exc in (ServerTimeoutError, DNSLookupError, ConnectionRefusedError, ConnectionDone, ConnectError, ConnectionLost):
req = Request('http://www.scrapytest.org/%s' % exc.__name__)
self._test_retry_exception(req, exc())

View File

@ -4,6 +4,7 @@ scrapy.http.Response objects
"""
import re
from twisted.web import http
from scrapy.http.response import Response
def body_or_str(obj, unicode=True):
@ -36,3 +37,14 @@ def get_meta_refresh(response):
match = META_REFRESH_RE.search(response.body[0:4096])
response.cache['meta_refresh_url'] = match.groups() if match else (None, None)
return response.cache['meta_refresh_url']
def response_status_message(status):
"""Return the status code plus its descriptive status text message
>>> response_status_message(200)
'200 OK'
>>> response_status_message(404)
'404 Not Found'
"""
return '%s %s' % (status, http.responses.get(int(status)))