mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-24 06:24:00 +00:00
Fix for #612 + integration-style tests for HttpErrorMiddleware
This commit is contained in:
parent
364790eb07
commit
75060d1424
@ -25,7 +25,7 @@ class HttpErrorMiddleware(object):
|
|||||||
self.handle_httpstatus_list = settings.getlist('HTTPERROR_ALLOWED_CODES')
|
self.handle_httpstatus_list = settings.getlist('HTTPERROR_ALLOWED_CODES')
|
||||||
|
|
||||||
def process_spider_input(self, response, spider):
|
def process_spider_input(self, response, spider):
|
||||||
if 200 <= response.status < 300: # common case
|
if 200 <= response.status < 300: # common case
|
||||||
return
|
return
|
||||||
meta = response.meta
|
meta = response.meta
|
||||||
if 'handle_httpstatus_all' in meta:
|
if 'handle_httpstatus_all' in meta:
|
||||||
@ -38,11 +38,14 @@ class HttpErrorMiddleware(object):
|
|||||||
allowed_statuses = getattr(spider, 'handle_httpstatus_list', self.handle_httpstatus_list)
|
allowed_statuses = getattr(spider, 'handle_httpstatus_list', self.handle_httpstatus_list)
|
||||||
if response.status in allowed_statuses:
|
if response.status in allowed_statuses:
|
||||||
return
|
return
|
||||||
log.msg(format="Ignoring HTTP response code: not handled or not allowed: %(status_code)d",
|
|
||||||
level=log.DEBUG, spider=spider,
|
|
||||||
status_code=response.status)
|
|
||||||
raise HttpError(response, 'Ignoring non-200 response')
|
raise HttpError(response, 'Ignoring non-200 response')
|
||||||
|
|
||||||
def process_spider_exception(self, response, exception, spider):
|
def process_spider_exception(self, response, exception, spider):
|
||||||
if isinstance(exception, HttpError):
|
if isinstance(exception, HttpError):
|
||||||
|
log.msg(
|
||||||
|
format="Ignoring response %(response)r: HTTP status code is not handled or not allowed",
|
||||||
|
level=log.DEBUG,
|
||||||
|
spider=spider,
|
||||||
|
response=response
|
||||||
|
)
|
||||||
return []
|
return []
|
||||||
|
@ -1,11 +1,51 @@
|
|||||||
from unittest import TestCase
|
from unittest import TestCase
|
||||||
|
|
||||||
|
from twisted.trial.unittest import TestCase as TrialTestCase
|
||||||
|
from twisted.internet import defer
|
||||||
|
|
||||||
|
from scrapy.utils.test import docrawl, get_testlog
|
||||||
|
from scrapy.tests.mockserver import MockServer
|
||||||
from scrapy.http import Response, Request
|
from scrapy.http import Response, Request
|
||||||
from scrapy.spider import Spider
|
from scrapy.spider import Spider
|
||||||
from scrapy.contrib.spidermiddleware.httperror import HttpErrorMiddleware, HttpError
|
from scrapy.contrib.spidermiddleware.httperror import HttpErrorMiddleware, HttpError
|
||||||
from scrapy.settings import Settings
|
from scrapy.settings import Settings
|
||||||
|
|
||||||
|
|
||||||
|
class _HttpErrorSpider(Spider):
|
||||||
|
name = 'httperror'
|
||||||
|
start_urls = [
|
||||||
|
"http://localhost:8998/status?n=200",
|
||||||
|
"http://localhost:8998/status?n=404",
|
||||||
|
"http://localhost:8998/status?n=402",
|
||||||
|
"http://localhost:8998/status?n=500",
|
||||||
|
]
|
||||||
|
bypass_status_codes = set()
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super(_HttpErrorSpider, self).__init__(*args, **kwargs)
|
||||||
|
self.failed = set()
|
||||||
|
self.skipped = set()
|
||||||
|
self.parsed = set()
|
||||||
|
|
||||||
|
def start_requests(self):
|
||||||
|
for url in self.start_urls:
|
||||||
|
yield Request(url, self.parse, errback=self.on_error)
|
||||||
|
|
||||||
|
def parse(self, response):
|
||||||
|
self.parsed.add(response.url[-3:])
|
||||||
|
|
||||||
|
def on_error(self, failure):
|
||||||
|
if isinstance(failure.value, HttpError):
|
||||||
|
response = failure.value.response
|
||||||
|
if response.status in self.bypass_status_codes:
|
||||||
|
self.skipped.add(response.url[-3:])
|
||||||
|
return self.parse(response)
|
||||||
|
|
||||||
|
# it assumes there is a response attached to failure
|
||||||
|
self.failed.add(failure.value.response.url[-3:])
|
||||||
|
return failure
|
||||||
|
|
||||||
|
|
||||||
def _responses(request, status_codes):
|
def _responses(request, status_codes):
|
||||||
responses = []
|
responses = []
|
||||||
for code in status_codes:
|
for code in status_codes:
|
||||||
@ -48,6 +88,7 @@ class TestHttpErrorMiddleware(TestCase):
|
|||||||
self.assertEquals(None,
|
self.assertEquals(None,
|
||||||
self.mw.process_spider_input(self.res404, self.spider))
|
self.mw.process_spider_input(self.res404, self.spider))
|
||||||
|
|
||||||
|
|
||||||
class TestHttpErrorMiddlewareSettings(TestCase):
|
class TestHttpErrorMiddlewareSettings(TestCase):
|
||||||
"""Similar test, but with settings"""
|
"""Similar test, but with settings"""
|
||||||
|
|
||||||
@ -85,6 +126,7 @@ class TestHttpErrorMiddlewareSettings(TestCase):
|
|||||||
self.assertRaises(HttpError,
|
self.assertRaises(HttpError,
|
||||||
self.mw.process_spider_input, self.res402, self.spider)
|
self.mw.process_spider_input, self.res402, self.spider)
|
||||||
|
|
||||||
|
|
||||||
class TestHttpErrorMiddlewareHandleAll(TestCase):
|
class TestHttpErrorMiddlewareHandleAll(TestCase):
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
@ -112,3 +154,34 @@ class TestHttpErrorMiddlewareHandleAll(TestCase):
|
|||||||
self.assertRaises(HttpError,
|
self.assertRaises(HttpError,
|
||||||
self.mw.process_spider_input, res402, self.spider)
|
self.mw.process_spider_input, res402, self.spider)
|
||||||
|
|
||||||
|
|
||||||
|
class TestHttpErrorMiddlewareIntegrational(TrialTestCase):
|
||||||
|
def setUp(self):
|
||||||
|
self.mockserver = MockServer()
|
||||||
|
self.mockserver.__enter__()
|
||||||
|
|
||||||
|
def tearDown(self):
|
||||||
|
self.mockserver.__exit__(None, None, None)
|
||||||
|
|
||||||
|
@defer.inlineCallbacks
|
||||||
|
def test_middleware_works(self):
|
||||||
|
spider = _HttpErrorSpider()
|
||||||
|
yield docrawl(spider)
|
||||||
|
assert not spider.skipped, spider.skipped
|
||||||
|
self.assertEqual(spider.parsed, {'200'})
|
||||||
|
self.assertEqual(spider.failed, {'404', '402', '500'})
|
||||||
|
|
||||||
|
@defer.inlineCallbacks
|
||||||
|
def test_logging(self):
|
||||||
|
spider = _HttpErrorSpider(bypass_status_codes={402})
|
||||||
|
yield docrawl(spider)
|
||||||
|
# print(get_testlog())
|
||||||
|
self.assertEqual(spider.parsed, {'200', '402'})
|
||||||
|
self.assertEqual(spider.skipped, {'402'})
|
||||||
|
self.assertEqual(spider.failed, {'404', '500'})
|
||||||
|
|
||||||
|
log = get_testlog()
|
||||||
|
self.assertIn('Ignoring response <404', log)
|
||||||
|
self.assertIn('Ignoring response <500', log)
|
||||||
|
self.assertNotIn('Ignoring response <200', log)
|
||||||
|
self.assertNotIn('Ignoring response <402', log)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user