Mirror of https://github.com/scrapy/scrapy.git — synced 2025-02-24 06:24:00 +00:00

Fix for #612 + integration-style tests for HttpErrorMiddleware

This commit is contained in:
Mikhail Korobov 2014-03-19 04:04:24 +06:00
parent 364790eb07
commit 75060d1424
2 changed files with 80 additions and 4 deletions

View File

@@ -25,7 +25,7 @@ class HttpErrorMiddleware(object):
        self.handle_httpstatus_list = settings.getlist('HTTPERROR_ALLOWED_CODES')

    def process_spider_input(self, response, spider):
        if 200 <= response.status < 300:  # common case
            return
        meta = response.meta
        if 'handle_httpstatus_all' in meta:
@@ -38,11 +38,14 @@ class HttpErrorMiddleware(object):
        allowed_statuses = getattr(spider, 'handle_httpstatus_list', self.handle_httpstatus_list)
        if response.status in allowed_statuses:
            return
        log.msg(format="Ignoring HTTP response code: not handled or not allowed: %(status_code)d",
                level=log.DEBUG, spider=spider,
                status_code=response.status)
        raise HttpError(response, 'Ignoring non-200 response')
    def process_spider_exception(self, response, exception, spider):
        if isinstance(exception, HttpError):
            log.msg(
                format="Ignoring response %(response)r: HTTP status code is not handled or not allowed",
                level=log.DEBUG,
                spider=spider,
                response=response
            )
            return []

View File

@@ -1,11 +1,51 @@
from unittest import TestCase

from twisted.trial.unittest import TestCase as TrialTestCase
from twisted.internet import defer

from scrapy.utils.test import docrawl, get_testlog
from scrapy.tests.mockserver import MockServer
from scrapy.http import Response, Request
from scrapy.spider import Spider
from scrapy.contrib.spidermiddleware.httperror import HttpErrorMiddleware, HttpError
from scrapy.settings import Settings
class _HttpErrorSpider(Spider):
    """Test spider that records, per status code, whether each response
    ended up parsed, skipped (bypassed HttpError) or failed.
    """
    name = 'httperror'
    start_urls = [
        "http://localhost:8998/status?n=200",
        "http://localhost:8998/status?n=404",
        "http://localhost:8998/status?n=402",
        "http://localhost:8998/status?n=500",
    ]
    # status codes for which an HttpError failure is swallowed and the
    # response is handed to parse() anyway
    bypass_status_codes = set()

    def __init__(self, *args, **kwargs):
        super(_HttpErrorSpider, self).__init__(*args, **kwargs)
        # each set collects the last 3 chars of the URL, i.e. the "n=XXX" code
        self.failed, self.skipped, self.parsed = set(), set(), set()

    def start_requests(self):
        return (Request(url, self.parse, errback=self.on_error)
                for url in self.start_urls)

    def parse(self, response):
        # the requested URL ends with the status code that was asked for
        self.parsed.add(response.url[-3:])

    def on_error(self, failure):
        if isinstance(failure.value, HttpError):
            resp = failure.value.response
            if resp.status in self.bypass_status_codes:
                # treat this code as handled: record it and parse as usual
                self.skipped.add(resp.url[-3:])
                return self.parse(resp)
        # NOTE(review): assumes the failure value carries a response attribute
        self.failed.add(failure.value.response.url[-3:])
        return failure
def _responses(request, status_codes):
    responses = []
    for code in status_codes:
@@ -48,6 +88,7 @@ class TestHttpErrorMiddleware(TestCase):
        self.assertEquals(None,
            self.mw.process_spider_input(self.res404, self.spider))


class TestHttpErrorMiddlewareSettings(TestCase):
    """Similar test, but with settings"""
@@ -85,6 +126,7 @@ class TestHttpErrorMiddlewareSettings(TestCase):
        self.assertRaises(HttpError,
            self.mw.process_spider_input, self.res402, self.spider)


class TestHttpErrorMiddlewareHandleAll(TestCase):
    def setUp(self):
@@ -112,3 +154,34 @@ class TestHttpErrorMiddlewareHandleAll(TestCase):
        self.assertRaises(HttpError,
            self.mw.process_spider_input, res402, self.spider)
class TestHttpErrorMiddlewareIntegrational(TrialTestCase):
    """End-to-end checks of HttpErrorMiddleware against a running MockServer."""

    def setUp(self):
        # enter the mock-server context manually; tearDown closes it
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_middleware_works(self):
        spider = _HttpErrorSpider()
        yield docrawl(spider)
        # nothing is bypassed by default; only the 2xx response gets parsed
        self.assertFalse(spider.skipped, spider.skipped)
        self.assertEqual(spider.parsed, {'200'})
        self.assertEqual(spider.failed, {'404', '402', '500'})

    @defer.inlineCallbacks
    def test_logging(self):
        spider = _HttpErrorSpider(bypass_status_codes={402})
        yield docrawl(spider)
        self.assertEqual(spider.parsed, {'200', '402'})
        self.assertEqual(spider.skipped, {'402'})
        self.assertEqual(spider.failed, {'404', '500'})

        captured = get_testlog()
        # only codes that are neither 2xx nor bypassed should be logged
        for code in ('404', '500'):
            self.assertIn('Ignoring response <%s' % code, captured)
        for code in ('200', '402'):
            self.assertNotIn('Ignoring response <%s' % code, captured)