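"""Tests for scrapy.downloadermiddlewares.retry.RetryMiddleware.

Covers retrying on HTTP status codes (e.g. 503), retrying on network-level
Twisted exceptions, the ``dont_retry`` and ``max_retry_times`` request meta
keys, and the retry-related crawler stats.
"""
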
import unittest

from twisted.internet import defer
from twisted.internet.error import (
    TimeoutError, DNSLookupError, ConnectionRefusedError, ConnectionDone,
    ConnectError, ConnectionLost, TCPTimedOutError,
)
from twisted.web.client import ResponseFailed

from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.spiders import Spider
from scrapy.http import Request, Response
from scrapy.utils.test import get_crawler
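
# Usage sketch (illustrative only, not exercised by these tests): the retry
# behaviour verified below is tuned per project through settings or per
# request through meta keys. The setting and meta key names are Scrapy's
# real API; the values are examples.
#
#   # settings.py
#   RETRY_TIMES = 2                               # retries per request
#   RETRY_HTTP_CODES = [500, 502, 503, 504, 408]  # statuses worth retrying
#
#   # in a spider callback
#   yield Request(url, meta={'max_retry_times': 5})  # per-request override
#   yield Request(url, meta={'dont_retry': True})    # opt out entirely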


class RetryTest(unittest.TestCase):

    def setUp(self):
        self.crawler = get_crawler(Spider)
        self.spider = self.crawler._create_spider('foo')
        self.mw = RetryMiddleware.from_crawler(self.crawler)
        self.mw.max_retry_times = 2

    def test_priority_adjust(self):
        req = Request('http://www.scrapytest.org/503')
        rsp = Response('http://www.scrapytest.org/503', body=b'', status=503)
        req2 = self.mw.process_response(req, rsp, self.spider)
        assert req2.priority < req.priority

    def test_404(self):
        req = Request('http://www.scrapytest.org/404')
        rsp = Response('http://www.scrapytest.org/404', body=b'', status=404)
        # don't retry 404s: the response passes through unchanged
        assert self.mw.process_response(req, rsp, self.spider) is rsp

    def test_dont_retry(self):
        req = Request('http://www.scrapytest.org/503', meta={'dont_retry': True})
        rsp = Response('http://www.scrapytest.org/503', body=b'', status=503)

        # dont_retry short-circuits: the 503 response is returned as-is
        r = self.mw.process_response(req, rsp, self.spider)
        assert r is rsp

        # with dont_retry set to False, a non-retryable (200) response
        # still passes through unchanged
        req = Request('http://www.scrapytest.org/503', meta={'dont_retry': False})
        rsp = Response('http://www.scrapytest.org/503')
        r = self.mw.process_response(req, rsp, self.spider)
        assert r is rsp

    def test_dont_retry_exc(self):
        # dont_retry also suppresses retries for download exceptions
        req = Request('http://www.scrapytest.org/503', meta={'dont_retry': True})
        r = self.mw.process_exception(req, DNSLookupError(), self.spider)
        assert r is None

    def test_503(self):
        req = Request('http://www.scrapytest.org/503')
        rsp = Response('http://www.scrapytest.org/503', body=b'', status=503)

        # first retry
        req = self.mw.process_response(req, rsp, self.spider)
        assert isinstance(req, Request)
        self.assertEqual(req.meta['retry_times'], 1)

        # second retry
        req = self.mw.process_response(req, rsp, self.spider)
        assert isinstance(req, Request)
        self.assertEqual(req.meta['retry_times'], 2)

        # discard it
        assert self.mw.process_response(req, rsp, self.spider) is rsp

        assert self.crawler.stats.get_value('retry/max_reached') == 1
        assert self.crawler.stats.get_value('retry/reason_count/503 Service Unavailable') == 2
        assert self.crawler.stats.get_value('retry/count') == 2

    def test_twistederrors(self):
        # these mirror the middleware's EXCEPTIONS_TO_RETRY tuple
        exceptions = [defer.TimeoutError, TCPTimedOutError, TimeoutError,
                      DNSLookupError, ConnectionRefusedError, ConnectionDone,
                      ConnectError, ConnectionLost, ResponseFailed]

        for exc in exceptions:
            req = Request('http://www.scrapytest.org/%s' % exc.__name__)
            self._test_retry_exception(req, exc('foo'))

        stats = self.crawler.stats
        assert stats.get_value('retry/max_reached') == len(exceptions)
        assert stats.get_value('retry/count') == len(exceptions) * 2
        assert stats.get_value('retry/reason_count/twisted.internet.defer.TimeoutError') == 2

    def _test_retry_exception(self, req, exception):
        # first retry
        req = self.mw.process_exception(req, exception, self.spider)
        assert isinstance(req, Request)
        self.assertEqual(req.meta['retry_times'], 1)

        # second retry
        req = self.mw.process_exception(req, exception, self.spider)
        assert isinstance(req, Request)
        self.assertEqual(req.meta['retry_times'], 2)

        # discard it
        req = self.mw.process_exception(req, exception, self.spider)
        self.assertIsNone(req)
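

# The tests below pin down how the retry limit is resolved: a per-request
# 'max_retry_times' meta key, when present, overrides the middleware-wide
# limit taken from the RETRY_TIMES setting, and 'dont_retry' wins over both.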
class MaxRetryTimesTest(unittest.TestCase):

    def setUp(self):
        self.crawler = get_crawler(Spider)
        self.spider = self.crawler._create_spider('foo')
        self.mw = RetryMiddleware.from_crawler(self.crawler)
        self.mw.max_retry_times = 2
        self.invalid_url = 'http://www.scrapytest.org/invalid_url'

    def test_with_settings_zero(self):
        # SETTINGS: RETRY_TIMES = 0
        self.mw.max_retry_times = 0
        req = Request(self.invalid_url)
        self._test_retry(req, DNSLookupError('foo'), self.mw.max_retry_times)

    def test_with_metakey_zero(self):
        # REQUEST META: max_retry_times = 0
        meta_max_retry_times = 0
        req = Request(self.invalid_url, meta={'max_retry_times': meta_max_retry_times})
        self._test_retry(req, DNSLookupError('foo'), meta_max_retry_times)

    def test_without_metakey(self):
        # SETTINGS: RETRY_TIMES is NON-ZERO
        self.mw.max_retry_times = 5
        req = Request(self.invalid_url)
        self._test_retry(req, DNSLookupError('foo'), self.mw.max_retry_times)

    def test_with_metakey_greater(self):
        # SETTINGS: RETRY_TIMES < meta(max_retry_times)
        self.mw.max_retry_times = 2
        meta_max_retry_times = 3

        req1 = Request(self.invalid_url, meta={'max_retry_times': meta_max_retry_times})
        req2 = Request(self.invalid_url)

        self._test_retry(req1, DNSLookupError('foo'), meta_max_retry_times)
        self._test_retry(req2, DNSLookupError('foo'), self.mw.max_retry_times)

    def test_with_metakey_lesser(self):
        # SETTINGS: RETRY_TIMES > meta(max_retry_times)
        self.mw.max_retry_times = 5
        meta_max_retry_times = 4

        req1 = Request(self.invalid_url, meta={'max_retry_times': meta_max_retry_times})
        req2 = Request(self.invalid_url)

        self._test_retry(req1, DNSLookupError('foo'), meta_max_retry_times)
        self._test_retry(req2, DNSLookupError('foo'), self.mw.max_retry_times)

    def test_with_dont_retry(self):
        # REQUEST META: max_retry_times = 4, dont_retry = True
        meta_max_retry_times = 4
        req = Request(self.invalid_url,
                      meta={'max_retry_times': meta_max_retry_times,
                            'dont_retry': True})
        self._test_retry(req, DNSLookupError('foo'), 0)

    def _test_retry(self, req, exception, max_retry_times):
        # each allowed attempt yields a new retry request
        for _ in range(max_retry_times):
            req = self.mw.process_exception(req, exception, self.spider)
            assert isinstance(req, Request)
        # once the limit is exceeded, the request is discarded
        req = self.mw.process_exception(req, exception, self.spider)
        self.assertIsNone(req)


if __name__ == "__main__":
    unittest.main()