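"""Tests for scrapy.downloadermiddlewares.retry.RetryMiddleware."""
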
import unittest

from twisted.internet import defer
from twisted.internet.error import (TimeoutError, DNSLookupError,
                                    ConnectionRefusedError, ConnectionDone,
                                    ConnectError, ConnectionLost,
                                    TCPTimedOutError)
from twisted.web.client import ResponseFailed

from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.spiders import Spider
from scrapy.http import Request, Response
from scrapy.utils.test import get_crawler

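# The tests below drive RetryMiddleware directly and override
# mw.max_retry_times on the instance. A minimal sketch (illustrative,
# not part of the original suite) of doing the same through settings:
#
#     crawler = get_crawler(Spider, settings_dict={'RETRY_TIMES': 2})
#     mw = RetryMiddleware.from_crawler(crawler)

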
class RetryTest(unittest.TestCase):
    def setUp(self):
        self.crawler = get_crawler(Spider)
        self.spider = self.crawler._create_spider('foo')
        self.mw = RetryMiddleware.from_crawler(self.crawler)
        self.mw.max_retry_times = 2

    def test_priority_adjust(self):
        req = Request('http://www.scrapytest.org/503')
        rsp = Response('http://www.scrapytest.org/503', body=b'', status=503)
        req2 = self.mw.process_response(req, rsp, self.spider)
        assert req2.priority < req.priority

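    # The drop in priority comes from the RETRY_PRIORITY_ADJUST setting
    # (-1 by default), applied when the middleware builds the retry request;
    # roughly (a sketch, not the middleware's exact code):
    #
    #     retryreq = request.copy()
    #     retryreq.priority = request.priority + self.priority_adjust
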
    def test_404(self):
        req = Request('http://www.scrapytest.org/404')
        rsp = Response('http://www.scrapytest.org/404', body=b'', status=404)

        # don't retry 404s
        assert self.mw.process_response(req, rsp, self.spider) is rsp

    def test_dont_retry(self):
        req = Request('http://www.scrapytest.org/503', meta={'dont_retry': True})
        rsp = Response('http://www.scrapytest.org/503', body=b'', status=503)

        # dont_retry=True: the 503 is passed through instead of being retried
        r = self.mw.process_response(req, rsp, self.spider)
        assert r is rsp

        # dont_retry=False, but the response defaults to 200, so no retry either
        req = Request('http://www.scrapytest.org/503', meta={'dont_retry': False})
        rsp = Response('http://www.scrapytest.org/503')
        r = self.mw.process_response(req, rsp, self.spider)
        assert r is rsp

    def test_dont_retry_exc(self):
        req = Request('http://www.scrapytest.org/503', meta={'dont_retry': True})

        r = self.mw.process_exception(req, DNSLookupError(), self.spider)
        assert r is None

    def test_503(self):
        req = Request('http://www.scrapytest.org/503')
        rsp = Response('http://www.scrapytest.org/503', body=b'', status=503)

        # first retry
        req = self.mw.process_response(req, rsp, self.spider)
        assert isinstance(req, Request)
        self.assertEqual(req.meta['retry_times'], 1)

        # second retry
        req = self.mw.process_response(req, rsp, self.spider)
        assert isinstance(req, Request)
        self.assertEqual(req.meta['retry_times'], 2)

        # discard it
        assert self.mw.process_response(req, rsp, self.spider) is rsp

        assert self.crawler.stats.get_value('retry/max_reached') == 1
        assert self.crawler.stats.get_value('retry/reason_count/503 Service Unavailable') == 2
        assert self.crawler.stats.get_value('retry/count') == 2

    def test_twistederrors(self):
        exceptions = [defer.TimeoutError, TCPTimedOutError, TimeoutError,
                      DNSLookupError, ConnectionRefusedError, ConnectionDone,
                      ConnectError, ConnectionLost, ResponseFailed]

        for exc in exceptions:
            req = Request('http://www.scrapytest.org/%s' % exc.__name__)
            self._test_retry_exception(req, exc('foo'))

        stats = self.crawler.stats
        assert stats.get_value('retry/max_reached') == len(exceptions)
        assert stats.get_value('retry/count') == len(exceptions) * 2
        assert stats.get_value('retry/reason_count/twisted.internet.defer.TimeoutError') == 2

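    # All of the classes above appear in the middleware's EXCEPTIONS_TO_RETRY
    # tuple; process_exception() only reschedules exceptions listed there.
    # Roughly (a sketch, not the middleware's exact code):
    #
    #     if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
    #             and not request.meta.get('dont_retry', False):
    #         return self._retry(request, exception, spider)
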
    def _test_retry_exception(self, req, exception):
        # first retry
        req = self.mw.process_exception(req, exception, self.spider)
        assert isinstance(req, Request)
        self.assertEqual(req.meta['retry_times'], 1)

        # second retry
        req = self.mw.process_exception(req, exception, self.spider)
        assert isinstance(req, Request)
        self.assertEqual(req.meta['retry_times'], 2)

        # discard it
        req = self.mw.process_exception(req, exception, self.spider)
        self.assertIsNone(req)


class MaxRetryTimesTest(unittest.TestCase):
    def setUp(self):
        self.crawler = get_crawler(Spider)
        self.spider = self.crawler._create_spider('foo')
        self.mw = RetryMiddleware.from_crawler(self.crawler)
        self.mw.max_retry_times = 2
        self.invalid_url = 'http://www.scrapytest.org/invalid_url'

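    # The tests below check that a per-request meta value takes precedence
    # over the RETRY_TIMES setting. The resolution inside the middleware is
    # roughly (a sketch, not its exact code):
    #
    #     retries = request.meta.get('retry_times', 0) + 1
    #     max_retry_times = request.meta.get('max_retry_times', self.max_retry_times)
    #     if retries <= max_retry_times:
    #         ...schedule a retry...
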
    def test_with_settings_zero(self):
        # SETTINGS: RETRY_TIMES = 0
        self.mw.max_retry_times = 0

        req = Request(self.invalid_url)
        self._test_retry(req, DNSLookupError('foo'), self.mw.max_retry_times)

    def test_with_metakey_zero(self):
        # META: max_retry_times = 0
        meta_max_retry_times = 0

        req = Request(self.invalid_url, meta={'max_retry_times': meta_max_retry_times})
        self._test_retry(req, DNSLookupError('foo'), meta_max_retry_times)

    def test_without_metakey(self):
        # SETTINGS: RETRY_TIMES is non-zero
        self.mw.max_retry_times = 5

        req = Request(self.invalid_url)
        self._test_retry(req, DNSLookupError('foo'), self.mw.max_retry_times)

    def test_with_metakey_greater(self):
        # SETTINGS: RETRY_TIMES < meta(max_retry_times)
        self.mw.max_retry_times = 2
        meta_max_retry_times = 3

        req1 = Request(self.invalid_url, meta={'max_retry_times': meta_max_retry_times})
        req2 = Request(self.invalid_url)

        self._test_retry(req1, DNSLookupError('foo'), meta_max_retry_times)
        self._test_retry(req2, DNSLookupError('foo'), self.mw.max_retry_times)

    def test_with_metakey_lesser(self):
        # SETTINGS: RETRY_TIMES > meta(max_retry_times)
        self.mw.max_retry_times = 5
        meta_max_retry_times = 4

        req1 = Request(self.invalid_url, meta={'max_retry_times': meta_max_retry_times})
        req2 = Request(self.invalid_url)

        self._test_retry(req1, DNSLookupError('foo'), meta_max_retry_times)
        self._test_retry(req2, DNSLookupError('foo'), self.mw.max_retry_times)

    def test_with_dont_retry(self):
        # META: max_retry_times = 4, dont_retry = True
        meta_max_retry_times = 4

        req = Request(self.invalid_url,
                      meta={'max_retry_times': meta_max_retry_times,
                            'dont_retry': True})

        self._test_retry(req, DNSLookupError('foo'), 0)

    def _test_retry(self, req, exception, max_retry_times):
        for _ in range(max_retry_times):
            req = self.mw.process_exception(req, exception, self.spider)
            assert isinstance(req, Request)

        # discard it once the retry budget is exhausted
        req = self.mw.process_exception(req, exception, self.spider)
        self.assertIsNone(req)


if __name__ == "__main__":
    unittest.main()