from twisted.internet import defer
from twisted.trial.unittest import TestCase

from scrapy.utils.test import get_crawler, get_testlog
from scrapy.tests.spiders import FollowAllSpider, DelaySpider, SimpleSpider
from scrapy.tests.mockserver import MockServer


def docrawl(spider, settings=None):
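    """Run a full crawl of the given spider, optionally overriding settings.

    Returns the Deferred fired by Crawler.start(), so callers running
    under inlineCallbacks can yield until the crawl finishes.
    """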
    crawler = get_crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    return crawler.start()


class CrawlTestCase(TestCase):
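    """End-to-end crawl tests run against a local MockServer."""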

    def setUp(self):
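        # Start the mock HTTP server; tearDown stops it via __exit__().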
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
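        """The spider should visit the start URL plus the 10 links it follows."""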
        spider = FollowAllSpider()
        yield docrawl(spider)
        self.assertEqual(len(spider.urls_visited), 11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_delay(self):
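        """Consecutive requests should honor DOWNLOAD_DELAY.

        With RANDOMIZE_DOWNLOAD_DELAY enabled (the default) the effective
        delay ranges from 0.5x to 1.5x the setting, hence the 0.45s lower
        bound for a 1s delay.
        """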
        spider = FollowAllSpider()
        yield docrawl(spider, {"DOWNLOAD_DELAY": 1})
        t = spider.times[0]
        for t2 in spider.times[1:]:
            self.assertTrue(t2 - t > 0.45, "download delay too small: %s" % (t2 - t))
            t = t2

    @defer.inlineCallbacks
    def test_timeout_success(self):
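        """A response delayed less than DOWNLOAD_TIMEOUT is still received."""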
        spider = DelaySpider(n=0.5)
        yield docrawl(spider)
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 > 0)
        self.assertTrue(spider.t2 > spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
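        """A response slower than DOWNLOAD_TIMEOUT fails instead of arriving.

        t2 stays 0 (no response received) while t2_err records the failure
        time, both when the whole response is delayed and when the server
        hangs after sending the headers.
        """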
        spider = DelaySpider(n=0.5)
        yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 == 0)
        self.assertTrue(spider.t2_err > 0)
        self.assertTrue(spider.t2_err > spider.t1)
        # same failure when the server sends the response headers, then hangs
        spider = DelaySpider(n=0.5, b=1)
        yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 == 0)
        self.assertTrue(spider.t2_err > 0)
        self.assertTrue(spider.t2_err > spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
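        """A 503 response is retried twice (the RETRY_TIMES default), then given up on."""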
        spider = SimpleSpider("http://localhost:8998/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
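        """A failed connection (nothing listening on port 65432) is retried."""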
        spider = SimpleSpider("http://localhost:65432/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
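        """A DNS lookup error (non-resolvable hostname) is retried."""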
        spider = SimpleSpider("http://localhost666/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    def _assert_retried(self):
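        """Assert the test log shows two retries followed by giving up."""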
        log = get_testlog()
        self.assertEqual(log.count("Retrying"), 2)
        self.assertEqual(log.count("Gave up retrying"), 1)