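"""End-to-end crawl tests, run against the local MockServer started in setUp().

They cover link following, download delays, timeouts, retries, start request
handling, the Referer header and engine status reporting."""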
import json
import socket
import mock
from twisted.internet import defer
from twisted.trial.unittest import TestCase
from scrapy.utils.test import docrawl, get_testlog
from tests.spiders import FollowAllSpider, DelaySpider, SimpleSpider, \
    BrokenStartRequestsSpider, SingleRequestSpider, DuplicateStartRequestsSpider
from tests.mockserver import MockServer
from scrapy.http import Request


class CrawlTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        spider = FollowAllSpider()
        yield docrawl(spider)
        self.assertEqual(len(spider.urls_visited), 11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_delay(self):
        # short to long delays
        yield self._test_delay(0.2, False)
        yield self._test_delay(1, False)
        # same delays with randomization enabled
        yield self._test_delay(0.2, True)
        yield self._test_delay(1, True)

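    # Helper: crawl FollowAllSpider with the given DOWNLOAD_DELAY and check that the
    # average interval between consecutive responses is not much smaller than the delay.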
    @defer.inlineCallbacks
    def _test_delay(self, delay, randomize):
        settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
        spider = FollowAllSpider(maxlatency=delay * 2)
        yield docrawl(spider, settings)
        t = spider.times
        totaltime = t[-1] - t[0]
        avgd = totaltime / (len(t) - 1)
        tolerance = 0.6 if randomize else 0.2
        self.assertTrue(avgd > delay * (1 - tolerance),
                        "download delay too small: %s" % avgd)

    @defer.inlineCallbacks
    def test_timeout_success(self):
        spider = DelaySpider(n=0.5)
        yield docrawl(spider)
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 > 0)
        self.assertTrue(spider.t2 > spider.t1)

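    # DOWNLOAD_TIMEOUT (0.35s) is shorter than the 0.5s delay of the mock response, so the
    # download must fail: t2 is never set and t2_err records the time of the failure.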
    @defer.inlineCallbacks
    def test_timeout_failure(self):
        spider = DelaySpider(n=0.5)
        yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 == 0)
        self.assertTrue(spider.t2_err > 0)
        self.assertTrue(spider.t2_err > spider.t1)
        # server hangs after receiving response headers
        spider = DelaySpider(n=0.5, b=1)
        yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 == 0)
        self.assertTrue(spider.t2_err > 0)
        self.assertTrue(spider.t2_err > spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        spider = SimpleSpider("http://localhost:8998/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        # nothing listens on port 65432, so every attempt fails to connect
        spider = SimpleSpider("http://localhost:65432/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

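    # socket.gethostbyname is patched to fail, so DNS resolution raises an error and
    # the request is retried like any other download failure.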
    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        with mock.patch('socket.gethostbyname',
                        side_effect=socket.gaierror(-5, 'No address associated with hostname')):
            spider = SimpleSpider("http://example.com/")
            yield docrawl(spider)
            self._assert_retried()

    @defer.inlineCallbacks
    def test_start_requests_bug_before_yield(self):
        spider = BrokenStartRequestsSpider(fail_before_yield=1)
        yield docrawl(spider)
        errors = self.flushLoggedErrors(ZeroDivisionError)
        self.assertEqual(len(errors), 1)

    @defer.inlineCallbacks
    def test_start_requests_bug_yielding(self):
        spider = BrokenStartRequestsSpider(fail_yielding=1)
        yield docrawl(spider)
        errors = self.flushLoggedErrors(ZeroDivisionError)
        self.assertEqual(len(errors), 1)

    @defer.inlineCallbacks
    def test_start_requests_laziness(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        spider = BrokenStartRequestsSpider()
        yield docrawl(spider, settings)
        #self.assertTrue(False, spider.seedsseen)
        #self.assertTrue(spider.seedsseen.index(None) < spider.seedsseen.index(99),
        #                spider.seedsseen)

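    # With dont_filter=True every start request is crawled (2 distinct URLs * dupe_factor 3
    # = 6 visits); with dont_filter=False the dupefilter collapses them to the distinct URLs.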
    @defer.inlineCallbacks
    def test_start_requests_dupes(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        spider = DuplicateStartRequestsSpider(dont_filter=True,
                                              distinct_urls=2,
                                              dupe_factor=3)
        yield docrawl(spider, settings)
        self.assertEqual(spider.visited, 6)

        spider = DuplicateStartRequestsSpider(dont_filter=False,
                                              distinct_urls=3,
                                              dupe_factor=4)
        yield docrawl(spider, settings)
        self.assertEqual(spider.visited, 3)

    @defer.inlineCallbacks
    def test_unbounded_response(self):
        # The completeness of responses without Content-Length or Transfer-Encoding
        # cannot be determined; they are treated as valid but flagged as "partial"
        from urllib import urlencode
        query = urlencode({'raw': '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''})
        spider = SimpleSpider("http://localhost:8998/raw?{0}".format(query))
        yield docrawl(spider)
        log = get_testlog()
        self.assertEqual(log.count("Got response 200"), 1)

    @defer.inlineCallbacks
    def test_retry_conn_lost(self):
        # connection lost after receiving data
        spider = SimpleSpider("http://localhost:8998/drop?abort=0")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_conn_aborted(self):
        # connection lost before receiving data
        spider = SimpleSpider("http://localhost:8998/drop?abort=1")
        yield docrawl(spider)
        self._assert_retried()

    def _assert_retried(self):
        log = get_testlog()
        self.assertEqual(log.count("Retrying"), 2)
        self.assertEqual(log.count("Gave up retrying"), 1)

    @defer.inlineCallbacks
    def test_referer_header(self):
        """Referer header is set by RefererMiddleware unless it is already set"""
        req0 = Request('http://localhost:8998/echo?headers=1&body=0', dont_filter=1)
        req1 = req0.replace()
        req2 = req0.replace(headers={'Referer': None})
        req3 = req0.replace(headers={'Referer': 'http://example.com'})
        req0.meta['next'] = req1
        req1.meta['next'] = req2
        req2.meta['next'] = req3
        spider = SingleRequestSpider(seed=req0)
        yield docrawl(spider)
        # basic asserts in case of weird communication errors
        self.assertIn('responses', spider.meta)
        self.assertNotIn('failures', spider.meta)
        # the start request doesn't set a Referer header
        echo0 = json.loads(spider.meta['responses'][0].body)
        self.assertNotIn('Referer', echo0['headers'])
        # the following request sets Referer to the start request url
        echo1 = json.loads(spider.meta['responses'][1].body)
        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
        # the next request suppresses the Referer header via {'Referer': None}
        echo2 = json.loads(spider.meta['responses'][2].body)
        self.assertNotIn('Referer', echo2['headers'])
        # the last request explicitly sets a Referer header
        echo3 = json.loads(spider.meta['responses'][3].body)
        self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])

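    # The callback runs while the response is still being processed, so the engine status
    # snapshot should report exactly one active request in the scraper slot.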
    @defer.inlineCallbacks
    def test_engine_status(self):
        from scrapy.utils.engine import get_engine_status
        est = []

        def cb(response):
            est.append(get_engine_status(spider.crawler.engine))

        spider = SingleRequestSpider(seed='http://localhost:8998/', callback_func=cb)
        yield docrawl(spider)
        self.assertEqual(len(est), 1, est)
        s = dict(est[0])
        self.assertEqual(s['engine.spider.name'], spider.name)
        self.assertEqual(s['len(engine.scraper.slot.active)'], 1)