import json
import logging

from testfixtures import LogCapture
from twisted.internet import defer
from twisted.trial.unittest import TestCase

from scrapy.http import Request
from scrapy.crawler import CrawlerRunner
from scrapy.utils.python import to_unicode
from tests.spiders import FollowAllSpider, DelaySpider, SimpleSpider, \
    BrokenStartRequestsSpider, SingleRequestSpider, DuplicateStartRequestsSpider
from tests.mockserver import MockServer

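
# End-to-end crawl tests: each test case starts a local MockServer (see setUp),
# drives a complete crawl through CrawlerRunner on the Twisted reactor, and then
# inspects what the spider saw or what was logged.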
class CrawlTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        crawler = self.runner.create_crawler(FollowAllSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertEqual(len(crawler.spider.urls_visited), 11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_delay(self):
        # short to long delays
        yield self._test_delay(0.2, False)
        yield self._test_delay(1, False)
        # the same delays, randomized
        yield self._test_delay(0.2, True)
        yield self._test_delay(1, True)

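    # Helper: run a full crawl with the given DOWNLOAD_DELAY and check that the
    # average gap between consecutive requests is not much shorter than the
    # configured delay (a wider tolerance is used when the delay is randomized).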
    @defer.inlineCallbacks
    def _test_delay(self, delay, randomize):
        settings = {"DOWNLOAD_DELAY": delay, "RANDOMIZE_DOWNLOAD_DELAY": randomize}
        crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
        yield crawler.crawl(maxlatency=delay * 2, mockserver=self.mockserver)
        t = crawler.spider.times
        totaltime = t[-1] - t[0]
        avgd = totaltime / (len(t) - 1)
        tolerance = 0.6 if randomize else 0.2
        self.assertTrue(avgd > delay * (1 - tolerance),
                        "download delay too small: %s" % avgd)

    @defer.inlineCallbacks
    def test_timeout_success(self):
        crawler = self.runner.create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 > 0)
        self.assertTrue(crawler.spider.t2 > crawler.spider.t1)

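    # DelaySpider is expected to request a response delayed by ``n`` seconds and
    # to record timestamps: t1 when the request goes out, t2 on success, t2_err
    # on error.  With DOWNLOAD_TIMEOUT below the server-side delay the download
    # should fail, so t2 stays 0 and only t2_err is set.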
    @defer.inlineCallbacks
    def test_timeout_failure(self):
        crawler = CrawlerRunner({"DOWNLOAD_TIMEOUT": 0.35}).create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
        # the server sends the response headers, then hangs before sending the body
        yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/status?n=503"), mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl("http://localhost:65432/status?n=503", mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            # try to fetch the homepage of a non-existent domain
            yield crawler.crawl("http://dns.resolution.invalid./", mockserver=self.mockserver)
        self._assert_retried(l)

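    # BrokenStartRequestsSpider raises ZeroDivisionError from start_requests();
    # the two tests below check that the error surfaces as exactly one logged
    # ERROR record (not swallowed), whether it is raised before the first yield
    # or while yielding requests.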
    @defer.inlineCallbacks
    def test_start_requests_bug_before_yield(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_before_yield=1, mockserver=self.mockserver)

        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_bug_yielding(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_yielding=1, mockserver=self.mockserver)

        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_lazyness(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(BrokenStartRequestsSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        #self.assertTrue(False, crawler.spider.seedsseen)
        #self.assertTrue(crawler.spider.seedsseen.index(None) < crawler.spider.seedsseen.index(99),
        #                crawler.spider.seedsseen)

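    # DuplicateStartRequestsSpider yields each of `distinct_urls` start URLs
    # `dupe_factor` times.  With dont_filter=True every request goes through
    # (2 * 3 = 6 visits); with dont_filter=False the dupefilter collapses the
    # duplicates, leaving one visit per distinct URL (3).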
    @defer.inlineCallbacks
    def test_start_requests_dupes(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(DuplicateStartRequestsSpider)
        yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3, mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 6)

        yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4, mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 3)

    @defer.inlineCallbacks
    def test_unbounded_response(self):
        # Completeness of responses without Content-Length or Transfer-Encoding
        # cannot be determined; we treat them as valid, but flag them as "partial"
        from six.moves.urllib.parse import urlencode
        query = urlencode({'raw': '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''})
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/raw?{0}".format(query)), mockserver=self.mockserver)
        self.assertEqual(str(l).count("Got response 200"), 1)

    @defer.inlineCallbacks
    def test_retry_conn_lost(self):
        # connection lost after receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/drop?abort=0"), mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_aborted(self):
        # connection lost before receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/drop?abort=1"), mockserver=self.mockserver)
        self._assert_retried(l)

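    # With Scrapy's default RETRY_TIMES (2), a request that keeps failing is
    # retried twice and then abandoned, so the captured log should contain
    # "Retrying" twice and "Gave up retrying" once.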
    def _assert_retried(self, log):
        self.assertEqual(str(log).count("Retrying"), 2)
        self.assertEqual(str(log).count("Gave up retrying"), 1)

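    # SingleRequestSpider is expected to start from the request passed as `seed`,
    # follow the chain of requests linked via meta['next'], and collect each
    # response, in order, in crawler.spider.meta['responses'].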
    @defer.inlineCallbacks
    def test_referer_header(self):
        """Referer header is set by RefererMiddleware unless it is already set"""
        req0 = Request(self.mockserver.url('/echo?headers=1&body=0'), dont_filter=1)
        req1 = req0.replace()
        req2 = req0.replace(headers={'Referer': None})
        req3 = req0.replace(headers={'Referer': 'http://example.com'})
        req0.meta['next'] = req1
        req1.meta['next'] = req2
        req2.meta['next'] = req3
        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=req0, mockserver=self.mockserver)
        # basic asserts in case of weird communication errors
        self.assertIn('responses', crawler.spider.meta)
        self.assertNotIn('failures', crawler.spider.meta)
        # the start request doesn't set a Referer header
        echo0 = json.loads(to_unicode(crawler.spider.meta['responses'][0].body))
        self.assertNotIn('Referer', echo0['headers'])
        # the following request sets Referer to the start request URL
        echo1 = json.loads(to_unicode(crawler.spider.meta['responses'][1].body))
        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
        # the next request avoids the Referer header
        echo2 = json.loads(to_unicode(crawler.spider.meta['responses'][2].body))
        self.assertNotIn('Referer', echo2['headers'])
        # the last request explicitly sets a Referer header
        echo3 = json.loads(to_unicode(crawler.spider.meta['responses'][3].body))
        self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])

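    # get_engine_status() takes a snapshot of the engine's internals; since the
    # callback runs while the single response is still being processed, the
    # scraper slot should report exactly one active response.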
    @defer.inlineCallbacks
    def test_engine_status(self):
        from scrapy.utils.engine import get_engine_status
        est = []

        def cb(response):
            est.append(get_engine_status(crawler.engine))

        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=self.mockserver.url('/'), callback_func=cb, mockserver=self.mockserver)
        self.assertEqual(len(est), 1, est)
        s = dict(est[0])
        self.assertEqual(s['engine.spider.name'], crawler.spider.name)
        self.assertEqual(s['len(engine.scraper.slot.active)'], 1)

    @defer.inlineCallbacks
    def test_graceful_crawl_error_handling(self):
        """
        Test whether errors happening anywhere in Crawler.crawl() are properly
        reported (and not somehow swallowed) after a graceful engine shutdown.
        The errors should not come from within Scrapy's core but from within
        spiders/middlewares/etc., e.g. raised in Spider.start_requests(),
        SpiderMiddleware.process_start_requests(), etc.
        """

        class TestError(Exception):
            pass

        class FaultySpider(SimpleSpider):
            def start_requests(self):
                raise TestError

        crawler = self.runner.create_crawler(FaultySpider)
        yield self.assertFailure(crawler.crawl(mockserver=self.mockserver), TestError)
        self.assertFalse(crawler.crawling)

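    # tests.pipelines.ZeroDivisionErrorPipeline is assumed to raise when the
    # spider is opened; the error should propagate out of crawl() and leave the
    # crawler in a non-crawling state.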
    @defer.inlineCallbacks
    def test_open_spider_error_on_faulty_pipeline(self):
        settings = {
            "ITEM_PIPELINES": {
                "tests.pipelines.ZeroDivisionErrorPipeline": 300,
            }
        }
        crawler = CrawlerRunner(settings).create_crawler(SimpleSpider)
        yield self.assertFailure(
            self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver),
            ZeroDivisionError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_crawlerrunner_accepts_crawler(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))

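    # Both crawls are scheduled on the same runner without being yielded;
    # runner.join() waits for both to finish.  The n=200 crawl should log a
    # response while the n=503 crawl should exhaust its retries.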
    @defer.inlineCallbacks
    def test_crawl_multiple(self):
        self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver)
        self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=503"), mockserver=self.mockserver)

        with LogCapture() as log:
            yield self.runner.join()

        self._assert_retried(log)
        self.assertIn("Got response 200", str(log))