from unittest import TextTestResult

from twisted.internet import defer
from twisted.python import failure
from twisted.trial import unittest

from scrapy import FormRequest
from scrapy.crawler import CrawlerRunner
from scrapy.spidermiddlewares.httperror import HttpError
from scrapy.spiders import Spider
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.contracts import ContractsManager, Contract
from scrapy.contracts.default import (
    UrlContract,
    CallbackKeywordArgumentsContract,
    ReturnsContract,
    ScrapesContract,
)
from tests.mockserver import MockServer


class TestItem(Item):
    name = Field()
    url = Field()


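# Minimal stand-in for scrapy.http.Response: the callbacks and contracts in
# this module only ever read the ``url`` attribute.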
class ResponseMock:
    url = 'http://scrapy.org'


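# The custom contracts below exercise the Contract extension hooks:
# adjust_request_args() may rewrite the kwargs used to build the request, or
# raise to signal a broken contract.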
class CustomSuccessContract(Contract):
    name = 'custom_success_contract'

    def adjust_request_args(self, args):
        args['url'] = 'http://scrapy.org'
        return args


class CustomFailContract(Contract):
    name = 'custom_fail_contract'

    def adjust_request_args(self, args):
        raise TypeError('Error in adjust_request_args')


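# request_cls switches the request class that ContractsManager builds, so
# callbacks annotated with @custom_form get a FormRequest (see
# test_form_contract below).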
class CustomFormContract(Contract):
    name = 'custom_form'
    request_cls = FormRequest

    def adjust_request_args(self, args):
        args['formdata'] = {'name': 'scrapy'}
        return args


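# Contracts are declared as @-prefixed lines inside callback docstrings; the
# remaining free-text in each docstring is ignored by the contracts machinery.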
class TestSpider(Spider):
    name = 'demo_spider'

    def returns_request(self, response):
        """ method which returns request
        @url http://scrapy.org
        @returns requests 1
        """
        return Request('http://scrapy.org', callback=self.returns_item)

    def returns_item(self, response):
        """ method which returns item
        @url http://scrapy.org
        @returns items 1 1
        """
        return TestItem(url=response.url)

    def returns_request_cb_kwargs(self, response, url):
        """ method which returns request
        @url https://example.org
        @cb_kwargs {"url": "http://scrapy.org"}
        @returns requests 1
        """
        return Request(url, callback=self.returns_item_cb_kwargs)

    def returns_item_cb_kwargs(self, response, name):
        """ method which returns item
        @url http://scrapy.org
        @cb_kwargs {"name": "Scrapy"}
        @returns items 1 1
        """
        return TestItem(name=name, url=response.url)

    def returns_item_cb_kwargs_error_unexpected_keyword(self, response):
        """ method which returns item
        @url http://scrapy.org
        @cb_kwargs {"arg": "value"}
        @returns items 1 1
        """
        return TestItem(url=response.url)

    def returns_item_cb_kwargs_error_missing_argument(self, response, arg):
        """ method which returns item
        @url http://scrapy.org
        @returns items 1 1
        """
        return TestItem(url=response.url)

    def returns_dict_item(self, response):
        """ method which returns item
        @url http://scrapy.org
        @returns items 1 1
        """
        return {"url": response.url}

    def returns_fail(self, response):
        """ method which returns item
        @url http://scrapy.org
        @returns items 0 0
        """
        return TestItem(url=response.url)

    def returns_dict_fail(self, response):
        """ method which returns item
        @url http://scrapy.org
        @returns items 0 0
        """
        return {'url': response.url}

    def scrapes_item_ok(self, response):
        """ returns item with name and url
        @url http://scrapy.org
        @returns items 1 1
        @scrapes name url
        """
        return TestItem(name='test', url=response.url)

    def scrapes_dict_item_ok(self, response):
        """ returns item with name and url
        @url http://scrapy.org
        @returns items 1 1
        @scrapes name url
        """
        return {'name': 'test', 'url': response.url}

    def scrapes_item_fail(self, response):
        """ returns item with no name
        @url http://scrapy.org
        @returns items 1 1
        @scrapes name url
        """
        return TestItem(url=response.url)

    def scrapes_dict_item_fail(self, response):
        """ returns item with no name
        @url http://scrapy.org
        @returns items 1 1
        @scrapes name url
        """
        return {'url': response.url}

    def scrapes_multiple_missing_fields(self, response):
        """ returns item with no name
        @url http://scrapy.org
        @returns items 1 1
        @scrapes name url
        """
        return {}

    def parse_no_url(self, response):
        """ method with no url
        @returns items 1 1
        """
        pass

    def custom_form(self, response):
        """
        @url http://scrapy.org
        @custom_form
        """
        pass


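# Spiders exercising only the custom contracts defined above, driven by
# test_custom_contracts().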
class CustomContractSuccessSpider(Spider):
    name = 'custom_contract_success_spider'

    def parse(self, response):
        """
        @custom_success_contract
        """
        pass


class CustomContractFailSpider(Spider):
    name = 'custom_contract_fail_spider'

    def parse(self, response):
        """
        @custom_fail_contract
        """
        pass


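# InheritsTestSpider defines no callbacks of its own: test_inherited_contracts
# checks that contracts declared on TestSpider methods are still collected.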
class InheritsTestSpider(TestSpider):
    name = 'inherits_demo_spider'


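# The test case proper: every test routes spider callbacks through a
# ContractsManager and then inspects the standard unittest result object.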
class ContractsManagerTest(unittest.TestCase):
    contracts = [
        UrlContract,
        CallbackKeywordArgumentsContract,
        ReturnsContract,
        ScrapesContract,
        CustomFormContract,
        CustomSuccessContract,
        CustomFailContract,
    ]

    def setUp(self):
        self.conman = ContractsManager(self.contracts)
        self.results = TextTestResult(stream=None, descriptions=False, verbosity=0)

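    # Assertion helpers: contracts report through the TextTestResult API, so
    # pass/fail/error can be read straight off self.results.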
    def should_succeed(self):
        self.assertFalse(self.results.failures)
        self.assertFalse(self.results.errors)

    def should_fail(self):
        self.assertTrue(self.results.failures)
        self.assertFalse(self.results.errors)

    def should_error(self):
        self.assertTrue(self.results.errors)

    def test_contracts(self):
        spider = TestSpider()

        # extract contracts correctly
        contracts = self.conman.extract_contracts(spider.returns_request)
        self.assertEqual(len(contracts), 2)
        self.assertEqual(
            frozenset(type(x) for x in contracts),
            frozenset([UrlContract, ReturnsContract]))

        # returns request for valid method
        request = self.conman.from_method(spider.returns_request, self.results)
        self.assertIsNotNone(request)

        # no request for missing url
        request = self.conman.from_method(spider.parse_no_url, self.results)
        self.assertIsNone(request)

    def test_cb_kwargs(self):
        spider = TestSpider()
        response = ResponseMock()

        # extract contracts correctly
        contracts = self.conman.extract_contracts(spider.returns_request_cb_kwargs)
        self.assertEqual(len(contracts), 3)
        self.assertEqual(
            frozenset(type(x) for x in contracts),
            frozenset([UrlContract, CallbackKeywordArgumentsContract, ReturnsContract]))

        contracts = self.conman.extract_contracts(spider.returns_item_cb_kwargs)
        self.assertEqual(len(contracts), 3)
        self.assertEqual(
            frozenset(type(x) for x in contracts),
            frozenset([UrlContract, CallbackKeywordArgumentsContract, ReturnsContract]))

        contracts = self.conman.extract_contracts(spider.returns_item_cb_kwargs_error_unexpected_keyword)
        self.assertEqual(len(contracts), 3)
        self.assertEqual(
            frozenset(type(x) for x in contracts),
            frozenset([UrlContract, CallbackKeywordArgumentsContract, ReturnsContract]))

        contracts = self.conman.extract_contracts(spider.returns_item_cb_kwargs_error_missing_argument)
        self.assertEqual(len(contracts), 2)
        self.assertEqual(
            frozenset(type(x) for x in contracts),
            frozenset([UrlContract, ReturnsContract]))

        # returns_request
        request = self.conman.from_method(spider.returns_request_cb_kwargs, self.results)
        request.callback(response, **request.cb_kwargs)
        self.should_succeed()

        # returns_item
        request = self.conman.from_method(spider.returns_item_cb_kwargs, self.results)
        request.callback(response, **request.cb_kwargs)
        self.should_succeed()

        # returns_item (error, callback doesn't take keyword arguments)
        request = self.conman.from_method(spider.returns_item_cb_kwargs_error_unexpected_keyword, self.results)
        request.callback(response, **request.cb_kwargs)
        self.should_error()

        # returns_item (error, contract doesn't provide keyword arguments)
        request = self.conman.from_method(spider.returns_item_cb_kwargs_error_missing_argument, self.results)
        request.callback(response, **request.cb_kwargs)
        self.should_error()

    def test_returns(self):
        spider = TestSpider()
        response = ResponseMock()

        # returns_item
        request = self.conman.from_method(spider.returns_item, self.results)
        request.callback(response)
        self.should_succeed()

        # returns_dict_item
        request = self.conman.from_method(spider.returns_dict_item, self.results)
        request.callback(response)
        self.should_succeed()

        # returns_request
        request = self.conman.from_method(spider.returns_request, self.results)
        request.callback(response)
        self.should_succeed()

        # returns_fail
        request = self.conman.from_method(spider.returns_fail, self.results)
        request.callback(response)
        self.should_fail()

        # returns_dict_fail
        request = self.conman.from_method(spider.returns_dict_fail, self.results)
        request.callback(response)
        self.should_fail()

    def test_scrapes(self):
        spider = TestSpider()
        response = ResponseMock()

        # scrapes_item_ok
        request = self.conman.from_method(spider.scrapes_item_ok, self.results)
        request.callback(response)
        self.should_succeed()

        # scrapes_dict_item_ok
        request = self.conman.from_method(spider.scrapes_dict_item_ok, self.results)
        request.callback(response)
        self.should_succeed()

        # scrapes_item_fail
        request = self.conman.from_method(spider.scrapes_item_fail, self.results)
        request.callback(response)
        self.should_fail()

        # scrapes_dict_item_fail
        request = self.conman.from_method(spider.scrapes_dict_item_fail, self.results)
        request.callback(response)
        self.should_fail()

        # scrapes_multiple_missing_fields
        request = self.conman.from_method(spider.scrapes_multiple_missing_fields, self.results)
        request.callback(response)
        self.should_fail()
        message = 'ContractFail: Missing fields: name, url'
        self.assertIn(message, self.results.failures[-1][-1])

    def test_custom_contracts(self):
        self.conman.from_spider(CustomContractSuccessSpider(), self.results)
        self.should_succeed()

        self.conman.from_spider(CustomContractFailSpider(), self.results)
        self.should_error()

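    # Raising and catching HttpError produces a failure.Failure with a real
    # traceback, which is what Scrapy would hand to an errback at runtime.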
    def test_errback(self):
        spider = TestSpider()
        response = ResponseMock()

        try:
            raise HttpError(response, 'Ignoring non-200 response')
        except HttpError:
            failure_mock = failure.Failure()

        request = self.conman.from_method(spider.returns_request, self.results)
        request.errback(failure_mock)

        self.assertFalse(self.results.failures)
        self.assertTrue(self.results.errors)

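    # Both callbacks share the same @url, and the crawl must visit both:
    # contract requests may not be dropped by the duplicate-request filter.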
    @defer.inlineCallbacks
    def test_same_url(self):

        class TestSameUrlSpider(Spider):
            name = 'test_same_url'

            def __init__(self, *args, **kwargs):
                super().__init__(*args, **kwargs)
                self.visited = 0

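            # note: 's', not 'self' -- 'self' here is the enclosing test case,
            # whose ContractsManager and results the spider borrows.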
            def start_requests(s):
                return self.conman.from_spider(s, self.results)

            def parse_first(self, response):
                self.visited += 1
                return TestItem()

            def parse_second(self, response):
                self.visited += 1
                return TestItem()

        with MockServer() as mockserver:
            contract_doc = f'@url {mockserver.url("/status?n=200")}'

            TestSameUrlSpider.parse_first.__doc__ = contract_doc
            TestSameUrlSpider.parse_second.__doc__ = contract_doc

            crawler = CrawlerRunner().create_crawler(TestSameUrlSpider)
            yield crawler.crawl()

        self.assertEqual(crawler.spider.visited, 2)

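    # @custom_form sets request_cls = FormRequest; since the contract supplies
    # formdata, the resulting request defaults to POST.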
    def test_form_contract(self):
        spider = TestSpider()
        request = self.conman.from_method(spider.custom_form, self.results)
        self.assertEqual(request.method, 'POST')
        self.assertIsInstance(request, FormRequest)

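    # Contracts declared on TestSpider must also be collected from a subclass
    # that merely inherits the annotated callbacks.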
    def test_inherited_contracts(self):
        spider = InheritsTestSpider()

        requests = self.conman.from_spider(spider, self.results)
        self.assertTrue(requests)