SEP-017 contracts
* load contracts from settings
* refactored contracts manager
* fixed a callback bug that sometimes caused responses to be evaluated with the wrong callback
* "returns" contract
parent 99b76eaa2c
commit 901987154e
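The contracts are declared as `@`-prefixed lines in a callback docstring. A minimal sketch of a spider using them (the spider class, name, and import path are illustrative assumptions for the Scrapy of this era; the `@url`, `@returns`, and `@scrapes` syntax comes from the contract docstrings in the diff below):

    from scrapy.spider import BaseSpider

    class DemoSpider(BaseSpider):
        name = 'demo'

        def parse(self, response):
            """ Callback checked by the check command below.

            @url http://scrapy.org
            @returns items, 1
            @scrapes page_name, page_body
            """
            # parse the response and yield items here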
--- a/scrapy/commands/check.py
+++ b/scrapy/commands/check.py
@@ -1,9 +1,21 @@
 from functools import wraps
 
+from scrapy.conf import settings
 from scrapy.command import ScrapyCommand
 from scrapy.http import Request
-from scrapy.contracts import Contract
+from scrapy.contracts import ContractsManager
+from scrapy.utils import display
+from scrapy.utils.misc import load_object
+from scrapy.utils.spider import iterate_spider_output
+
+
+def _generate(cb):
+    """ create a callback which does not return anything """
+    @wraps(cb)
+    def wrapper(response):
+        output = cb(response)
+        output = list(iterate_spider_output(output))
+        # display.pprint(output)
+    return wrapper
 
 
 class Command(ScrapyCommand):
     requires_project = True
@@ -15,6 +27,17 @@ class Command(ScrapyCommand):
         return "Check contracts for given spider"
 
     def run(self, args, opts):
+        self.conman = ContractsManager()
+
+        # load contracts
+        contracts = settings['SPIDER_CONTRACTS_BASE'] + \
+            settings['SPIDER_CONTRACTS']
+        for contract in contracts:
+            concls = load_object(contract)
+            self.conman.register(concls)
+
+        # schedule requests
         self.crawler.engine.has_capacity = lambda: True
 
         for spider in args or self.crawler.spiders.list():
@@ -22,29 +45,19 @@ class Command(ScrapyCommand):
             requests = self.get_requests(spider)
             self.crawler.crawl(spider, requests)
 
+        # start checks
         self.crawler.start()
 
     def get_requests(self, spider):
         requests = []
 
-        for key, value in vars(type(spider)).iteritems():
+        for key, value in vars(type(spider)).items():
             if callable(value) and value.__doc__:
                 bound_method = value.__get__(spider, type(spider))
-                request = Request(url='http://scrapy.org', callback=bound_method)
-
-                # register contract hooks to the request
-                contracts = Contract.from_method(value)
-                for contract in contracts:
-                    request = contract.prepare_request(request)
-
-                # discard anything the request might return
-                cb = request.callback
-                @wraps(cb)
-                def wrapper(response):
-                    cb(response)
-
-                request.callback = wrapper
-
-                requests.append(request)
+                request = self.conman.from_method(bound_method)
+
+                if request:
+                    request.callback = _generate(request.callback)
+                    requests.append(request)
 
         return requests
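The "fixed callback bug" bullet plausibly refers to the block removed above: the old `get_requests` defined `wrapper(response)` inside the loop, and all closures created during one call share the same `cb` cell, so once the loop finished every wrapper invoked the last method's callback. Hoisting the closure into the module-level `_generate(cb)` factory binds `cb` once per callback. A minimal standalone demonstration of the pitfall (illustrative, not from the commit):

    # All closures made in one function call share its local `cb` cell.
    def broken():
        wrappers = []
        for cb in ('first', 'second'):
            def wrapper():
                return cb
            wrappers.append(wrapper)
        return [w() for w in wrappers]

    # Factory fix, as _generate does: each call binds its own `cb`.
    def make(cb):
        def wrapper():
            return cb
        return wrapper

    print(broken())                                        # ['second', 'second']
    print([w() for w in (make('first'), make('second'))])  # ['first', 'second']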
--- a/scrapy/contracts/__init__.py
+++ b/scrapy/contracts/__init__.py
@@ -1,2 +1,76 @@
-from .base import Contract, ContractType
-from .default import *
+import re
+import inspect
+from functools import wraps
+
+from scrapy.http import Request
+from scrapy.utils.spider import iterate_spider_output
+from scrapy.utils.misc import get_spec
+from scrapy.exceptions import ContractFail
+
+
+class ContractsManager(object):
+    registered = {}
+
+    def register(self, contract):
+        self.registered[contract.name] = contract
+
+    def extract_contracts(self, method):
+        contracts = []
+        for line in method.__doc__.split('\n'):
+            line = line.strip()
+
+            if line.startswith('@'):
+                name, args = re.match(r'@(\w+)\s*(.*)', line).groups()
+                args = re.split(r'\s*\,\s*', args)
+
+                contracts.append(self.registered[name](method, *args))
+
+        return contracts
+
+    def from_method(self, method):
+        contracts = self.extract_contracts(method)
+        if contracts:
+            # calculate request args
+            args = get_spec(Request.__init__)[1]
+            args['callback'] = method
+            for contract in contracts:
+                args = contract.adjust_request_args(args)
+
+            # create and prepare request
+            assert 'url' in args, "Method '%s' does not have an url contract" % method.__name__
+            request = Request(**args)
+            for contract in contracts:
+                request = contract.prepare_request(request)
+
+            return request
+
+
+class Contract(object):
+    """ Abstract class for contracts """
+
+    def __init__(self, method, *args):
+        self.method = method
+        self.args = args
+
+    def prepare_request(self, request):
+        cb = request.callback
+        @wraps(cb)
+        def wrapper(response):
+            self.pre_process(response)
+            output = list(iterate_spider_output(cb(response)))
+            self.post_process(output)
+            return output
+
+        request.callback = wrapper
+        request = self.modify_request(request)
+        return request
+
+    def adjust_request_args(self, args):
+        return args
+
+    def modify_request(self, request):
+        return request
+
+    def pre_process(self, response):
+        pass
+
+    def post_process(self, output):
+        pass
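To make `extract_contracts` concrete: on the line `@returns items, 1`, `re.match(r'@(\w+)\s*(.*)', line).groups()` yields `('returns', 'items, 1')` and the comma split produces `['items', '1']`, so the manager instantiates `ReturnsContract(method, 'items', '1')`. A standalone trace with the same regexes (the docstring is made up):

    import re

    docstring = '''
        @url http://scrapy.org
        @returns items, 1
    '''
    for line in docstring.split('\n'):
        line = line.strip()
        if line.startswith('@'):
            name, args = re.match(r'@(\w+)\s*(.*)', line).groups()
            print(name, re.split(r'\s*\,\s*', args))
    # url ['http://scrapy.org']
    # returns ['items', '1']

The new `\s*\,\s*` split also matters for the `returns` syntax: the pattern it replaces in the deleted base.py, `[\,\s+]`, treats `+` as a literal inside the character class and would have split the `1+` modifier apart.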
--- a/scrapy/contracts/base.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import re
-from functools import wraps
-
-from scrapy.utils.spider import iterate_spider_output
-
-
-class ContractType(type):
-    """ Metaclass for contracts
-    - automatically registers contracts in the root `Contract` class
-    """
-
-    def __new__(meta, name, bases, dct):
-        # only allow single inheritence
-        assert len(bases) == 1, 'Multiple inheritance is not allowed'
-        base = bases[0]
-
-        # ascend in inheritence chain
-        while type(base) not in [type, meta]:
-            base = type(base)
-
-        # register this as a valid contract
-        cls = type.__new__(meta, name, bases, dct)
-        if type(base) != type:
-            base.registered[cls.name] = cls
-        return cls
-
-
-class Contract(object):
-    """ Abstract class for contracts
-    - keeps a reference of all derived classes in `registered`
-    """
-
-    __metaclass__ = ContractType
-    registered = {}
-
-    def __init__(self, method, *args):
-        self.method = method
-        self.args = args
-
-    @classmethod
-    def from_method(cls, method):
-        contracts = []
-        for line in method.__doc__.split('\n'):
-            line = line.strip()
-
-            if line.startswith('@'):
-                name, args = re.match(r'@(\w+)\s*(.*)', line).groups()
-                args = re.split(r'[\,\s+]', args)
-                args = filter(lambda x:x, args)
-
-                contracts.append(cls.registered[name](method, *args))
-
-        return contracts
-
-    def prepare_request(self, request):
-        cb = request.callback
-        @wraps(cb)
-        def wrapper(response):
-            self.pre_process(response)
-            output = list(iterate_spider_output(cb(response)))
-            self.post_process(output)
-            return output
-
-        request.callback = wrapper
-        request = self.modify_request(request)
-        return request
-
-    def modify_request(self, request):
-        return request
-
-    def pre_process(self, response):
-        pass
-
-    def post_process(self, output):
-        pass
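Where the deleted base.py registered every subclass implicitly through the `ContractType` metaclass, registration is now explicit and settings-driven, condensed from the check command above:

    conman = ContractsManager()
    for path in settings['SPIDER_CONTRACTS_BASE'] + settings['SPIDER_CONTRACTS']:
        conman.register(load_object(path))  # e.g. 'scrapy.contracts.default.UrlContract'

This also drops the single-inheritance assertion; a contract class now only needs a `name` attribute, since `register` keys on `contract.name`.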
--- a/scrapy/contracts/default.py
+++ b/scrapy/contracts/default.py
@@ -1,23 +1,77 @@
 from scrapy.item import BaseItem
+from scrapy.http import Request
+from scrapy.exceptions import ContractFail
 
-from .base import Contract
+from . import Contract
 
 
 # contracts
 class UrlContract(Contract):
+    """ Contract to set the url of the request (mandatory)
+    @url http://scrapy.org
+    """
+
     name = 'url'
 
-    def modify_request(self, request):
-        return request.replace(url=self.args[0])
+    def adjust_request_args(self, args):
+        args['url'] = self.args[0]
+        return args
 
-class ReturnsRequestContract(Contract):
-    name = 'returns_request'
+
+class ReturnsContract(Contract):
+    """ Contract to check the output of a callback
+    @returns items, 1
+    @returns requests, 1+
+    """
+
+    name = 'returns'
+    objects = {
+        'requests': Request,
+        'items': BaseItem,
+    }
+
+    def __init__(self, *args, **kwargs):
+        super(ReturnsContract, self).__init__(*args, **kwargs)
+
+        if len(self.args) != 2:
+            raise ContractFail("Returns Contract must have two arguments")
+        self.obj_name, self.raw_num = self.args
+
+        # validate input
+        self.obj_type = self.objects[self.obj_name]
+
+        self.modifier = self.raw_num[-1]
+        if self.modifier in ['+', '-']:
+            self.num = int(self.raw_num[:-1])
+        else:
+            self.num = int(self.raw_num)
+            self.modifier = None
+
+    def post_process(self, output):
+        occurences = 0
+        for x in output:
+            if isinstance(x, self.obj_type):
+                occurences += 1
+
+        if self.modifier == '+':
+            assertion = (occurences >= self.num)
+        elif self.modifier == '-':
+            assertion = (occurences <= self.num)
+        else:
+            assertion = (occurences == self.num)
+
+        if not assertion:
+            raise ContractFail("Returned %s %s, expected %s" % \
+                (occurences, self.obj_name, self.raw_num))
+
 
 class ScrapesContract(Contract):
+    """ Contract to check presence of fields in scraped items
+    @scrapes page_name, page_body
+    """
+
     name = 'scrapes'
 
     def post_process(self, output):
         for x in output:
             if isinstance(x, BaseItem):
                 for arg in self.args:
-                    assert arg in x, '%r field is missing' % arg
+                    if not arg in x:
+                        raise ContractFail('%r field is missing' % arg)
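To trace `ReturnsContract`'s handling of its second argument: `'1'` parses to `num=1, modifier=None` (exact count), `'1+'` to `num=1, modifier='+'` (at least), and `'3-'` would parse to `num=3, modifier='-'` (at most). A sketch mirroring the `__init__` logic:

    # mirrors the raw_num parsing in ReturnsContract.__init__ (sketch)
    def parse_raw_num(raw_num):
        modifier = raw_num[-1]
        if modifier in ['+', '-']:
            return int(raw_num[:-1]), modifier
        return int(raw_num), None

    print(parse_raw_num('1'))   # (1, None) -> require occurences == 1
    print(parse_raw_num('1+'))  # (1, '+')  -> require occurences >= 1
    print(parse_raw_num('3-'))  # (3, '-')  -> require occurences <= 3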
--- a/scrapy/exceptions.py
+++ b/scrapy/exceptions.py
@@ -50,3 +50,6 @@ class ScrapyDeprecationWarning(Warning):
     """
     pass
+
+class ContractFail(Exception):
+    """Error in constructing contracts for a method"""
+    pass
--- a/scrapy/settings/default_settings.py
+++ b/scrapy/settings/default_settings.py
@@ -241,3 +241,10 @@ WEBSERVICE_RESOURCES_BASE = {
     'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1,
     'scrapy.contrib.webservice.stats.StatsResource': 1,
 }
+
+SPIDER_CONTRACTS = []
+SPIDER_CONTRACTS_BASE = [
+    'scrapy.contracts.default.UrlContract',
+    'scrapy.contracts.default.ReturnsContract',
+    'scrapy.contracts.default.ScrapesContract',
+]
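`SPIDER_CONTRACTS_BASE` holds the built-ins and `SPIDER_CONTRACTS` is the project-level hook; the check command concatenates the two lists before registering. A hypothetical project `settings.py` (the contract path is made up):

    # myproject/settings.py
    SPIDER_CONTRACTS = [
        'myproject.contracts.HasHeaderContract',  # hypothetical custom contract
    ]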
--- a/scrapy/utils/misc.py
+++ b/scrapy/utils/misc.py
@@ -1,6 +1,7 @@
 """Helper functions which doesn't fit anywhere else"""
 
 import re
+import inspect
 import hashlib
 from pkgutil import iter_modules
 
@@ -104,3 +105,17 @@ def md5sum(file):
         m.update(d)
     return m.hexdigest()
+
+
+def get_spec(func):
+    """Returns (args, kwargs) tuple for a function
+
+    >>> import re
+    >>> get_spec(re.match)
+    (['pattern', 'string'], {'flags': 0})
+    """
+    spec = inspect.getargspec(func)
+    defaults = spec.defaults or []
+
+    firstdefault = len(spec.args) - len(defaults)
+    args = spec.args[:firstdefault]
+    kwargs = dict(zip(spec.args[firstdefault:], defaults))
+    return args, kwargs
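`get_spec` works on any plain function with defaults, not just `re.match`; for example (illustrative):

    def fetch(url, retries=3, timeout=30):
        pass

    # get_spec(fetch) -> (['url'], {'retries': 3, 'timeout': 30})

Note that `inspect.getargspec` dates the code: it was deprecated in Python 3.0 and removed in 3.11.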