SEP-017 contracts
* load contracts from settings
* refactored contracts manager
* fixed callback bug, which sometimes caused responses to be evaluated with the wrong callback
* "returns" contract
This commit is contained in:
parent 99b76eaa2c
commit 901987154e
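For orientation before the diff: the syntax this commit wires together lets a spider declare checks in its callback docstrings. Below is a minimal, hypothetical spider using the three default contracts added by this commit — the class, method and field names are illustrative, not part of the commit, and the BaseSpider import path assumes the scrapy of this era:

from scrapy.spider import BaseSpider

class SampleSpider(BaseSpider):
    name = 'sample'

    def parse_page(self, response):
        """ must return exactly one item that scrapes both fields
        @url http://scrapy.org
        @returns items, 1
        @scrapes page_name, page_body
        """
        pass  # body elided; the directives above drive the check command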
scrapy/commands/check.py

@@ -1,9 +1,21 @@
 from functools import wraps
 
+from scrapy.conf import settings
 from scrapy.command import ScrapyCommand
-from scrapy.http import Request
+from scrapy.contracts import ContractsManager
 from scrapy.utils import display
+from scrapy.utils.misc import load_object
 from scrapy.utils.spider import iterate_spider_output
 
-from scrapy.contracts import Contract
+
+def _generate(cb):
+    """ create a callback which does not return anything """
+    @wraps(cb)
+    def wrapper(response):
+        output = cb(response)
+        output = list(iterate_spider_output(output))
+        # display.pprint(output)
+    return wrapper
+
 
 class Command(ScrapyCommand):
     requires_project = True
@@ -15,6 +27,17 @@ class Command(ScrapyCommand):
         return "Check contracts for given spider"
 
     def run(self, args, opts):
+        self.conman = ContractsManager()
+
+        # load contracts
+        contracts = settings['SPIDER_CONTRACTS_BASE'] + \
+            settings['SPIDER_CONTRACTS']
+
+        for contract in contracts:
+            concls = load_object(contract)
+            self.conman.register(concls)
+
         # schedule requests
         self.crawler.engine.has_capacity = lambda: True
 
         for spider in args or self.crawler.spiders.list():
@@ -22,29 +45,19 @@ class Command(ScrapyCommand):
             requests = self.get_requests(spider)
             self.crawler.crawl(spider, requests)
 
         # start checks
         self.crawler.start()
 
     def get_requests(self, spider):
         requests = []
 
-        for key, value in vars(type(spider)).iteritems():
+        for key, value in vars(type(spider)).items():
             if callable(value) and value.__doc__:
                 bound_method = value.__get__(spider, type(spider))
-                request = Request(url='http://scrapy.org', callback=bound_method)
+                request = self.conman.from_method(bound_method)
 
-                # register contract hooks to the request
-                contracts = Contract.from_method(value)
-                for contract in contracts:
-                    request = contract.prepare_request(request)
-
-                # discard anything the request might return
-                cb = request.callback
-                @wraps(cb)
-                def wrapper(response):
-                    cb(response)
-
-                request.callback = wrapper
-
-                requests.append(request)
+                if request:
+                    request.callback = _generate(request.callback)
+                    requests.append(request)
 
         return requests
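The run() method above composes three reusable pieces: contract classes are named by dotted path in settings, resolved with load_object, and registered on a ContractsManager. A minimal sketch of that same loading pattern in isolation, using the paths from the SPIDER_CONTRACTS_BASE default added later in this commit:

from scrapy.utils.misc import load_object
from scrapy.contracts import ContractsManager

conman = ContractsManager()
for path in ('scrapy.contracts.default.UrlContract',
             'scrapy.contracts.default.ReturnsContract',
             'scrapy.contracts.default.ScrapesContract'):
    conman.register(load_object(path))
# conman.registered now maps 'url', 'returns' and 'scrapes' to their classes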
scrapy/contracts/__init__.py

@@ -1,2 +1,76 @@
-from .base import Contract, ContractType
-from .default import *
+import re
+import inspect
+from functools import wraps
+
+from scrapy.http import Request
+from scrapy.utils.spider import iterate_spider_output
+from scrapy.utils.misc import get_spec
+from scrapy.exceptions import ContractFail
+
+
+class ContractsManager(object):
+    registered = {}
+
+    def register(self, contract):
+        self.registered[contract.name] = contract
+
+    def extract_contracts(self, method):
+        contracts = []
+        for line in method.__doc__.split('\n'):
+            line = line.strip()
+
+            if line.startswith('@'):
+                name, args = re.match(r'@(\w+)\s*(.*)', line).groups()
+                args = re.split(r'\s*\,\s*', args)
+
+                contracts.append(self.registered[name](method, *args))
+
+        return contracts
+
+    def from_method(self, method):
+        contracts = self.extract_contracts(method)
+        if contracts:
+            # calculate request args
+            args = get_spec(Request.__init__)[1]
+            args['callback'] = method
+            for contract in contracts:
+                args = contract.adjust_request_args(args)
+
+            # create and prepare request
+            assert 'url' in args, "Method '%s' does not have an url contract" % method.__name__
+            request = Request(**args)
+            for contract in contracts:
+                request = contract.prepare_request(request)
+
+            return request
+
+
+class Contract(object):
+    """ Abstract class for contracts """
+
+    def __init__(self, method, *args):
+        self.method = method
+        self.args = args
+
+    def prepare_request(self, request):
+        cb = request.callback
+        @wraps(cb)
+        def wrapper(response):
+            self.pre_process(response)
+            output = list(iterate_spider_output(cb(response)))
+            self.post_process(output)
+            return output
+
+        request.callback = wrapper
+        request = self.modify_request(request)
+        return request
+
+    def adjust_request_args(self, args):
+        return args
+
+    def modify_request(self, request):
+        return request
+
+    def pre_process(self, response):
+        pass
+
+    def post_process(self, output):
+        pass
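To make extract_contracts concrete: each docstring line beginning with '@' is split by the two regexes above into a contract name and its argument list. A quick trace, whose values follow mechanically from the patterns:

import re

line = '@returns items, 1+'
name, args = re.match(r'@(\w+)\s*(.*)', line).groups()
# name == 'returns', args == 'items, 1+'
args = re.split(r'\s*\,\s*', args)
# args == ['items', '1+']
# -> instantiates self.registered['returns'](method, 'items', '1+')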
scrapy/contracts/base.py (deleted)

@@ -1,74 +0,0 @@
-import re
-from functools import wraps
-
-from scrapy.utils.spider import iterate_spider_output
-
-
-class ContractType(type):
-    """ Metaclass for contracts
-    - automatically registers contracts in the root `Contract` class
-    """
-
-    def __new__(meta, name, bases, dct):
-        # only allow single inheritance
-        assert len(bases) == 1, 'Multiple inheritance is not allowed'
-        base = bases[0]
-
-        # ascend in inheritance chain
-        while type(base) not in [type, meta]:
-            base = type(base)
-
-        # register this as a valid contract
-        cls = type.__new__(meta, name, bases, dct)
-        if type(base) != type:
-            base.registered[cls.name] = cls
-        return cls
-
-
-class Contract(object):
-    """ Abstract class for contracts
-    - keeps a reference of all derived classes in `registered`
-    """
-
-    __metaclass__ = ContractType
-    registered = {}
-
-    def __init__(self, method, *args):
-        self.method = method
-        self.args = args
-
-    @classmethod
-    def from_method(cls, method):
-        contracts = []
-        for line in method.__doc__.split('\n'):
-            line = line.strip()
-
-            if line.startswith('@'):
-                name, args = re.match(r'@(\w+)\s*(.*)', line).groups()
-                args = re.split(r'[\,\s+]', args)
-                args = filter(lambda x: x, args)
-
-                contracts.append(cls.registered[name](method, *args))
-
-        return contracts
-
-    def prepare_request(self, request):
-        cb = request.callback
-        @wraps(cb)
-        def wrapper(response):
-            self.pre_process(response)
-            output = list(iterate_spider_output(cb(response)))
-            self.post_process(output)
-            return output
-
-        request.callback = wrapper
-        request = self.modify_request(request)
-        return request
-
-    def modify_request(self, request):
-        return request
-
-    def pre_process(self, response):
-        pass
-
-    def post_process(self, output):
-        pass
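For reference, the deleted metaclass made registration implicit: merely defining a concrete subclass of Contract added it to Contract.registered. A minimal sketch of the pattern being removed (TitleContract is hypothetical; Python 2 metaclass semantics, as in the deleted module):

class TitleContract(Contract):
    name = 'title'

# ContractType.__new__ has already run at class-definition time:
assert Contract.registered['title'] is TitleContract

The new ContractsManager makes this step explicit via register().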
scrapy/contracts/default.py

@@ -1,23 +1,77 @@
 from scrapy.item import BaseItem
 from scrapy.http import Request
+from scrapy.exceptions import ContractFail
 
-from .base import Contract
+from . import Contract
 
 
 # contracts
 class UrlContract(Contract):
     """ Contract to set the url of the request (mandatory)
     @url http://scrapy.org
     """
 
     name = 'url'
 
-    def modify_request(self, request):
-        return request.replace(url=self.args[0])
+    def adjust_request_args(self, args):
+        args['url'] = self.args[0]
+        return args
 
 
-class ReturnsRequestContract(Contract):
-    name = 'returns_request'
+class ReturnsContract(Contract):
+    """ Contract to check the output of a callback
+    @returns items, 1
+    @returns requests, 1+
+    """
+
+    name = 'returns'
+    objects = {
+        'requests': Request,
+        'items': BaseItem,
+    }
+
+    def __init__(self, *args, **kwargs):
+        super(ReturnsContract, self).__init__(*args, **kwargs)
+
+        if len(self.args) != 2:
+            raise ContractFail("Returns Contract must have two arguments")
+        self.obj_name, self.raw_num = self.args
+
+        # validate input
+        self.obj_type = self.objects[self.obj_name]
+
+        self.modifier = self.raw_num[-1]
+        if self.modifier in ['+', '-']:
+            self.num = int(self.raw_num[:-1])
+        else:
+            self.num = int(self.raw_num)
+            self.modifier = None
+
+    def post_process(self, output):
+        occurrences = 0
+        for x in output:
+            if isinstance(x, self.obj_type):
+                occurrences += 1
+
+        if self.modifier == '+':
+            assertion = (occurrences >= self.num)
+        elif self.modifier == '-':
+            assertion = (occurrences <= self.num)
+        else:
+            assertion = (occurrences == self.num)
+
+        if not assertion:
+            raise ContractFail("Returned %s %s, expected %s" % \
+                (occurrences, self.obj_name, self.raw_num))
+
 
 class ScrapesContract(Contract):
     """ Contract to check presence of fields in scraped items
     @scrapes page_name, page_body
     """
     name = 'scrapes'
 
     def post_process(self, output):
         for x in output:
             if isinstance(x, BaseItem):
                 for arg in self.args:
-                    assert arg in x, '%r field is missing' % arg
+                    if not arg in x:
+                        raise ContractFail('%r field is missing' % arg)
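ReturnsContract's numeric argument supports an optional trailing modifier; the parsing in __init__ above reduces each form as follows (a trace of the code above, not new behavior):

raw_num = '1+'
modifier = raw_num[-1]
if modifier in ['+', '-']:
    num = int(raw_num[:-1])   # '1+' -> num=1, modifier='+'
else:
    num = int(raw_num)
    modifier = None
# post_process then asserts: '+' -> occurrences >= num,
# '-' -> occurrences <= num, no modifier -> occurrences == num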
scrapy/exceptions.py

@@ -50,3 +50,6 @@ class ScrapyDeprecationWarning(Warning):
     """
     pass
 
+class ContractFail(Exception):
+    """Error in constructing contracts for a method"""
+    pass
scrapy/settings/default_settings.py

@@ -241,3 +241,10 @@ WEBSERVICE_RESOURCES_BASE = {
     'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1,
     'scrapy.contrib.webservice.stats.StatsResource': 1,
 }
 
+SPIDER_CONTRACTS = []
+SPIDER_CONTRACTS_BASE = [
+    'scrapy.contracts.default.UrlContract',
+    'scrapy.contracts.default.ReturnsContract',
+    'scrapy.contracts.default.ScrapesContract',
+]
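Since SPIDER_CONTRACTS defaults to an empty list and the check command concatenates it with SPIDER_CONTRACTS_BASE, a project can enable extra contracts without touching the defaults. A hypothetical project settings.py entry (the dotted path and contract are illustrative):

SPIDER_CONTRACTS = [
    'myproject.contracts.TitleContract',  # hypothetical custom contract
]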
scrapy/utils/misc.py

@@ -1,6 +1,7 @@
 """Helper functions which don't fit anywhere else"""
 
 import re
+import inspect
 import hashlib
 from pkgutil import iter_modules
 
@@ -104,3 +105,17 @@ def md5sum(file):
         m.update(d)
     return m.hexdigest()
 
+def get_spec(func):
+    """Returns (args, kwargs) tuple for a function
+
+    >>> import re
+    >>> get_spec(re.match)
+    (['pattern', 'string'], {'flags': 0})
+    """
+    spec = inspect.getargspec(func)
+    defaults = spec.defaults or []
+
+    firstdefault = len(spec.args) - len(defaults)
+    args = spec.args[:firstdefault]
+    kwargs = dict(zip(spec.args[firstdefault:], defaults))
+    return args, kwargs
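Beyond the re.match doctest, a toy function shows how get_spec splits positional from defaulted parameters; ContractsManager.from_method relies on exactly this to seed a Request's keyword defaults:

from scrapy.utils.misc import get_spec

def f(a, b=1, c='x'):
    pass

get_spec(f)  # -> (['a'], {'b': 1, 'c': 'x'})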