
SEP-017 contracts

* load contracts from settings
* refactored contracts manager
* fixed callback bug, which sometimes caused responses to be evaluated
with the wrong callback
* "returns" contract
Alex Cepoi 2012-08-24 16:42:13 +02:00
parent 99b76eaa2c
commit 901987154e
7 changed files with 191 additions and 99 deletions

View File

@@ -1,9 +1,21 @@
from functools import wraps

from scrapy.conf import settings
from scrapy.command import ScrapyCommand
from scrapy.http import Request
from scrapy.contracts import ContractsManager
from scrapy.utils import display
from scrapy.utils.misc import load_object
from scrapy.utils.spider import iterate_spider_output
from scrapy.contracts import Contract


def _generate(cb):
    """ create a callback which does not return anything """
    @wraps(cb)
    def wrapper(response):
        output = cb(response)
        output = list(iterate_spider_output(output))
        # display.pprint(output)
    return wrapper


class Command(ScrapyCommand):
    requires_project = True
@@ -15,6 +27,17 @@ class Command(ScrapyCommand):
        return "Check contracts for given spider"

    def run(self, args, opts):
        self.conman = ContractsManager()

        # load contracts
        contracts = settings['SPIDER_CONTRACTS_BASE'] + \
            settings['SPIDER_CONTRACTS']
        for contract in contracts:
            concls = load_object(contract)
            self.conman.register(concls)

        # schedule requests
        self.crawler.engine.has_capacity = lambda: True
        for spider in args or self.crawler.spiders.list():
@@ -22,29 +45,19 @@ class Command(ScrapyCommand):
            requests = self.get_requests(spider)
            self.crawler.crawl(spider, requests)

        # start checks
        self.crawler.start()

    def get_requests(self, spider):
        requests = []

        for key, value in vars(type(spider)).iteritems():
        for key, value in vars(type(spider)).items():
            if callable(value) and value.__doc__:
                bound_method = value.__get__(spider, type(spider))
                request = Request(url='http://scrapy.org', callback=bound_method)
                request = self.conman.from_method(bound_method)

                # register contract hooks to the request
                contracts = Contract.from_method(value)
                for contract in contracts:
                    request = contract.prepare_request(request)

                # discard anything the request might return
                cb = request.callback
                @wraps(cb)
                def wrapper(response):
                    cb(response)
                request.callback = wrapper
                requests.append(request)

                if request:
                    request.callback = _generate(request.callback)
                    requests.append(request)

        return requests
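
The callback bug mentioned in the commit message is plausibly Python's late-binding closures: the removed code defined wrapper directly inside the loop, so every wrapper looked up the same cb variable and, once the loop finished, they all called the last callback that had been assigned. Moving the wrapping into a factory function (_generate above) binds the callback per call. A minimal sketch with hypothetical names, not Scrapy code:

def make_wrappers_buggy(callbacks):
    # each wrapper closes over the loop variable cb, which is only looked up
    # when the wrapper runs -- so after the loop they all call callbacks[-1]
    wrappers = []
    for cb in callbacks:
        def wrapper(response):
            return cb(response)
        wrappers.append(wrapper)
    return wrappers


def make_wrappers_fixed(callbacks):
    # a factory freezes cb as a parameter, giving one binding per callback
    def generate(cb):
        def wrapper(response):
            return cb(response)
        return wrapper
    return [generate(cb) for cb in callbacks]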

View File

@@ -1,2 +1,76 @@
from .base import Contract, ContractType
from .default import *
import re
import inspect
from functools import wraps

from scrapy.http import Request
from scrapy.utils.spider import iterate_spider_output
from scrapy.utils.misc import get_spec
from scrapy.exceptions import ContractFail


class ContractsManager(object):
    registered = {}

    def register(self, contract):
        self.registered[contract.name] = contract

    def extract_contracts(self, method):
        contracts = []
        for line in method.__doc__.split('\n'):
            line = line.strip()

            if line.startswith('@'):
                name, args = re.match(r'@(\w+)\s*(.*)', line).groups()
                args = re.split(r'\s*\,\s*', args)

                contracts.append(self.registered[name](method, *args))

        return contracts

    def from_method(self, method):
        contracts = self.extract_contracts(method)
        if contracts:
            # calculate request args
            args = get_spec(Request.__init__)[1]
            args['callback'] = method
            for contract in contracts:
                args = contract.adjust_request_args(args)

            # create and prepare request
            assert 'url' in args, "Method '%s' does not have an url contract" % method.__name__
            request = Request(**args)
            for contract in contracts:
                request = contract.prepare_request(request)

            return request


class Contract(object):
    """ Abstract class for contracts """

    def __init__(self, method, *args):
        self.method = method
        self.args = args

    def prepare_request(self, request):
        cb = request.callback
        @wraps(cb)
        def wrapper(response):
            self.pre_process(response)
            output = list(iterate_spider_output(cb(response)))
            self.post_process(output)
            return output

        request.callback = wrapper
        request = self.modify_request(request)
        return request

    def adjust_request_args(self, args):
        return args

    def modify_request(self, request):
        return request

    def pre_process(self, response):
        pass

    def post_process(self, output):
        pass
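
The Contract base class above exposes four overridable hooks: adjust_request_args, modify_request, pre_process and post_process. As an illustration of how a user-defined contract could plug into them, here is a minimal sketch; the StatusContract name, the @status annotation and the expected-status check are hypothetical and not part of this commit:

from scrapy.contracts import Contract
from scrapy.exceptions import ContractFail


class StatusContract(Contract):
    """ Hypothetical contract: fail unless the response has the given status
        @status 200
    """
    name = 'status'

    def pre_process(self, response):
        # self.args holds the comma-separated arguments parsed from the
        # docstring line, as strings
        expected = int(self.args[0])
        if response.status != expected:
            raise ContractFail("got status %d, expected %d"
                               % (response.status, expected))

Such a class would be registered through ContractsManager.register, or listed in the SPIDER_CONTRACTS setting introduced further down in this commit.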

View File

@ -1,74 +0,0 @@
import re
from functools import wraps
from scrapy.utils.spider import iterate_spider_output
class ContractType(type):
""" Metaclass for contracts
- automatically registers contracts in the root `Contract` class
"""
def __new__(meta, name, bases, dct):
# only allow single inheritence
assert len(bases) == 1, 'Multiple inheritance is not allowed'
base = bases[0]
# ascend in inheritence chain
while type(base) not in [type, meta]:
base = type(base)
# register this as a valid contract
cls = type.__new__(meta, name, bases, dct)
if type(base) != type:
base.registered[cls.name] = cls
return cls
class Contract(object):
""" Abstract class for contracts
- keeps a reference of all derived classes in `registered`
"""
__metaclass__ = ContractType
registered = {}
def __init__(self, method, *args):
self.method = method
self.args = args
@classmethod
def from_method(cls, method):
contracts = []
for line in method.__doc__.split('\n'):
line = line.strip()
if line.startswith('@'):
name, args = re.match(r'@(\w+)\s*(.*)', line).groups()
args = re.split(r'[\,\s+]', args)
args = filter(lambda x:x, args)
contracts.append(cls.registered[name](method, *args))
return contracts
def prepare_request(self, request):
cb = request.callback
@wraps(cb)
def wrapper(response):
self.pre_process(response)
output = list(iterate_spider_output(cb(response)))
self.post_process(output)
return output
request.callback = wrapper
request = self.modify_request(request)
return request
def modify_request(self, request):
return request
def pre_process(self, response):
pass
def post_process(self, output):
pass

View File

@@ -1,23 +1,77 @@
from scrapy.item import BaseItem
from scrapy.http import Request
from scrapy.exceptions import ContractFail

from .base import Contract
from . import Contract


# contracts
class UrlContract(Contract):
    """ Contract to set the url of the request (mandatory)
        @url http://scrapy.org
    """

    name = 'url'

    def modify_request(self, request):
        return request.replace(url=self.args[0])

    def adjust_request_args(self, args):
        args['url'] = self.args[0]
        return args


class ReturnsRequestContract(Contract):
    name = 'returns_request'


class ReturnsContract(Contract):
    """ Contract to check the output of a callback
        @returns items, 1
        @returns requests, 1+
    """

    name = 'returns'
    objects = {
        'requests': Request,
        'items': BaseItem,
    }

    def __init__(self, *args, **kwargs):
        super(ReturnsContract, self).__init__(*args, **kwargs)

        if len(self.args) != 2:
            raise ContractFail("Returns Contract must have two arguments")
        self.obj_name, self.raw_num = self.args

        # validate input
        self.obj_type = self.objects[self.obj_name]

        self.modifier = self.raw_num[-1]
        if self.modifier in ['+', '-']:
            self.num = int(self.raw_num[:-1])
        else:
            self.num = int(self.raw_num)
            self.modifier = None

    def post_process(self, output):
        occurences = 0
        for x in output:
            if isinstance(x, self.obj_type):
                occurences += 1

        if self.modifier == '+':
            assertion = (occurences >= self.num)
        elif self.modifier == '-':
            assertion = (occurences <= self.num)
        else:
            assertion = (occurences == self.num)

        if not assertion:
            raise ContractFail("Returned %s %s, expected %s" % \
                (occurences, self.obj_name, self.raw_num))


class ScrapesContract(Contract):
    """ Contract to check presence of fields in scraped items
        @scrapes page_name, page_body
    """

    name = 'scrapes'

    def post_process(self, output):
        for x in output:
            if isinstance(x, BaseItem):
                for arg in self.args:
                    assert arg in x, '%r field is missing' % arg
                    if not arg in x:
                        raise ContractFail('%r field is missing' % arg)
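
Putting the three default contracts together, they are driven entirely by annotations in a spider callback's docstring. A minimal sketch of such a callback, written against the 2012-era BaseSpider/Item API; the spider, item and field names are illustrative only:

from scrapy.item import Item, Field
from scrapy.spider import BaseSpider


class DemoItem(Item):
    name = Field()
    description = Field()


class DemoSpider(BaseSpider):
    name = 'demo'

    def parse_info(self, response):
        """ The annotations below are what the check command parses
            @url http://scrapy.org
            @returns items, 1
            @scrapes name, description
        """
        item = DemoItem()
        item['name'] = 'Scrapy'
        item['description'] = 'An open source web scraping framework'
        yield item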

View File

@@ -50,3 +50,6 @@ class ScrapyDeprecationWarning(Warning):
    """
    pass


class ContractFail(Exception):
    """Error in constructing contracts for a method"""
    pass

View File

@@ -241,3 +241,10 @@ WEBSERVICE_RESOURCES_BASE = {
    'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1,
    'scrapy.contrib.webservice.stats.StatsResource': 1,
}

SPIDER_CONTRACTS = []
SPIDER_CONTRACTS_BASE = [
    'scrapy.contracts.default.UrlContract',
    'scrapy.contracts.default.ReturnsContract',
    'scrapy.contracts.default.ScrapesContract',
]
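
Because run() above simply concatenates SPIDER_CONTRACTS_BASE with SPIDER_CONTRACTS and loads each entry with load_object, a project can register additional contracts from its own settings module. A minimal sketch; the project path and contract class are hypothetical, referring to the StatusContract sketched earlier:

# settings.py of a hypothetical Scrapy project
SPIDER_CONTRACTS = [
    'myproject.contracts.StatusContract',
]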

View File

@@ -1,6 +1,7 @@
"""Helper functions which doesn't fit anywhere else"""
import re
import inspect
import hashlib
from pkgutil import iter_modules
@@ -104,3 +105,17 @@ def md5sum(file):
        m.update(d)
    return m.hexdigest()

def get_spec(func):
    """Returns an (args, kwargs) tuple for a function
    >>> import re
    >>> get_spec(re.match)
    (['pattern', 'string'], {'flags': 0})
    """
    spec = inspect.getargspec(func)
    defaults = spec.defaults or []

    firstdefault = len(spec.args) - len(defaults)
    args = spec.args[:firstdefault]
    kwargs = dict(zip(spec.args[firstdefault:], defaults))
    return args, kwargs
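
For context on how from_method above consumes this helper: get_spec(Request.__init__) yields the constructor's defaulted keyword arguments, which the contracts then adjust before the Request is built. A rough illustration; the exact parameter names and defaults depend on the Scrapy version:

from scrapy.http import Request
from scrapy.utils.misc import get_spec

args, kwargs = get_spec(Request.__init__)
# args   -> parameters without defaults, typically ['self', 'url']
# kwargs -> defaulted parameters, e.g. {'callback': None, 'method': 'GET', ...}
kwargs['callback'] = lambda response: []   # stand-in for a spider callback
kwargs['url'] = 'http://scrapy.org'        # what UrlContract.adjust_request_args does
request = Request(**kwargs)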