
SEP-017 contracts

* load contracts from settings
* refactored contracts manager
* fixed callback bug, which sometimes caused responses to be evaluated with the wrong callback
* "returns" contract
Alex Cepoi 2012-08-24 16:42:13 +02:00
parent 99b76eaa2c
commit 901987154e
7 changed files with 191 additions and 99 deletions
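For context, this is roughly the annotation style the new contracts system consumes: any spider callback with a docstring is picked up by `scrapy check`, and its @-tagged lines are turned into a request plus pre/post-processing hooks. A minimal sketch (the spider, item, and field names are illustrative, not part of the commit):

from scrapy.item import Item, Field
from scrapy.spider import BaseSpider


class PageItem(Item):
    page_name = Field()


class ExampleSpider(BaseSpider):
    name = 'example'

    def parse_page(self, response):
        """ This callback is verified by `scrapy check`
        @url http://scrapy.org
        @returns items, 1
        @scrapes page_name
        """
        return PageItem(page_name='scrapy')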

scrapy/commands/check.py

@@ -1,9 +1,21 @@
 from functools import wraps

+from scrapy.conf import settings
 from scrapy.command import ScrapyCommand
 from scrapy.http import Request
-from scrapy.contracts import Contract
+from scrapy.contracts import ContractsManager
+from scrapy.utils import display
+from scrapy.utils.misc import load_object
+from scrapy.utils.spider import iterate_spider_output
+
+
+def _generate(cb):
+    """ create a callback which does not return anything """
+    @wraps(cb)
+    def wrapper(response):
+        output = cb(response)
+        output = list(iterate_spider_output(output))
+        # display.pprint(output)
+    return wrapper


 class Command(ScrapyCommand):

     requires_project = True
@@ -15,6 +27,17 @@ class Command(ScrapyCommand):
         return "Check contracts for given spider"

     def run(self, args, opts):
+        self.conman = ContractsManager()
+
+        # load contracts
+        contracts = settings['SPIDER_CONTRACTS_BASE'] + \
+            settings['SPIDER_CONTRACTS']
+        for contract in contracts:
+            concls = load_object(contract)
+            self.conman.register(concls)
+
+        # schedule requests
         self.crawler.engine.has_capacity = lambda: True

         for spider in args or self.crawler.spiders.list():
@@ -22,29 +45,19 @@ class Command(ScrapyCommand):
             requests = self.get_requests(spider)
             self.crawler.crawl(spider, requests)

+        # start checks
         self.crawler.start()

     def get_requests(self, spider):
         requests = []

-        for key, value in vars(type(spider)).iteritems():
+        for key, value in vars(type(spider)).items():
             if callable(value) and value.__doc__:
                 bound_method = value.__get__(spider, type(spider))
-                request = Request(url='http://scrapy.org', callback=bound_method)
-
-                # register contract hooks to the request
-                contracts = Contract.from_method(value)
-                for contract in contracts:
-                    request = contract.prepare_request(request)
-
-                # discard anything the request might return
-                cb = request.callback
-                @wraps(cb)
-                def wrapper(response):
-                    cb(response)
-
-                request.callback = wrapper
-                requests.append(request)
+                request = self.conman.from_method(bound_method)
+
+                if request:
+                    request.callback = _generate(request.callback)
+                    requests.append(request)

         return requests
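Roughly, the command now builds its ContractsManager from settings instead of relying on the old metaclass registry; running `scrapy check` with no arguments checks every spider, while passing spider names restricts the run. A sketch of the loading step in isolation (it assumes it executes inside a Scrapy project, so scrapy.conf.settings is populated):

from scrapy.conf import settings
from scrapy.utils.misc import load_object
from scrapy.contracts import ContractsManager

conman = ContractsManager()
for path in settings['SPIDER_CONTRACTS_BASE'] + settings['SPIDER_CONTRACTS']:
    conman.register(load_object(path))

# conman.registered now maps contract names ('url', 'returns', 'scrapes') to classes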

scrapy/contracts/__init__.py

@@ -1,2 +1,76 @@
-from .base import Contract, ContractType
-from .default import *
+import re
+import inspect
+from functools import wraps
+
+from scrapy.http import Request
+from scrapy.utils.spider import iterate_spider_output
+from scrapy.utils.misc import get_spec
+from scrapy.exceptions import ContractFail
+
+
+class ContractsManager(object):
+    registered = {}
+
+    def register(self, contract):
+        self.registered[contract.name] = contract
+
+    def extract_contracts(self, method):
+        contracts = []
+        for line in method.__doc__.split('\n'):
+            line = line.strip()
+
+            if line.startswith('@'):
+                name, args = re.match(r'@(\w+)\s*(.*)', line).groups()
+                args = re.split(r'\s*\,\s*', args)
+
+                contracts.append(self.registered[name](method, *args))
+
+        return contracts
+
+    def from_method(self, method):
+        contracts = self.extract_contracts(method)
+        if contracts:
+            # calculate request args
+            args = get_spec(Request.__init__)[1]
+            args['callback'] = method
+            for contract in contracts:
+                args = contract.adjust_request_args(args)
+
+            # create and prepare request
+            assert 'url' in args, "Method '%s' does not have an url contract" % method.__name__
+            request = Request(**args)
+            for contract in contracts:
+                request = contract.prepare_request(request)
+
+            return request
+
+
+class Contract(object):
+    """ Abstract class for contracts """
+
+    def __init__(self, method, *args):
+        self.method = method
+        self.args = args
+
+    def prepare_request(self, request):
+        cb = request.callback
+        @wraps(cb)
+        def wrapper(response):
+            self.pre_process(response)
+            output = list(iterate_spider_output(cb(response)))
+            self.post_process(output)
+            return output
+
+        request.callback = wrapper
+        request = self.modify_request(request)
+        return request
+
+    def adjust_request_args(self, args):
+        return args
+
+    def modify_request(self, request):
+        return request
+
+    def pre_process(self, response):
+        pass
+
+    def post_process(self, output):
+        pass
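To illustrate how the manager ties these pieces together, here is a hedged sketch (a standalone function stands in for a spider method, and the default contracts are registered by hand, as check.py does via settings):

from scrapy.contracts import ContractsManager
from scrapy.contracts.default import UrlContract, ReturnsContract, ScrapesContract

conman = ContractsManager()
for contract_cls in (UrlContract, ReturnsContract, ScrapesContract):
    conman.register(contract_cls)

def parse_page(response):
    """
    @url http://scrapy.org
    @returns items, 1
    """
    return []

contracts = conman.extract_contracts(parse_page)
# -> [UrlContract(parse_page, 'http://scrapy.org'), ReturnsContract(parse_page, 'items', '1')]

request = conman.from_method(parse_page)
# -> a Request for http://scrapy.org whose callback runs each contract's pre_process,
#    then parse_page itself, then each contract's post_process on the output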

scrapy/contracts/base.py (deleted)

@@ -1,74 +0,0 @@
-import re
-from functools import wraps
-
-from scrapy.utils.spider import iterate_spider_output
-
-
-class ContractType(type):
-    """ Metaclass for contracts
-        - automatically registers contracts in the root `Contract` class
-    """
-
-    def __new__(meta, name, bases, dct):
-        # only allow single inheritence
-        assert len(bases) == 1, 'Multiple inheritance is not allowed'
-        base = bases[0]
-
-        # ascend in inheritence chain
-        while type(base) not in [type, meta]:
-            base = type(base)
-
-        # register this as a valid contract
-        cls = type.__new__(meta, name, bases, dct)
-        if type(base) != type:
-            base.registered[cls.name] = cls
-
-        return cls
-
-
-class Contract(object):
-    """ Abstract class for contracts
-        - keeps a reference of all derived classes in `registered`
-    """
-    __metaclass__ = ContractType
-    registered = {}
-
-    def __init__(self, method, *args):
-        self.method = method
-        self.args = args
-
-    @classmethod
-    def from_method(cls, method):
-        contracts = []
-        for line in method.__doc__.split('\n'):
-            line = line.strip()
-
-            if line.startswith('@'):
-                name, args = re.match(r'@(\w+)\s*(.*)', line).groups()
-                args = re.split(r'[\,\s+]', args)
-                args = filter(lambda x:x, args)
-
-                contracts.append(cls.registered[name](method, *args))
-
-        return contracts
-
-    def prepare_request(self, request):
-        cb = request.callback
-        @wraps(cb)
-        def wrapper(response):
-            self.pre_process(response)
-            output = list(iterate_spider_output(cb(response)))
-            self.post_process(output)
-            return output
-
-        request.callback = wrapper
-        request = self.modify_request(request)
-        return request
-
-    def modify_request(self, request):
-        return request
-
-    def pre_process(self, response):
-        pass
-
-    def post_process(self, output):
-        pass

scrapy/contracts/default.py

@@ -1,23 +1,77 @@
 from scrapy.item import BaseItem
+from scrapy.http import Request
+
+from scrapy.exceptions import ContractFail

-from .base import Contract
+from . import Contract


 # contracts
 class UrlContract(Contract):
+    """ Contract to set the url of the request (mandatory)
+        @url http://scrapy.org
+    """
+
     name = 'url'

-    def modify_request(self, request):
-        return request.replace(url=self.args[0])
+    def adjust_request_args(self, args):
+        args['url'] = self.args[0]
+        return args


-class ReturnsRequestContract(Contract):
-    name = 'returns_request'
+class ReturnsContract(Contract):
+    """ Contract to check the output of a callback
+        @returns items, 1
+        @returns requests, 1+
+    """
+
+    name = 'returns'
+    objects = {
+        'requests': Request,
+        'items': BaseItem,
+    }
+
+    def __init__(self, *args, **kwargs):
+        super(ReturnsContract, self).__init__(*args, **kwargs)
+
+        if len(self.args) != 2:
+            raise ContractError("Returns Contract must have two arguments")
+        self.obj_name, self.raw_num = self.args
+
+        # validate input
+        self.obj_type = self.objects[self.obj_name]
+
+        self.modifier = self.raw_num[-1]
+        if self.modifier in ['+', '-']:
+            self.num = int(self.raw_num[:-1])
+        else:
+            self.num = int(self.raw_num)
+            self.modifier = None
+
+    def post_process(self, output):
+        occurences = 0
+        for x in output:
+            if isinstance(x, self.obj_type):
+                occurences += 1
+
+        if self.modifier == '+':
+            assertion = (occurences >= self.num)
+        elif self.modifier == '-':
+            assertion = (occurences <= self.num)
+        else:
+            assertion = (occurences == self.num)
+
+        if not assertion:
+            raise ContractFail("Returned %s %s, expected %s" % \
+                (occurences, self.obj_name, self.raw_num))


 class ScrapesContract(Contract):
+    """ Contract to check presence of fields in scraped items
+        @scrapes page_name, page_body
+    """
+
     name = 'scrapes'

     def post_process(self, output):
         for x in output:
             if isinstance(x, BaseItem):
                 for arg in self.args:
-                    assert arg in x, '%r field is missing' % arg
+                    if not arg in x:
+                        raise ContractFail('%r field is missing' % arg)
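The second argument of @returns is parsed in ReturnsContract.__init__ above: a bare number means "exactly", a trailing '+' means "at least", a trailing '-' means "at most". A small sketch of the behaviour at check time (the callback and counts are made up):

from scrapy.contracts.default import ReturnsContract
from scrapy.exceptions import ContractFail
from scrapy.item import Item

def callback(response):
    """ @returns items, 2+ """
    return []

contract = ReturnsContract(callback, 'items', '2+')    # num=2, modifier='+'
contract.post_process([Item(), Item(), Item()])        # three items >= 2: passes silently
try:
    contract.post_process([Item()])                    # one item < 2
except ContractFail as e:
    print(e)                                           # Returned 1 items, expected 2+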

scrapy/exceptions.py

@@ -50,3 +50,6 @@ class ScrapyDeprecationWarning(Warning):
     """
     pass

+class ContractFail(Exception):
+    """Error in constructing contracts for a method"""
+    pass
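Since ContractFail lives in scrapy.exceptions, project-level contracts can raise it from their own hooks. A hedged sketch of such a contract (the class name and header check are hypothetical, not part of this commit):

from scrapy.contracts import Contract
from scrapy.exceptions import ContractFail


class HasHeaderContract(Contract):
    """ Check that the response carries the given headers
        @has_header Content-Type
    """
    name = 'has_header'

    def pre_process(self, response):
        for header in self.args:
            if header not in response.headers:
                raise ContractFail("response is missing the %s header" % header)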

scrapy/settings/default_settings.py

@@ -241,3 +241,10 @@ WEBSERVICE_RESOURCES_BASE = {
     'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1,
     'scrapy.contrib.webservice.stats.StatsResource': 1,
 }
+
+SPIDER_CONTRACTS = []
+SPIDER_CONTRACTS_BASE = [
+    'scrapy.contracts.default.UrlContract',
+    'scrapy.contracts.default.ReturnsContract',
+    'scrapy.contracts.default.ScrapesContract',
+]
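With the stock contracts in SPIDER_CONTRACTS_BASE, a project only needs to list its own additions; check.py concatenates both lists before registering them. A sketch of the corresponding entry in a project's settings.py (the module path is hypothetical):

SPIDER_CONTRACTS = [
    'myproject.contracts.HasHeaderContract',
]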

scrapy/utils/misc.py

@@ -1,6 +1,7 @@
 """Helper functions which doesn't fit anywhere else"""

 import re
+import inspect
 import hashlib
 from pkgutil import iter_modules

@@ -104,3 +105,17 @@ def md5sum(file):
         m.update(d)
     return m.hexdigest()
+
+
+def get_spec(func):
+    """Returns (args, kwargs) tuple for a function
+    >>> import re
+    >>> get_spec(re.match)
+    (['pattern', 'string'], {'flags': 0})
+    """
+    spec = inspect.getargspec(func)
+    defaults = spec.defaults or []
+
+    firstdefault = len(spec.args) - len(defaults)
+    args = spec.args[:firstdefault]
+    kwargs = dict(zip(spec.args[firstdefault:], defaults))
+    return args, kwargs
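get_spec() is what from_method() uses to seed the Request keyword arguments before the contracts adjust them. A quick illustration (the exact defaults depend on the Request signature of the Scrapy version in use):

from scrapy.http import Request
from scrapy.utils.misc import get_spec

args, kwargs = get_spec(Request.__init__)
# args   -> parameters without defaults, e.g. ['self', 'url']
# kwargs -> the defaults from_method() starts from, e.g. {'callback': None, 'method': 'GET', ...}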