From 901987154eebcbaa1ce3e37b7c31892aae353d57 Mon Sep 17 00:00:00 2001
From: Alex Cepoi
Date: Fri, 24 Aug 2012 16:42:13 +0200
Subject: [PATCH] SEP-017 contracts

* load contracts from settings
* refactored contracts manager
* fixed a callback bug that sometimes caused responses to be evaluated
  with the wrong callback
* "returns" contract
---
 scrapy/commands/check.py            | 47 ++++++++++-------
 scrapy/contracts/__init__.py        | 78 ++++++++++++++++++++++++++++-
 scrapy/contracts/base.py            | 74 ---------------------------
 scrapy/contracts/default.py         | 66 +++++++++++++++++++++---
 scrapy/exceptions.py                |  3 ++
 scrapy/settings/default_settings.py |  7 +++
 scrapy/utils/misc.py                | 15 ++++++
 7 files changed, 191 insertions(+), 99 deletions(-)
 delete mode 100644 scrapy/contracts/base.py

diff --git a/scrapy/commands/check.py b/scrapy/commands/check.py
index e68add5eb..d548e472c 100644
--- a/scrapy/commands/check.py
+++ b/scrapy/commands/check.py
@@ -1,9 +1,21 @@
 from functools import wraps
 
+from scrapy.conf import settings
 from scrapy.command import ScrapyCommand
 from scrapy.http import Request
+from scrapy.contracts import ContractsManager
+from scrapy.utils import display
+from scrapy.utils.misc import load_object
+from scrapy.utils.spider import iterate_spider_output
 
-from scrapy.contracts import Contract
 
+def _generate(cb):
+    """ create a callback which does not return anything """
+    @wraps(cb)
+    def wrapper(response):
+        output = cb(response)
+        output = list(iterate_spider_output(output))
+        # display.pprint(output)
+    return wrapper
 
 class Command(ScrapyCommand):
     requires_project = True
@@ -15,6 +27,17 @@ class Command(ScrapyCommand):
         return "Check contracts for given spider"
 
     def run(self, args, opts):
+        self.conman = ContractsManager()
+
+        # load contracts
+        contracts = settings['SPIDER_CONTRACTS_BASE'] + \
+            settings['SPIDER_CONTRACTS']
+
+        for contract in contracts:
+            concls = load_object(contract)
+            self.conman.register(concls)
+
+        # schedule requests
         self.crawler.engine.has_capacity = lambda: True
 
         for spider in args or self.crawler.spiders.list():
@@ -22,29 +45,19 @@ class Command(ScrapyCommand):
             requests = self.get_requests(spider)
             self.crawler.crawl(spider, requests)
 
+        # start checks
         self.crawler.start()
 
     def get_requests(self, spider):
         requests = []
-        for key, value in vars(type(spider)).iteritems():
+        for key, value in vars(type(spider)).items():
             if callable(value) and value.__doc__:
                 bound_method = value.__get__(spider, type(spider))
-                request = Request(url='http://scrapy.org', callback=bound_method)
+                request = self.conman.from_method(bound_method)
 
-                # register contract hooks to the request
-                contracts = Contract.from_method(value)
-                for contract in contracts:
-                    request = contract.prepare_request(request)
-
-                # discard anything the request might return
-                cb = request.callback
-                @wraps(cb)
-                def wrapper(response):
-                    cb(response)
-
-                request.callback = wrapper
-
-                requests.append(request)
+                if request:
+                    request.callback = _generate(request.callback)
+                    requests.append(request)
 
         return requests

diff --git a/scrapy/contracts/__init__.py b/scrapy/contracts/__init__.py
index fb66291c7..c1f2038b6 100644
--- a/scrapy/contracts/__init__.py
+++ b/scrapy/contracts/__init__.py
@@ -1,2 +1,76 @@
-from .base import Contract, ContractType
-from .default import *
+import re
+import inspect
+from functools import wraps
+
+from scrapy.http import Request
+from scrapy.utils.spider import iterate_spider_output
+from scrapy.utils.misc import get_spec
+from scrapy.exceptions import ContractFail
+
+
+class ContractsManager(object):
+    registered = {}
+
+    def register(self, contract):
+        self.registered[contract.name] = contract
+
+    def extract_contracts(self, method):
+        contracts = []
+        for line in method.__doc__.split('\n'):
+            line = line.strip()
+
+            if line.startswith('@'):
+                name, args = re.match(r'@(\w+)\s*(.*)', line).groups()
+                args = re.split(r'\s*,\s*', args) if args else []
+
+                contracts.append(self.registered[name](method, *args))
+
+        return contracts
+
+    def from_method(self, method):
+        contracts = self.extract_contracts(method)
+        if contracts:
+            # calculate request args
+            args = get_spec(Request.__init__)[1]
+            args['callback'] = method
+            for contract in contracts:
+                args = contract.adjust_request_args(args)
+
+            # create and prepare request
+            assert 'url' in args, "Method '%s' does not have a url contract" % method.__name__
+            request = Request(**args)
+            for contract in contracts:
+                request = contract.prepare_request(request)
+
+            return request
+
+
+class Contract(object):
+    """ Abstract class for contracts """
+
+    def __init__(self, method, *args):
+        self.method = method
+        self.args = args
+
+    def prepare_request(self, request):
+        cb = request.callback
+        @wraps(cb)
+        def wrapper(response):
+            self.pre_process(response)
+            output = list(iterate_spider_output(cb(response)))
+            self.post_process(output)
+            return output
+
+        request.callback = wrapper
+        request = self.modify_request(request)
+        return request
+
+    def adjust_request_args(self, args):
+        return args
+
+    def modify_request(self, request):
+        return request
+
+    def pre_process(self, response):
+        pass
+
+    def post_process(self, output):
+        pass
diff --git a/scrapy/contracts/base.py b/scrapy/contracts/base.py
deleted file mode 100644
index f5756be31..000000000
--- a/scrapy/contracts/base.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import re
-from functools import wraps
-
-from scrapy.utils.spider import iterate_spider_output
-
-class ContractType(type):
-    """ Metaclass for contracts
-
-    automatically registers contracts in the root `Contract` class
-    """
-
-    def __new__(meta, name, bases, dct):
-        # only allow single inheritence
-        assert len(bases) == 1, 'Multiple inheritance is not allowed'
-        base = bases[0]
-
-        # ascend in inheritence chain
-        while type(base) not in [type, meta]:
-            base = type(base)
-
-        # register this as a valid contract
-        cls = type.__new__(meta, name, bases, dct)
-        if type(base) != type:
-            base.registered[cls.name] = cls
-        return cls
-
-
-class Contract(object):
-    """ Abstract class for contracts
-
-    keeps a reference of all derived classes in `registered`
-    """
-
-    __metaclass__ = ContractType
-    registered = {}
-
-    def __init__(self, method, *args):
-        self.method = method
-        self.args = args
-
-    @classmethod
-    def from_method(cls, method):
-        contracts = []
-        for line in method.__doc__.split('\n'):
-            line = line.strip()
-
-            if line.startswith('@'):
-                name, args = re.match(r'@(\w+)\s*(.*)', line).groups()
-                args = re.split(r'[\,\s+]', args)
-                args = filter(lambda x:x, args)
-
-                contracts.append(cls.registered[name](method, *args))
-
-        return contracts
-
-    def prepare_request(self, request):
-        cb = request.callback
-        @wraps(cb)
-        def wrapper(response):
-            self.pre_process(response)
-            output = list(iterate_spider_output(cb(response)))
-            self.post_process(output)
-            return output
-
-        request.callback = wrapper
-        request = self.modify_request(request)
-        return request
-
-    def modify_request(self, request):
-        return request
-
-    def pre_process(self, response):
-        pass
-
-    def post_process(self, output):
-        pass
diff --git a/scrapy/contracts/default.py b/scrapy/contracts/default.py
index 77f2e8c75..043a8902a 100644
--- a/scrapy/contracts/default.py
+++ b/scrapy/contracts/default.py
@@ -1,23 +1,77 @@
 from scrapy.item import BaseItem
+from scrapy.http import Request
+from scrapy.exceptions import ContractFail
 
-from .base import Contract
+from . import Contract
 
 
 # contracts
 class UrlContract(Contract):
+    """ Contract to set the url of the request (mandatory)
+        @url http://scrapy.org
+    """
+
     name = 'url'
 
-    def modify_request(self, request):
-        return request.replace(url=self.args[0])
+    def adjust_request_args(self, args):
+        args['url'] = self.args[0]
+        return args
+
 
-class ReturnsRequestContract(Contract):
-    name = 'returns_request'
+class ReturnsContract(Contract):
+    """ Contract to check the output of a callback
+        @returns items, 1
+        @returns requests, 1+
+    """
+
+    name = 'returns'
+    objects = {
+        'requests': Request,
+        'items': BaseItem,
+    }
+
+    def __init__(self, *args, **kwargs):
+        super(ReturnsContract, self).__init__(*args, **kwargs)
+
+        if len(self.args) != 2:
+            raise ContractFail("Returns Contract must have two arguments")
+        self.obj_name, self.raw_num = self.args
+
+        # validate input
+        self.obj_type = self.objects[self.obj_name]
+
+        self.modifier = self.raw_num[-1]
+        if self.modifier in ['+', '-']:
+            self.num = int(self.raw_num[:-1])
+        else:
+            self.num = int(self.raw_num)
+            self.modifier = None
+
+    def post_process(self, output):
+        occurrences = 0
+        for x in output:
+            if isinstance(x, self.obj_type):
+                occurrences += 1
+
+        if self.modifier == '+':
+            assertion = (occurrences >= self.num)
+        elif self.modifier == '-':
+            assertion = (occurrences <= self.num)
+        else:
+            assertion = (occurrences == self.num)
+
+        if not assertion:
+            raise ContractFail("Returned %s %s, expected %s" % \
+                (occurrences, self.obj_name, self.raw_num))
+
 
 class ScrapesContract(Contract):
+    """ Contract to check the presence of fields in scraped items
+        @scrapes page_name, page_body
+    """
+
     name = 'scrapes'
 
     def post_process(self, output):
         for x in output:
             if isinstance(x, BaseItem):
                 for arg in self.args:
-                    assert arg in x, '%r field is missing' % arg
+                    if arg not in x:
+                        raise ContractFail('%r field is missing' % arg)
diff --git a/scrapy/exceptions.py b/scrapy/exceptions.py
index 8f29a4c8d..5ecc962ba 100644
--- a/scrapy/exceptions.py
+++ b/scrapy/exceptions.py
@@ -50,3 +50,6 @@ class ScrapyDeprecationWarning(Warning):
     """
     pass
 
+class ContractFail(Exception):
+    """Raised when a contract cannot be constructed or a contract check fails"""
+    pass
diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py
index 3baf4cdb7..37a1bd689 100644
--- a/scrapy/settings/default_settings.py
+++ b/scrapy/settings/default_settings.py
@@ -241,3 +241,10 @@ WEBSERVICE_RESOURCES_BASE = {
     'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1,
     'scrapy.contrib.webservice.stats.StatsResource': 1,
 }
+
+SPIDER_CONTRACTS = []
+SPIDER_CONTRACTS_BASE = [
+    'scrapy.contracts.default.UrlContract',
+    'scrapy.contracts.default.ReturnsContract',
+    'scrapy.contracts.default.ScrapesContract',
+]
diff --git a/scrapy/utils/misc.py b/scrapy/utils/misc.py
index fe9b6d058..449923ab1 100644
--- a/scrapy/utils/misc.py
+++ b/scrapy/utils/misc.py
@@ -1,6 +1,7 @@
 """Helper functions which doesn't fit anywhere else"""
 
 import re
+import inspect
 import hashlib
 from pkgutil import iter_modules
 
@@ -104,3 +105,17 @@ def md5sum(file):
         m.update(d)
     return m.hexdigest()
 
+def get_spec(func):
+    """Returns (args, kwargs) tuple for a function
+
+    >>> import re
+    >>> get_spec(re.match)
+    (['pattern', 'string'], {'flags': 0})
+    """
""" + spec = inspect.getargspec(func) + defaults = spec.defaults or [] + + firstdefault = len(spec.args) - len(defaults) + args = spec.args[:firstdefault] + kwargs = dict(zip(spec.args[firstdefault:], defaults)) + return args, kwargs