mirror of https://github.com/scrapy/scrapy.git

SEP-017 contracts: first draft

Alex Cepoi 2012-08-21 02:47:35 +02:00
parent 19bcb44c25
commit 99b76eaa2c
5 changed files with 155 additions and 0 deletions

.gitignore

@@ -1,6 +1,12 @@
*.pyc
*swp
*~
_trial_temp
dropin.cache
docs/build
*egg-info
.tox
build/
dist/

scrapy/commands/check.py (new file)

@@ -0,0 +1,50 @@
from functools import wraps

from scrapy.command import ScrapyCommand
from scrapy.http import Request
from scrapy.contracts import Contract


class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return "[options] <spider>"

    def short_desc(self):
        return "Check contracts for given spider"

    def run(self, args, opts):
        self.crawler.engine.has_capacity = lambda: True

        for spider in args or self.crawler.spiders.list():
            spider = self.crawler.spiders.create(spider)
            requests = self.get_requests(spider)
            self.crawler.crawl(spider, requests)

        self.crawler.start()

    def get_requests(self, spider):
        requests = []

        for key, value in vars(type(spider)).iteritems():
            if callable(value) and value.__doc__:
                bound_method = value.__get__(spider, type(spider))
                request = Request(url='http://scrapy.org', callback=bound_method)

                # register contract hooks to the request
                contracts = Contract.from_method(value)
                for contract in contracts:
                    request = contract.prepare_request(request)

                # discard anything the request might return
                # (bind cb as a default argument so each wrapper keeps its
                # own callback instead of closing over the loop variable)
                cb = request.callback

                @wraps(cb)
                def wrapper(response, cb=cb):
                    cb(response)

                request.callback = wrapper
                requests.append(request)

        return requests
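
The module's name suggests it is exposed as the `scrapy check` subcommand; a minimal usage sketch (the spider name `example` is hypothetical):

    $ scrapy check example

Only callbacks that carry a docstring are scheduled: get_requests() scans vars(type(spider)), wraps each documented callable in a Request, and lets the contracts parsed from that docstring hook into it.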

scrapy/contracts/__init__.py (new file)
@@ -0,0 +1,2 @@
from .base import Contract, ContractType
from .default import *

scrapy/contracts/base.py (new file)

@@ -0,0 +1,74 @@
import re
from functools import wraps

from scrapy.utils.spider import iterate_spider_output


class ContractType(type):
    """ Metaclass for contracts
        - automatically registers contracts in the root `Contract` class
    """

    def __new__(meta, name, bases, dct):
        # only allow single inheritance
        assert len(bases) == 1, 'Multiple inheritance is not allowed'
        base = bases[0]

        # ascend in the inheritance chain
        while type(base) not in [type, meta]:
            base = type(base)

        # register this as a valid contract
        cls = type.__new__(meta, name, bases, dct)
        if type(base) != type:
            base.registered[cls.name] = cls
        return cls


class Contract(object):
    """ Abstract class for contracts
        - keeps a reference to all derived classes in `registered`
    """
    __metaclass__ = ContractType
    registered = {}

    def __init__(self, method, *args):
        self.method = method
        self.args = args

    @classmethod
    def from_method(cls, method):
        contracts = []
        for line in method.__doc__.split('\n'):
            line = line.strip()

            if line.startswith('@'):
                name, args = re.match(r'@(\w+)\s*(.*)', line).groups()
                # split the arguments on commas and/or whitespace
                args = re.split(r'[,\s]+', args)
                args = filter(None, args)

                contracts.append(cls.registered[name](method, *args))

        return contracts

    def prepare_request(self, request):
        cb = request.callback

        @wraps(cb)
        def wrapper(response):
            self.pre_process(response)
            output = list(iterate_spider_output(cb(response)))
            self.post_process(output)
            return output

        request.callback = wrapper
        request = self.modify_request(request)
        return request

    def modify_request(self, request):
        return request

    def pre_process(self, response):
        pass

    def post_process(self, output):
        pass
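
To make the registration flow concrete, a hedged sketch (Python 2; the contract name `dummy` and the callback `parse_foo` are hypothetical): defining a subclass of Contract triggers ContractType.__new__, which records it in Contract.registered under its `name`, and from_method() then looks contracts up by the `@name` tokens in a docstring.

    from scrapy.contracts import Contract

    class DummyContract(Contract):
        name = 'dummy'

    def parse_foo(response):
        """
        @dummy first second
        """

    # ContractType registered DummyContract under 'dummy', so
    # from_method() can instantiate it from the docstring line
    contracts = Contract.from_method(parse_foo)
    assert isinstance(contracts[0], DummyContract)
    assert contracts[0].args == ('first', 'second')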

scrapy/contracts/default.py (new file)
@@ -0,0 +1,23 @@
from scrapy.item import BaseItem

from .base import Contract


# contracts
class UrlContract(Contract):
    name = 'url'

    def modify_request(self, request):
        return request.replace(url=self.args[0])


class ReturnsRequestContract(Contract):
    name = 'returns_request'


class ScrapesContract(Contract):
    name = 'scrapes'

    def post_process(self, output):
        for x in output:
            if isinstance(x, BaseItem):
                for arg in self.args:
                    assert arg in x, '%r field is missing' % arg
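
Taken together, the default contracts let a callback declare its expected input and output in its own docstring: `@url` swaps the placeholder request URL via UrlContract.modify_request(), and `@scrapes` makes ScrapesContract.post_process() assert the named fields on every returned item. A minimal sketch of a spider this command could exercise (spider, URL and field names are hypothetical):

    from scrapy.spider import BaseSpider

    class ExampleSpider(BaseSpider):
        name = 'example'

        def parse_product(self, response):
            """ Extract product fields from a product page.

            @url http://www.example.com/product
            @scrapes name price
            """
            # ... yield items carrying `name` and `price` fields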