
removed crawlspider v2 tests

Pablo Hoffman 2011-06-03 18:26:17 -03:00
parent 03ae481cad
commit 61cc95df7c
6 changed files with 0 additions and 1002 deletions

@@ -1,94 +0,0 @@
from twisted.trial import unittest

from scrapy.http import Request
from scrapy.http import Response

from scrapy.contrib_exp.crawlspider.matchers import BaseMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlRegexMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlListMatcher

import re


class MatchersTest(unittest.TestCase):

    def setUp(self):
        pass

    def test_base_matcher(self):
        matcher = BaseMatcher()
        request = Request('http://example.com')
        response = Response('http://example.com')
        self.assertTrue(matcher.matches_request(request))
        self.assertTrue(matcher.matches_response(response))

    def test_url_matcher(self):
        matcher = UrlMatcher('http://example.com')
        request = Request('http://example.com')
        response = Response('http://example.com')
        self.failUnless(matcher.matches_request(request))
        self.failUnless(matcher.matches_response(response))

        request = Request('http://example2.com')
        response = Response('http://example2.com')
        self.failIf(matcher.matches_request(request))
        self.failIf(matcher.matches_response(response))

    def test_url_regex_matcher(self):
        matcher = UrlRegexMatcher(r'sample')
        urls = (
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
            'http://example.com/sample4.html',
            )

        for url in urls:
            request, response = Request(url), Response(url)
            self.failUnless(matcher.matches_request(request))
            self.failUnless(matcher.matches_response(response))

        matcher = UrlRegexMatcher(r'sample_fail')
        for url in urls:
            request, response = Request(url), Response(url)
            self.failIf(matcher.matches_request(request))
            self.failIf(matcher.matches_response(response))

        matcher = UrlRegexMatcher(r'SAMPLE\d+', re.IGNORECASE)
        for url in urls:
            request, response = Request(url), Response(url)
            self.failUnless(matcher.matches_request(request))
            self.failUnless(matcher.matches_response(response))

    def test_url_list_matcher(self):
        urls = (
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
            'http://example.com/sample4.html',
            )
        urls2 = (
            'http://example.com/sample5.html',
            'http://example.com/sample6.html',
            'http://example.com/sample7.html',
            'http://example.com/sample8.html',
            'http://example.com/',
            )

        matcher = UrlListMatcher(urls)

        # matching urls
        for url in urls:
            request, response = Request(url), Response(url)
            self.failUnless(matcher.matches_request(request))
            self.failUnless(matcher.matches_response(response))

        # non-matching urls
        for url in urls2:
            request, response = Request(url), Response(url)
            self.failIf(matcher.matches_request(request))
            self.failIf(matcher.matches_response(response))

@@ -1,156 +0,0 @@
from twisted.trial import unittest

from scrapy.http import Request
from scrapy.http import HtmlResponse
from scrapy.tests import get_testdata

from scrapy.contrib_exp.crawlspider.reqext import BaseSgmlRequestExtractor
from scrapy.contrib_exp.crawlspider.reqext import SgmlRequestExtractor
from scrapy.contrib_exp.crawlspider.reqext import XPathRequestExtractor


class AbstractRequestExtractorTest(unittest.TestCase):

    def _requests_equals(self, list1, list2):
        """Compares request's urls and link_text"""
        for (r1, r2) in zip(list1, list2):
            if r1.url != r2.url:
                return False
            if r1.meta['link_text'] != r2.meta['link_text']:
                return False
        # all equal
        return True


class RequestExtractorTest(AbstractRequestExtractorTest):

    def test_basic(self):
        base_url = 'http://example.org/somepage/index.html'
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/" /></p></body></html>"""

        requests = [
            Request('http://example.org/somepage/item/12.html',
                    meta={'link_text': 'Item 12'}),
            Request('http://example.org/about.html',
                    meta={'link_text': 'About us'}),
            Request('http://example.org/othercat.html',
                    meta={'link_text': 'Other category'}),
            Request('http://example.org/',
                    meta={'link_text': ''}),
            ]

        response = HtmlResponse(base_url, body=html)

        reqx = BaseSgmlRequestExtractor()  # default: tag=a, attr=href
        self.failUnless(
            self._requests_equals(requests, reqx.extract_requests(response))
            )

    def test_base_url(self):
        reqx = BaseSgmlRequestExtractor()

        html = """<html><head><title>Page title<title>
        <base href="http://otherdomain.com/base/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>"""
        response = HtmlResponse("https://example.org/p/index.html", body=html)
        reqs = reqx.extract_requests(response)
        self.failUnless(self._requests_equals(
            [Request('http://otherdomain.com/base/item/12.html',
                     meta={'link_text': 'Item 12'})], reqs), reqs)

        # base url is an absolute path and relative to host
        html = """<html><head><title>Page title<title>
        <base href="/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>"""
        response = HtmlResponse("https://example.org/p/index.html", body=html)
        reqs = reqx.extract_requests(response)
        self.failUnless(self._requests_equals(
            [Request('https://example.org/item/12.html',
                     meta={'link_text': 'Item 12'})], reqs), reqs)

        # base url has no scheme
        html = """<html><head><title>Page title<title>
        <base href="//noscheme.com/base/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>"""
        response = HtmlResponse("https://example.org/p/index.html", body=html)
        reqs = reqx.extract_requests(response)
        self.failUnless(self._requests_equals(
            [Request('https://noscheme.com/base/item/12.html',
                     meta={'link_text': 'Item 12'})], reqs), reqs)

    def test_extraction_encoding(self):
        # TODO: use own fixtures
        body = get_testdata('link_extractor', 'linkextractor_noenc.html')
        response_utf8 = HtmlResponse(url='http://example.com/utf8', body=body,
            headers={'Content-Type': ['text/html; charset=utf-8']})
        response_noenc = HtmlResponse(url='http://example.com/noenc',
                                      body=body)
        body = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(url='http://example.com/latin1',
                                       body=body)

        reqx = BaseSgmlRequestExtractor()

        self.failUnless(
            self._requests_equals(
                reqx.extract_requests(response_utf8),
                [Request(url='http://example.com/sample_%C3%B1.html',
                         meta={'link_text': ''}),
                 Request(url='http://example.com/sample_%E2%82%AC.html',
                         meta={'link_text':
                               'sample \xe2\x82\xac text'.decode('utf-8')})]
                )
            )

        self.failUnless(
            self._requests_equals(
                reqx.extract_requests(response_noenc),
                [Request(url='http://example.com/sample_%C3%B1.html',
                         meta={'link_text': ''}),
                 Request(url='http://example.com/sample_%E2%82%AC.html',
                         meta={'link_text':
                               'sample \xe2\x82\xac text'.decode('utf-8')})]
                )
            )

        self.failUnless(
            self._requests_equals(
                reqx.extract_requests(response_latin1),
                [Request(url='http://example.com/sample_%F1.html',
                         meta={'link_text': ''}),
                 Request(url='http://example.com/sample_%E1.html',
                         meta={'link_text':
                               'sample \xe1 text'.decode('latin1')})]
                )
            )


class SgmlRequestExtractorTest(AbstractRequestExtractorTest):
    pass


class XPathRequestExtractorTest(AbstractRequestExtractorTest):

    def setUp(self):
        # TODO: use own fixtures
        body = get_testdata('link_extractor', 'sgml_linkextractor.html')
        self.response = HtmlResponse(url='http://example.com/index', body=body)

    def test_restrict_xpaths(self):
        reqx = XPathRequestExtractor('//div[@id="subwrapper"]')
        self.failUnless(
            self._requests_equals(
                reqx.extract_requests(self.response),
                [Request(url='http://example.com/sample1.html',
                         meta={'link_text': ''}),
                 Request(url='http://example.com/sample2.html',
                         meta={'link_text': 'sample 2'})]
                )
            )

@@ -1,124 +0,0 @@
from twisted.internet import defer
from twisted.trial import unittest

from scrapy.http import Request
from scrapy.http import HtmlResponse
from scrapy.utils.python import equal_attributes

from scrapy.contrib_exp.crawlspider.reqext import SgmlRequestExtractor
from scrapy.contrib_exp.crawlspider.reqgen import RequestGenerator
from scrapy.contrib_exp.crawlspider.reqproc import Canonicalize
from scrapy.contrib_exp.crawlspider.reqproc import FilterDomain
from scrapy.contrib_exp.crawlspider.reqproc import FilterUrl
from scrapy.contrib_exp.crawlspider.reqproc import FilterDupes


class RequestGeneratorTest(unittest.TestCase):

    def setUp(self):
        url = 'http://example.org/somepage/index.html'
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/" /></p></body></html>"""
        self.response = HtmlResponse(url, body=html)
        self.deferred = defer.Deferred()
        self.requests = [
            Request('http://example.org/somepage/item/12.html',
                    meta={'link_text': 'Item 12'}),
            Request('http://example.org/about.html',
                    meta={'link_text': 'About us'}),
            Request('http://example.org/othercat.html',
                    meta={'link_text': 'Other category'}),
            Request('http://example.org/',
                    meta={'link_text': ''}),
            ]

    def _equal_requests_list(self, list1, list2):
        list1 = list(list1)
        list2 = list(list2)
        if not len(list1) == len(list2):
            return False
        for (req1, req2) in zip(list1, list2):
            if not equal_attributes(req1, req2, ['url']):
                return False
        return True

    def test_basic(self):
        reqgen = RequestGenerator([], [], callback=self.deferred)
        # returns generator
        requests = reqgen.generate_requests(self.response)
        self.failUnlessEqual(list(requests), [])

    def test_request_extractor(self):
        extractors = [
            SgmlRequestExtractor()
            ]
        # extract all requests
        reqgen = RequestGenerator(extractors, [], callback=self.deferred)
        requests = reqgen.generate_requests(self.response)
        self.failUnless(self._equal_requests_list(requests, self.requests))

    def test_request_processor(self):
        extractors = [
            SgmlRequestExtractor()
            ]
        processors = [
            Canonicalize(),
            FilterDupes(),
            ]
        reqgen = RequestGenerator(extractors, processors, callback=self.deferred)
        requests = reqgen.generate_requests(self.response)
        self.failUnless(self._equal_requests_list(requests, self.requests))

        # filter domain
        processors = [
            Canonicalize(),
            FilterDupes(),
            FilterDomain(deny='example.org'),
            ]
        reqgen = RequestGenerator(extractors, processors, callback=self.deferred)
        requests = reqgen.generate_requests(self.response)
        self.failUnlessEqual(list(requests), [])

        # filter url
        processors = [
            Canonicalize(),
            FilterDupes(),
            FilterUrl(deny=(r'about', r'othercat')),
            ]
        reqgen = RequestGenerator(extractors, processors, callback=self.deferred)
        requests = reqgen.generate_requests(self.response)
        self.failUnless(self._equal_requests_list(requests, [
            Request('http://example.org/somepage/item/12.html',
                    meta={'link_text': 'Item 12'}),
            Request('http://example.org/',
                    meta={'link_text': ''}),
            ]))

        processors = [
            Canonicalize(),
            FilterDupes(),
            FilterUrl(allow=r'/somepage/'),
            ]
        reqgen = RequestGenerator(extractors, processors, callback=self.deferred)
        requests = reqgen.generate_requests(self.response)
        self.failUnless(self._equal_requests_list(requests, [
            Request('http://example.org/somepage/item/12.html',
                    meta={'link_text': 'Item 12'}),
            ]))

@@ -1,144 +0,0 @@
from twisted.trial import unittest

from scrapy.http import Request

from scrapy.contrib_exp.crawlspider.reqproc import Canonicalize
from scrapy.contrib_exp.crawlspider.reqproc import FilterDomain
from scrapy.contrib_exp.crawlspider.reqproc import FilterUrl
from scrapy.contrib_exp.crawlspider.reqproc import FilterDupes

import copy


class RequestProcessorsTest(unittest.TestCase):

    def test_canonicalize_requests(self):
        urls = [
            'http://example.com/do?&b=1&a=2&c=3',
            'http://example.com/do?123,&q=a space',
            ]
        urls_after = [
            'http://example.com/do?a=2&b=1&c=3',
            'http://example.com/do?123%2C=&q=a+space',
            ]
        proc = Canonicalize()
        results = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(results, urls_after)

    def test_unique_requests(self):
        urls = [
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            ]
        urls_unique = [
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
            ]
        proc = FilterDupes()
        results = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(results, urls_unique)

        # Check custom attributes
        requests = [
            Request('http://example.com', method='GET'),
            Request('http://example.com', method='POST'),
            ]
        proc = FilterDupes('url', 'method')
        self.failUnlessEqual(len(list(proc(requests))), 2)

        proc = FilterDupes('url')
        self.failUnlessEqual(len(list(proc(requests))), 1)

    def test_filter_domain(self):
        urls = [
            'http://blah1.com/index',
            'http://blah2.com/index',
            'http://blah1.com/section',
            'http://blah2.com/section',
            ]

        proc = FilterDomain(allow=('blah1.com'), deny=('blah2.com'))
        filtered = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(filtered, [
            'http://blah1.com/index',
            'http://blah1.com/section',
            ])

        proc = FilterDomain(deny=('blah1.com', 'blah2.com'))
        filtered = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(filtered, [])

        proc = FilterDomain(allow=('blah1.com', 'blah2.com'))
        filtered = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(filtered, urls)

    def test_filter_url(self):
        urls = [
            'http://blah1.com/index',
            'http://blah2.com/index',
            'http://blah1.com/section',
            'http://blah2.com/section',
            ]

        proc = FilterUrl(allow=(r'blah1'), deny=(r'blah2'))
        filtered = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(filtered, [
            'http://blah1.com/index',
            'http://blah1.com/section',
            ])

        proc = FilterUrl(deny=('blah1', 'blah2'))
        filtered = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(filtered, [])

        proc = FilterUrl(allow=('index$', 'section$'))
        filtered = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(filtered, urls)

    def test_all_processors(self):
        urls = [
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/do?&b=1&a=2&c=3',
            'http://example.com/do?123,&q=a space',
            ]
        urls_processed = [
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
            'http://example.com/do?a=2&b=1&c=3',
            'http://example.com/do?123%2C=&q=a+space',
            ]
        processors = [
            Canonicalize(),
            FilterDupes(),
            ]

        def _process(requests):
            """Apply all processors"""
            # copy list
            processed = [copy.copy(req) for req in requests]
            for proc in processors:
                processed = proc(processed)
            return processed

        # empty requests
        results1 = [r.url for r in _process([])]
        self.failUnlessEquals(results1, [])

        # try urls
        requests = (Request(url) for url in urls)
        results2 = [r.url for r in _process(requests)]
        self.failUnlessEquals(results2, urls_processed)

@@ -1,262 +0,0 @@
from twisted.trial import unittest

from scrapy.http import HtmlResponse
from scrapy.spider import BaseSpider

from scrapy.contrib_exp.crawlspider.matchers import BaseMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlRegexMatcher
from scrapy.contrib_exp.crawlspider.rules import CompiledRule
from scrapy.contrib_exp.crawlspider.rules import Rule
from scrapy.contrib_exp.crawlspider.rules import RulesManager

from functools import partial


class RuleInitializationTest(unittest.TestCase):

    def test_fail_if_rule_null(self):
        # fail on empty rule
        self.failUnlessRaises(ValueError, Rule)
        self.failUnlessRaises(ValueError, Rule,
                              **dict(callback=None, follow=None))
        self.failUnlessRaises(ValueError, Rule,
                              **dict(callback=None, follow=False))

    def test_minimal_arguments_to_instantiation(self):
        # does not fail if callback is set
        self.failUnless(Rule(callback=lambda: True))
        # does not fail if follow is set
        self.failUnless(Rule(follow=True))

    def test_validate_default_attributes(self):
        # test null Rule
        rule = Rule(follow=True)
        self.failUnlessEqual(None, rule.matcher)
        self.failUnlessEqual(None, rule.callback)
        self.failUnlessEqual({}, rule.cb_kwargs)
        # follow defaults to False, but was set to True here
        self.failUnlessEqual(True, rule.follow)

    def test_validate_attributes_set(self):
        matcher = BaseMatcher()
        callback = lambda: True
        rule = Rule(matcher, callback, True, a=1)
        # test attributes
        self.failUnlessEqual(matcher, rule.matcher)
        self.failUnlessEqual(callback, rule.callback)
        self.failUnlessEqual({'a': 1}, rule.cb_kwargs)
        self.failUnlessEqual(True, rule.follow)


class CompiledRuleInitializationTest(unittest.TestCase):

    def test_fail_on_invalid_matcher(self):
        # pass with valid matcher
        self.failUnless(CompiledRule(BaseMatcher()),
                        "Failed CompiledRule instantiation")
        # at least needs valid matcher
        self.assertRaises(AssertionError, CompiledRule, None)
        self.assertRaises(AssertionError, CompiledRule, False)
        self.assertRaises(AssertionError, CompiledRule, True)

    def test_fail_on_invalid_callback(self):
        # pass with valid callback
        callback = lambda: True
        self.failUnless(CompiledRule(BaseMatcher(), callback))
        # pass with callback None
        self.failUnless(CompiledRule(BaseMatcher(), None))

        # assert on invalid callback
        self.assertRaises(AssertionError, CompiledRule, BaseMatcher(),
                          'myfunc')
        # numeric variable
        var = 123
        self.assertRaises(AssertionError, CompiledRule, BaseMatcher(),
                          var)

        class A:
            pass

        # random instance
        self.assertRaises(AssertionError, CompiledRule, BaseMatcher(),
                          A())

    def test_fail_on_invalid_follow_value(self):
        callback = lambda: True
        matcher = BaseMatcher()
        # pass bool
        self.failUnless(CompiledRule(matcher, callback, True))
        self.failUnless(CompiledRule(matcher, callback, False))
        # assert with non-bool
        self.assertRaises(AssertionError, CompiledRule, matcher,
                          callback, None)
        self.assertRaises(AssertionError, CompiledRule, matcher,
                          callback, 1)

    def test_validate_default_attributes(self):
        callback = lambda: True
        matcher = BaseMatcher()
        rule = CompiledRule(matcher, callback, True)
        # test attributes
        self.failUnlessEqual(matcher, rule.matcher)
        self.failUnlessEqual(callback, rule.callback)
        self.failUnlessEqual(True, rule.follow)


class RulesTest(unittest.TestCase):

    def test_rules_manager_basic(self):
        spider = BaseSpider('foo')
        response1 = HtmlResponse('http://example.org')
        response2 = HtmlResponse('http://othersite.org')

        rulesman = RulesManager([], spider)
        # should return None
        self.failIf(rulesman.get_rule_from_response(response1))
        self.failIf(rulesman.get_rule_from_response(response2))

        # rules manager with match-all rule
        rulesman = RulesManager([
            Rule(BaseMatcher(), follow=True),
            ], spider)
        # returns CompiledRule
        rule1 = rulesman.get_rule_from_response(response1)
        rule2 = rulesman.get_rule_from_response(response2)
        self.failUnless(isinstance(rule1, CompiledRule))
        self.failUnless(isinstance(rule2, CompiledRule))
        self.assert_(rule1 is rule2)
        self.failUnlessEqual(rule1.callback, None)
        self.failUnlessEqual(rule1.follow, True)

    def test_rules_manager_empty_rule(self):
        spider = BaseSpider('foo')
        response = HtmlResponse('http://example.org')
        rulesman = RulesManager([Rule(follow=True)], spider)
        rule = rulesman.get_rule_from_response(response)
        # default matcher if None: BaseMatcher
        self.failUnless(isinstance(rule.matcher, BaseMatcher))

    def test_rules_manager_default_matcher(self):
        spider = BaseSpider('foo')
        response = HtmlResponse('http://example.org')
        callback = lambda x: None
        rulesman = RulesManager([
            Rule('http://example.org', callback),
            ], spider, default_matcher=UrlMatcher)
        rule = rulesman.get_rule_from_response(response)
        self.failUnless(isinstance(rule.matcher, UrlMatcher))

    def test_rules_manager_matchers(self):
        spider = BaseSpider('foo')
        response1 = HtmlResponse('http://example.org')
        response2 = HtmlResponse('http://othersite.org')
        urlmatcher = UrlMatcher('http://example.org')
        basematcher = BaseMatcher()
        # callback needed for Rule
        callback = lambda x: None

        # fail when the matcher cannot be resolved
        self.assertRaises(ValueError, RulesManager,
                          [Rule(False, callback)], spider)
        self.assertRaises(ValueError, RulesManager,
                          [Rule(spider, callback)], spider)

        rulesman = RulesManager([
            Rule(urlmatcher, callback),
            Rule(basematcher, callback),
            ], spider)
        # response1 matches example.org
        rule1 = rulesman.get_rule_from_response(response1)
        # response2 is caught by BaseMatcher()
        rule2 = rulesman.get_rule_from_response(response2)
        self.failUnlessEqual(rule1.matcher, urlmatcher)
        self.failUnlessEqual(rule2.matcher, basematcher)

        # reverse order: BaseMatcher should match all
        rulesman = RulesManager([
            Rule(basematcher, callback),
            Rule(urlmatcher, callback),
            ], spider)
        rule1 = rulesman.get_rule_from_response(response1)
        rule2 = rulesman.get_rule_from_response(response2)
        self.failUnlessEqual(rule1.matcher, basematcher)
        self.failUnlessEqual(rule2.matcher, basematcher)
        self.failUnless(rule1 is rule2)

    def test_rules_manager_callbacks(self):
        mycallback = lambda: True
        spider = BaseSpider('foo')
        spider.parse_item = lambda: True
        response1 = HtmlResponse('http://example.org')
        response2 = HtmlResponse('http://othersite.org')

        rulesman = RulesManager([
            Rule('example', mycallback),
            Rule('othersite', 'parse_item'),
            ], spider, default_matcher=UrlRegexMatcher)
        rule1 = rulesman.get_rule_from_response(response1)
        rule2 = rulesman.get_rule_from_response(response2)
        self.failUnlessEqual(rule1.callback, mycallback)
        self.failUnlessEqual(rule2.callback, spider.parse_item)

        # fail on unknown callback
        self.assertRaises(AttributeError, RulesManager, [
            Rule(BaseMatcher(), 'mycallback')
            ], spider)
        # fail when the callback is not callable
        spider.not_callable = True
        self.assertRaises(AttributeError, RulesManager, [
            Rule(BaseMatcher(), 'not_callable')
            ], spider)

    def test_rules_manager_callback_with_arguments(self):
        spider = BaseSpider('foo')
        response = HtmlResponse('http://example.org')
        kwargs = {'a': 1}

        def myfunc(**mykwargs):
            return mykwargs

        # verify return validation
        self.failUnlessEquals(kwargs, myfunc(**kwargs))

        # test callback w/o arguments
        rulesman = RulesManager([
            Rule(BaseMatcher(), myfunc),
            ], spider)
        rule = rulesman.get_rule_from_response(response)
        # without arguments should return the same callback
        self.failUnlessEqual(rule.callback, myfunc)

        # test callback w/ arguments
        rulesman = RulesManager([
            Rule(BaseMatcher(), myfunc, **kwargs),
            ], spider)
        rule = rulesman.get_rule_from_response(response)
        # with arguments should return a partially applied callback
        self.failUnless(isinstance(rule.callback, partial))
        self.failUnlessEquals(kwargs, rule.callback())

@@ -1,222 +0,0 @@
from twisted.trial import unittest

from scrapy.http import Request
from scrapy.http import HtmlResponse
from scrapy.item import BaseItem
from scrapy.utils.spider import iterate_spider_output

# basics
from scrapy.contrib_exp.crawlspider import CrawlSpider
from scrapy.contrib_exp.crawlspider import Rule
# matchers
from scrapy.contrib_exp.crawlspider.matchers import BaseMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlRegexMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlListMatcher
# extractors
from scrapy.contrib_exp.crawlspider.reqext import SgmlRequestExtractor
# processors
from scrapy.contrib_exp.crawlspider.reqproc import Canonicalize
from scrapy.contrib_exp.crawlspider.reqproc import FilterDupes


# mock items
class Item1(BaseItem):
    pass


class Item2(BaseItem):
    pass


class Item3(BaseItem):
    pass


class CrawlSpiderTest(unittest.TestCase):

    def spider_factory(self, rules=[],
                       extractors=[], processors=[],
                       start_urls=[]):
        # mock spider
        class Spider(CrawlSpider):
            def parse_item1(self, response):
                return Item1()

            def parse_item2(self, response):
                return Item2()

            def parse_item3(self, response):
                return Item3()

            def parse_request1(self, response):
                return Request('http://example.org/request1')

            def parse_request2(self, response):
                return Request('http://example.org/request2')

        Spider.start_urls = start_urls
        Spider.rules = rules
        Spider.request_extractors = extractors
        Spider.request_processors = processors

        return Spider('foo')

    def test_start_url_auto_rule(self):
        spider = self.spider_factory()
        # zero spider rules
        self.failUnlessEqual(len(spider.rules), 0)
        self.failUnlessEqual(len(spider._rulesman._rules), 0)

        spider = self.spider_factory(start_urls=['http://example.org'])
        self.failUnlessEqual(len(spider.rules), 0)
        self.failUnlessEqual(len(spider._rulesman._rules), 1)

    def test_start_url_matcher(self):
        url = 'http://example.org'
        spider = self.spider_factory(start_urls=[url])

        response = HtmlResponse(url)
        rule = spider._rulesman.get_rule_from_response(response)
        self.failUnless(isinstance(rule.matcher, UrlListMatcher))

        response = HtmlResponse(url + '/item.html')
        rule = spider._rulesman.get_rule_from_response(response)
        self.failUnless(rule is None)

        # TODO: remove this block
        # in a previous version, get_rule returned the rule from response.request
        response.request = Request(url)
        rule = spider._rulesman.get_rule_from_response(response.request)
        self.failUnless(isinstance(rule.matcher, UrlListMatcher))
        self.failUnlessEqual(rule.follow, True)

    def test_parse_callback(self):
        response = HtmlResponse('http://example.org')
        rules = (
            Rule(BaseMatcher(), 'parse_item1'),
            )
        spider = self.spider_factory(rules)
        result = list(spider.parse(response))
        self.failUnlessEqual(len(result), 1)
        self.failUnless(isinstance(result[0], Item1))

    def test_crawling_start_url(self):
        url = 'http://example.org/'
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/" /></p></body></html>"""
        response = HtmlResponse(url, body=html)

        extractors = (SgmlRequestExtractor(), )
        spider = self.spider_factory(start_urls=[url],
                                     extractors=extractors)
        result = list(spider.parse(response))
        # 1 request extracted: example.org/
        # because only matching requests are returned
        self.failUnlessEqual(len(result), 1)

        # add a catch-all rule to extract all links
        callback = lambda x: None
        rules = [Rule(r'\.html$', callback=callback)]
        spider = self.spider_factory(rules, start_urls=[url],
                                     extractors=extractors)
        result = list(spider.parse(response))
        # 4 requests extracted:
        #   3 matching the .html pattern
        #   1 matching the start url pattern
        self.failUnlessEqual(len(result), 4)

    def test_crawling_simple_rule(self):
        url = 'http://example.org/somepage/index.html'
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/" /></p></body></html>"""
        response = HtmlResponse(url, body=html)

        rules = (
            # first response callback
            Rule(r'index\.html', 'parse_item1'),
            )
        spider = self.spider_factory(rules)
        result = list(spider.parse(response))
        # should return Item1
        self.failUnlessEqual(len(result), 1)
        self.failUnless(isinstance(result[0], Item1))

        # test request generation
        rules = (
            # first response without callback and follow flag
            Rule(r'index\.html', follow=True),
            Rule(r'(\.html|/)$', 'parse_item1'),
            )
        spider = self.spider_factory(rules)
        result = list(spider.parse(response))
        # 0 because the spider has no extractors
        self.failUnlessEqual(len(result), 0)

        extractors = (SgmlRequestExtractor(), )
        # instantiate spider with extractor
        spider = self.spider_factory(rules, extractors)
        result = list(spider.parse(response))
        # 4 requests extracted
        self.failUnlessEqual(len(result), 4)

    def test_crawling_multiple_rules(self):
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/" /></p></body></html>"""
        response = HtmlResponse('http://example.org/index.html', body=html)
        response1 = HtmlResponse('http://example.org/1.html')
        response2 = HtmlResponse('http://example.org/othercat.html')

        rules = (
            Rule(r'\d+\.html$', 'parse_item1'),
            Rule(r'othercat\.html$', 'parse_item2'),
            # follow-only rule
            Rule(r'index\.html', 'parse_item3', follow=True),
            )
        extractors = [SgmlRequestExtractor()]
        spider = self.spider_factory(rules, extractors)

        result = list(spider.parse(response))
        # 1 item, 2 requests
        self.failUnlessEqual(len(result), 3)
        # parse_item3
        self.failUnless(isinstance(result[0], Item3))

        only_requests = lambda r: isinstance(r, Request)
        requests = filter(only_requests, result[1:])
        self.failUnlessEqual(len(requests), 2)
        self.failUnless(all(requests))

        result1 = list(spider.parse(response1))
        # parse_item1
        self.failUnlessEqual(len(result1), 1)
        self.failUnless(isinstance(result1[0], Item1))

        result2 = list(spider.parse(response2))
        # parse_item2
        self.failUnlessEqual(len(result2), 1)
        self.failUnless(isinstance(result2[0], Item2))