Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-25 20:44:18 +00:00)

Commit 61cc95df7c (parent 03ae481cad): removed crawlspider v2 tests
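The deleted files exercised the experimental crawlspider v2 API: URL matchers, request extractors, request processors, and rules wired into a CrawlSpider subclass. As orientation before the diff, here is a minimal, hypothetical sketch of how those pieces composed, assembled only from constructs the removed tests use; the spider itself, its name, and its URL patterns are invented for illustration.

    # Hypothetical sketch based on the removed tests; not part of this commit.
    from scrapy.contrib_exp.crawlspider import CrawlSpider, Rule
    from scrapy.contrib_exp.crawlspider.reqext import SgmlRequestExtractor
    from scrapy.contrib_exp.crawlspider.reqproc import Canonicalize, FilterDupes

    class ExampleSpider(CrawlSpider):
        name = 'example'
        start_urls = ['http://example.org/']
        # Each Rule pairs a URL pattern (or matcher) with an optional
        # callback name and a follow flag, as in the crawlspider tests below.
        rules = [
            Rule(r'/item/\d+\.html$', 'parse_item'),
            Rule(r'index\.html$', follow=True),
        ]
        # Extractors pull candidate links out of each response; processors
        # then canonicalize and de-duplicate the resulting requests.
        request_extractors = [SgmlRequestExtractor()]
        request_processors = [Canonicalize(), FilterDupes()]

        def parse_item(self, response):
            # hypothetical item-page callback
            pass

The rules, request_extractors and request_processors attributes mirror the spider_factory helper in the crawlspider tests at the end of this diff.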
@@ -1,94 +0,0 @@
from twisted.trial import unittest

from scrapy.http import Request
from scrapy.http import Response

from scrapy.contrib_exp.crawlspider.matchers import BaseMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlRegexMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlListMatcher

import re


class MatchersTest(unittest.TestCase):

    def setUp(self):
        pass

    def test_base_matcher(self):
        matcher = BaseMatcher()

        request = Request('http://example.com')
        response = Response('http://example.com')

        self.assertTrue(matcher.matches_request(request))
        self.assertTrue(matcher.matches_response(response))

    def test_url_matcher(self):
        matcher = UrlMatcher('http://example.com')

        request = Request('http://example.com')
        response = Response('http://example.com')

        self.failUnless(matcher.matches_request(request))
        self.failUnless(matcher.matches_response(response))

        request = Request('http://example2.com')
        response = Response('http://example2.com')

        self.failIf(matcher.matches_request(request))
        self.failIf(matcher.matches_response(response))

    def test_url_regex_matcher(self):
        matcher = UrlRegexMatcher(r'sample')
        urls = (
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
            'http://example.com/sample4.html',
        )
        for url in urls:
            request, response = Request(url), Response(url)
            self.failUnless(matcher.matches_request(request))
            self.failUnless(matcher.matches_response(response))

        matcher = UrlRegexMatcher(r'sample_fail')
        for url in urls:
            request, response = Request(url), Response(url)
            self.failIf(matcher.matches_request(request))
            self.failIf(matcher.matches_response(response))

        matcher = UrlRegexMatcher(r'SAMPLE\d+', re.IGNORECASE)
        for url in urls:
            request, response = Request(url), Response(url)
            self.failUnless(matcher.matches_request(request))
            self.failUnless(matcher.matches_response(response))

    def test_url_list_matcher(self):
        urls = (
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
            'http://example.com/sample4.html',
        )
        urls2 = (
            'http://example.com/sample5.html',
            'http://example.com/sample6.html',
            'http://example.com/sample7.html',
            'http://example.com/sample8.html',
            'http://example.com/',
        )
        matcher = UrlListMatcher(urls)

        # matching urls
        for url in urls:
            request, response = Request(url), Response(url)
            self.failUnless(matcher.matches_request(request))
            self.failUnless(matcher.matches_response(response))

        # non-matching urls
        for url in urls2:
            request, response = Request(url), Response(url)
            self.failIf(matcher.matches_request(request))
            self.failIf(matcher.matches_response(response))
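As a side note, the interface the tests above exercise (matches_request and matches_response returning a boolean) suggests how a custom matcher could be written. A hypothetical sketch, assuming custom matchers simply subclass BaseMatcher and override those two methods:

    # Hypothetical example; only the matches_request/matches_response
    # interface shown in the tests above is assumed.
    from scrapy.contrib_exp.crawlspider.matchers import BaseMatcher

    class HttpsOnlyMatcher(BaseMatcher):
        """Match only requests and responses whose URL uses https."""

        def matches_request(self, request):
            return request.url.startswith('https://')

        def matches_response(self, response):
            return response.url.startswith('https://')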
@@ -1,156 +0,0 @@
from twisted.trial import unittest

from scrapy.http import Request
from scrapy.http import HtmlResponse
from scrapy.tests import get_testdata

from scrapy.contrib_exp.crawlspider.reqext import BaseSgmlRequestExtractor
from scrapy.contrib_exp.crawlspider.reqext import SgmlRequestExtractor
from scrapy.contrib_exp.crawlspider.reqext import XPathRequestExtractor


class AbstractRequestExtractorTest(unittest.TestCase):

    def _requests_equals(self, list1, list2):
        """Compare requests' urls and link_text"""
        for (r1, r2) in zip(list1, list2):
            if r1.url != r2.url:
                return False
            if r1.meta['link_text'] != r2.meta['link_text']:
                return False
        # all equal
        return True


class RequestExtractorTest(AbstractRequestExtractorTest):

    def test_basic(self):
        base_url = 'http://example.org/somepage/index.html'
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/" /></p></body></html>"""
        requests = [
            Request('http://example.org/somepage/item/12.html',
                    meta={'link_text': 'Item 12'}),
            Request('http://example.org/about.html',
                    meta={'link_text': 'About us'}),
            Request('http://example.org/othercat.html',
                    meta={'link_text': 'Other category'}),
            Request('http://example.org/',
                    meta={'link_text': ''}),
        ]

        response = HtmlResponse(base_url, body=html)
        reqx = BaseSgmlRequestExtractor()  # default: tag=a, attr=href

        self.failUnless(
            self._requests_equals(requests, reqx.extract_requests(response))
        )

    def test_base_url(self):
        reqx = BaseSgmlRequestExtractor()

        html = """<html><head><title>Page title<title>
        <base href="http://otherdomain.com/base/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>"""
        response = HtmlResponse("https://example.org/p/index.html", body=html)
        reqs = reqx.extract_requests(response)
        self.failUnless(self._requests_equals(
            [Request('http://otherdomain.com/base/item/12.html',
                     meta={'link_text': 'Item 12'})], reqs), reqs)

        # base url is an absolute path and relative to host
        html = """<html><head><title>Page title<title>
        <base href="/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>"""
        response = HtmlResponse("https://example.org/p/index.html", body=html)
        reqs = reqx.extract_requests(response)
        self.failUnless(self._requests_equals(
            [Request('https://example.org/item/12.html',
                     meta={'link_text': 'Item 12'})], reqs), reqs)

        # base url has no scheme
        html = """<html><head><title>Page title<title>
        <base href="//noscheme.com/base/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>"""
        response = HtmlResponse("https://example.org/p/index.html", body=html)
        reqs = reqx.extract_requests(response)
        self.failUnless(self._requests_equals(
            [Request('https://noscheme.com/base/item/12.html',
                     meta={'link_text': 'Item 12'})], reqs), reqs)

    def test_extraction_encoding(self):
        # TODO: use own fixtures
        body = get_testdata('link_extractor', 'linkextractor_noenc.html')
        response_utf8 = HtmlResponse(url='http://example.com/utf8', body=body,
            headers={'Content-Type': ['text/html; charset=utf-8']})
        response_noenc = HtmlResponse(url='http://example.com/noenc',
                                      body=body)
        body = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(url='http://example.com/latin1',
                                       body=body)

        reqx = BaseSgmlRequestExtractor()
        self.failUnless(
            self._requests_equals(
                reqx.extract_requests(response_utf8),
                [Request(url='http://example.com/sample_%C3%B1.html',
                         meta={'link_text': ''}),
                 Request(url='http://example.com/sample_%E2%82%AC.html',
                         meta={'link_text':
                               'sample \xe2\x82\xac text'.decode('utf-8')})]
            )
        )

        self.failUnless(
            self._requests_equals(
                reqx.extract_requests(response_noenc),
                [Request(url='http://example.com/sample_%C3%B1.html',
                         meta={'link_text': ''}),
                 Request(url='http://example.com/sample_%E2%82%AC.html',
                         meta={'link_text':
                               'sample \xe2\x82\xac text'.decode('utf-8')})]
            )
        )

        self.failUnless(
            self._requests_equals(
                reqx.extract_requests(response_latin1),
                [Request(url='http://example.com/sample_%F1.html',
                         meta={'link_text': ''}),
                 Request(url='http://example.com/sample_%E1.html',
                         meta={'link_text':
                               'sample \xe1 text'.decode('latin1')})]
            )
        )


class SgmlRequestExtractorTest(AbstractRequestExtractorTest):
    pass


class XPathRequestExtractorTest(AbstractRequestExtractorTest):

    def setUp(self):
        # TODO: use own fixtures
        body = get_testdata('link_extractor', 'sgml_linkextractor.html')
        self.response = HtmlResponse(url='http://example.com/index', body=body)

    def test_restrict_xpaths(self):
        reqx = XPathRequestExtractor('//div[@id="subwrapper"]')
        self.failUnless(
            self._requests_equals(
                reqx.extract_requests(self.response),
                [Request(url='http://example.com/sample1.html',
                         meta={'link_text': ''}),
                 Request(url='http://example.com/sample2.html',
                         meta={'link_text': 'sample 2'})]
            )
        )
@@ -1,124 +0,0 @@
from twisted.internet import defer
from twisted.trial import unittest

from scrapy.http import Request
from scrapy.http import HtmlResponse
from scrapy.utils.python import equal_attributes

from scrapy.contrib_exp.crawlspider.reqext import SgmlRequestExtractor
from scrapy.contrib_exp.crawlspider.reqgen import RequestGenerator
from scrapy.contrib_exp.crawlspider.reqproc import Canonicalize
from scrapy.contrib_exp.crawlspider.reqproc import FilterDomain
from scrapy.contrib_exp.crawlspider.reqproc import FilterUrl
from scrapy.contrib_exp.crawlspider.reqproc import FilterDupes


class RequestGeneratorTest(unittest.TestCase):

    def setUp(self):
        url = 'http://example.org/somepage/index.html'
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/" /></p></body></html>"""

        self.response = HtmlResponse(url, body=html)
        self.deferred = defer.Deferred()
        self.requests = [
            Request('http://example.org/somepage/item/12.html',
                    meta={'link_text': 'Item 12'}),
            Request('http://example.org/about.html',
                    meta={'link_text': 'About us'}),
            Request('http://example.org/othercat.html',
                    meta={'link_text': 'Other category'}),
            Request('http://example.org/',
                    meta={'link_text': ''}),
        ]

    def _equal_requests_list(self, list1, list2):
        list1 = list(list1)
        list2 = list(list2)
        if not len(list1) == len(list2):
            return False

        for (req1, req2) in zip(list1, list2):
            if not equal_attributes(req1, req2, ['url']):
                return False
        return True

    def test_basic(self):
        reqgen = RequestGenerator([], [], callback=self.deferred)
        # returns generator
        requests = reqgen.generate_requests(self.response)
        self.failUnlessEqual(list(requests), [])

    def test_request_extractor(self):
        extractors = [
            SgmlRequestExtractor()
        ]

        # extract all requests
        reqgen = RequestGenerator(extractors, [], callback=self.deferred)
        requests = reqgen.generate_requests(self.response)
        self.failUnless(self._equal_requests_list(requests, self.requests))

    def test_request_processor(self):
        extractors = [
            SgmlRequestExtractor()
        ]

        processors = [
            Canonicalize(),
            FilterDupes(),
        ]

        reqgen = RequestGenerator(extractors, processors, callback=self.deferred)
        requests = reqgen.generate_requests(self.response)
        self.failUnless(self._equal_requests_list(requests, self.requests))

        # filter domain
        processors = [
            Canonicalize(),
            FilterDupes(),
            FilterDomain(deny='example.org'),
        ]

        reqgen = RequestGenerator(extractors, processors, callback=self.deferred)
        requests = reqgen.generate_requests(self.response)
        self.failUnlessEqual(list(requests), [])

        # filter url
        processors = [
            Canonicalize(),
            FilterDupes(),
            FilterUrl(deny=(r'about', r'othercat')),
        ]

        reqgen = RequestGenerator(extractors, processors, callback=self.deferred)
        requests = reqgen.generate_requests(self.response)

        self.failUnless(self._equal_requests_list(requests, [
            Request('http://example.org/somepage/item/12.html',
                    meta={'link_text': 'Item 12'}),
            Request('http://example.org/',
                    meta={'link_text': ''}),
        ]))

        processors = [
            Canonicalize(),
            FilterDupes(),
            FilterUrl(allow=r'/somepage/'),
        ]

        reqgen = RequestGenerator(extractors, processors, callback=self.deferred)
        requests = reqgen.generate_requests(self.response)

        self.failUnless(self._equal_requests_list(requests, [
            Request('http://example.org/somepage/item/12.html',
                    meta={'link_text': 'Item 12'}),
        ]))
@@ -1,144 +0,0 @@
from twisted.trial import unittest

from scrapy.http import Request

from scrapy.contrib_exp.crawlspider.reqproc import Canonicalize
from scrapy.contrib_exp.crawlspider.reqproc import FilterDomain
from scrapy.contrib_exp.crawlspider.reqproc import FilterUrl
from scrapy.contrib_exp.crawlspider.reqproc import FilterDupes

import copy


class RequestProcessorsTest(unittest.TestCase):

    def test_canonicalize_requests(self):
        urls = [
            'http://example.com/do?&b=1&a=2&c=3',
            'http://example.com/do?123,&q=a space',
        ]
        urls_after = [
            'http://example.com/do?a=2&b=1&c=3',
            'http://example.com/do?123%2C=&q=a+space',
        ]

        proc = Canonicalize()
        results = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(results, urls_after)

    def test_unique_requests(self):
        urls = [
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
        ]
        urls_unique = [
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
        ]

        proc = FilterDupes()
        results = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(results, urls_unique)

        # check custom attributes
        requests = [
            Request('http://example.com', method='GET'),
            Request('http://example.com', method='POST'),
        ]
        proc = FilterDupes('url', 'method')
        self.failUnlessEqual(len(list(proc(requests))), 2)

        proc = FilterDupes('url')
        self.failUnlessEqual(len(list(proc(requests))), 1)

    def test_filter_domain(self):
        urls = [
            'http://blah1.com/index',
            'http://blah2.com/index',
            'http://blah1.com/section',
            'http://blah2.com/section',
        ]

        proc = FilterDomain(allow=('blah1.com'), deny=('blah2.com'))
        filtered = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(filtered, [
            'http://blah1.com/index',
            'http://blah1.com/section',
        ])

        proc = FilterDomain(deny=('blah1.com', 'blah2.com'))
        filtered = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(filtered, [])

        proc = FilterDomain(allow=('blah1.com', 'blah2.com'))
        filtered = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(filtered, urls)

    def test_filter_url(self):
        urls = [
            'http://blah1.com/index',
            'http://blah2.com/index',
            'http://blah1.com/section',
            'http://blah2.com/section',
        ]

        proc = FilterUrl(allow=(r'blah1'), deny=(r'blah2'))
        filtered = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(filtered, [
            'http://blah1.com/index',
            'http://blah1.com/section',
        ])

        proc = FilterUrl(deny=('blah1', 'blah2'))
        filtered = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(filtered, [])

        proc = FilterUrl(allow=('index$', 'section$'))
        filtered = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(filtered, urls)

    def test_all_processors(self):
        urls = [
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/do?&b=1&a=2&c=3',
            'http://example.com/do?123,&q=a space',
        ]
        urls_processed = [
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
            'http://example.com/do?a=2&b=1&c=3',
            'http://example.com/do?123%2C=&q=a+space',
        ]

        processors = [
            Canonicalize(),
            FilterDupes(),
        ]

        def _process(requests):
            """Apply all processors"""
            # copy list
            processed = [copy.copy(req) for req in requests]
            for proc in processors:
                processed = proc(processed)
            return processed

        # empty requests
        results1 = [r.url for r in _process([])]
        self.failUnlessEquals(results1, [])

        # try urls
        requests = (Request(url) for url in urls)
        results2 = [r.url for r in _process(requests)]
        self.failUnlessEquals(results2, urls_processed)
@@ -1,262 +0,0 @@
from twisted.trial import unittest

from scrapy.http import HtmlResponse
from scrapy.spider import BaseSpider
from scrapy.contrib_exp.crawlspider.matchers import BaseMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlRegexMatcher

from scrapy.contrib_exp.crawlspider.rules import CompiledRule
from scrapy.contrib_exp.crawlspider.rules import Rule
from scrapy.contrib_exp.crawlspider.rules import RulesManager

from functools import partial


class RuleInitializationTest(unittest.TestCase):

    def test_fail_if_rule_null(self):
        # fail on empty rule
        self.failUnlessRaises(ValueError, Rule)
        self.failUnlessRaises(ValueError, Rule,
                              **dict(callback=None, follow=None))
        self.failUnlessRaises(ValueError, Rule,
                              **dict(callback=None, follow=False))

    def test_minimal_arguments_to_instantiation(self):
        # does not fail if callback is set
        self.failUnless(Rule(callback=lambda: True))
        # does not fail if follow is set
        self.failUnless(Rule(follow=True))

    def test_validate_default_attributes(self):
        # test null Rule
        rule = Rule(follow=True)
        self.failUnlessEqual(None, rule.matcher)
        self.failUnlessEqual(None, rule.callback)
        self.failUnlessEqual({}, rule.cb_kwargs)
        # follow defaults to False
        self.failUnlessEqual(True, rule.follow)

    def test_validate_attributes_set(self):
        matcher = BaseMatcher()
        callback = lambda: True
        rule = Rule(matcher, callback, True, a=1)
        # test attributes
        self.failUnlessEqual(matcher, rule.matcher)
        self.failUnlessEqual(callback, rule.callback)
        self.failUnlessEqual({'a': 1}, rule.cb_kwargs)
        self.failUnlessEqual(True, rule.follow)


class CompiledRuleInitializationTest(unittest.TestCase):

    def test_fail_on_invalid_matcher(self):
        # pass with valid matcher
        self.failUnless(CompiledRule(BaseMatcher()),
                        "Failed CompiledRule instantiation")

        # at least needs a valid matcher
        self.assertRaises(AssertionError, CompiledRule, None)
        self.assertRaises(AssertionError, CompiledRule, False)
        self.assertRaises(AssertionError, CompiledRule, True)

    def test_fail_on_invalid_callback(self):
        # pass with valid callback
        callback = lambda: True
        self.failUnless(CompiledRule(BaseMatcher(), callback))
        # pass with callback None
        self.failUnless(CompiledRule(BaseMatcher(), None))

        # assert on invalid callback
        self.assertRaises(AssertionError, CompiledRule, BaseMatcher(),
                          'myfunc')

        # numeric variable
        var = 123
        self.assertRaises(AssertionError, CompiledRule, BaseMatcher(),
                          var)

        class A:
            pass

        # random instance
        self.assertRaises(AssertionError, CompiledRule, BaseMatcher(),
                          A())

    def test_fail_on_invalid_follow_value(self):
        callback = lambda: True
        matcher = BaseMatcher()
        # pass bool
        self.failUnless(CompiledRule(matcher, callback, True))
        self.failUnless(CompiledRule(matcher, callback, False))

        # assert with non-bool
        self.assertRaises(AssertionError, CompiledRule, matcher,
                          callback, None)
        self.assertRaises(AssertionError, CompiledRule, matcher,
                          callback, 1)

    def test_validate_default_attributes(self):
        callback = lambda: True
        matcher = BaseMatcher()
        rule = CompiledRule(matcher, callback, True)

        # test attributes
        self.failUnlessEqual(matcher, rule.matcher)
        self.failUnlessEqual(callback, rule.callback)
        self.failUnlessEqual(True, rule.follow)


class RulesTest(unittest.TestCase):

    def test_rules_manager_basic(self):
        spider = BaseSpider('foo')
        response1 = HtmlResponse('http://example.org')
        response2 = HtmlResponse('http://othersite.org')
        rulesman = RulesManager([], spider)

        # should return None
        self.failIf(rulesman.get_rule_from_response(response1))
        self.failIf(rulesman.get_rule_from_response(response2))

        # rules manager with a match-all rule
        rulesman = RulesManager([
            Rule(BaseMatcher(), follow=True),
        ], spider)

        # returns CompiledRule
        rule1 = rulesman.get_rule_from_response(response1)
        rule2 = rulesman.get_rule_from_response(response2)

        self.failUnless(isinstance(rule1, CompiledRule))
        self.failUnless(isinstance(rule2, CompiledRule))
        self.assert_(rule1 is rule2)
        self.failUnlessEqual(rule1.callback, None)
        self.failUnlessEqual(rule1.follow, True)

    def test_rules_manager_empty_rule(self):
        spider = BaseSpider('foo')
        response = HtmlResponse('http://example.org')

        rulesman = RulesManager([Rule(follow=True)], spider)

        rule = rulesman.get_rule_from_response(response)
        # default matcher if None: BaseMatcher
        self.failUnless(isinstance(rule.matcher, BaseMatcher))

    def test_rules_manager_default_matcher(self):
        spider = BaseSpider('foo')
        response = HtmlResponse('http://example.org')
        callback = lambda x: None

        rulesman = RulesManager([
            Rule('http://example.org', callback),
        ], spider, default_matcher=UrlMatcher)

        rule = rulesman.get_rule_from_response(response)
        self.failUnless(isinstance(rule.matcher, UrlMatcher))

    def test_rules_manager_matchers(self):
        spider = BaseSpider('foo')
        response1 = HtmlResponse('http://example.org')
        response2 = HtmlResponse('http://othersite.org')

        urlmatcher = UrlMatcher('http://example.org')
        basematcher = BaseMatcher()
        # callback needed for Rule
        callback = lambda x: None

        # test failing matcher resolution
        self.assertRaises(ValueError, RulesManager,
                          [Rule(False, callback)], spider)
        self.assertRaises(ValueError, RulesManager,
                          [Rule(spider, callback)], spider)

        rulesman = RulesManager([
            Rule(urlmatcher, callback),
            Rule(basematcher, callback),
        ], spider)

        # response1 matches example.org
        rule1 = rulesman.get_rule_from_response(response1)
        # response2 is caught by BaseMatcher()
        rule2 = rulesman.get_rule_from_response(response2)

        self.failUnlessEqual(rule1.matcher, urlmatcher)
        self.failUnlessEqual(rule2.matcher, basematcher)

        # reverse order: BaseMatcher should match all
        rulesman = RulesManager([
            Rule(basematcher, callback),
            Rule(urlmatcher, callback),
        ], spider)

        rule1 = rulesman.get_rule_from_response(response1)
        rule2 = rulesman.get_rule_from_response(response2)

        self.failUnlessEqual(rule1.matcher, basematcher)
        self.failUnlessEqual(rule2.matcher, basematcher)
        self.failUnless(rule1 is rule2)

    def test_rules_manager_callbacks(self):
        mycallback = lambda: True

        spider = BaseSpider('foo')
        spider.parse_item = lambda: True

        response1 = HtmlResponse('http://example.org')
        response2 = HtmlResponse('http://othersite.org')

        rulesman = RulesManager([
            Rule('example', mycallback),
            Rule('othersite', 'parse_item'),
        ], spider, default_matcher=UrlRegexMatcher)

        rule1 = rulesman.get_rule_from_response(response1)
        rule2 = rulesman.get_rule_from_response(response2)

        self.failUnlessEqual(rule1.callback, mycallback)
        self.failUnlessEqual(rule2.callback, spider.parse_item)

        # fail on unknown callback
        self.assertRaises(AttributeError, RulesManager, [
            Rule(BaseMatcher(), 'mycallback')
        ], spider)
        # fail if the callback is not callable
        spider.not_callable = True
        self.assertRaises(AttributeError, RulesManager, [
            Rule(BaseMatcher(), 'not_callable')
        ], spider)

    def test_rules_manager_callback_with_arguments(self):
        spider = BaseSpider('foo')
        response = HtmlResponse('http://example.org')

        kwargs = {'a': 1}

        def myfunc(**mykwargs):
            return mykwargs

        # verify return validation
        self.failUnlessEquals(kwargs, myfunc(**kwargs))

        # test callback without arguments
        rulesman = RulesManager([
            Rule(BaseMatcher(), myfunc),
        ], spider)
        rule = rulesman.get_rule_from_response(response)

        # without arguments it should return the same callback
        self.failUnlessEqual(rule.callback, myfunc)

        # test callback with arguments
        rulesman = RulesManager([
            Rule(BaseMatcher(), myfunc, **kwargs),
        ], spider)
        rule = rulesman.get_rule_from_response(response)

        # with arguments it should return a partially applied callback
        self.failUnless(isinstance(rule.callback, partial))
        self.failUnlessEquals(kwargs, rule.callback())
@@ -1,222 +0,0 @@
from twisted.trial import unittest

from scrapy.http import Request
from scrapy.http import HtmlResponse
from scrapy.item import BaseItem
from scrapy.utils.spider import iterate_spider_output

# basics
from scrapy.contrib_exp.crawlspider import CrawlSpider
from scrapy.contrib_exp.crawlspider import Rule

# matchers
from scrapy.contrib_exp.crawlspider.matchers import BaseMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlRegexMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlListMatcher

# extractors
from scrapy.contrib_exp.crawlspider.reqext import SgmlRequestExtractor

# processors
from scrapy.contrib_exp.crawlspider.reqproc import Canonicalize
from scrapy.contrib_exp.crawlspider.reqproc import FilterDupes


# mock items
class Item1(BaseItem):
    pass

class Item2(BaseItem):
    pass

class Item3(BaseItem):
    pass


class CrawlSpiderTest(unittest.TestCase):

    def spider_factory(self, rules=[],
                       extractors=[], processors=[],
                       start_urls=[]):
        # mock spider
        class Spider(CrawlSpider):
            def parse_item1(self, response):
                return Item1()

            def parse_item2(self, response):
                return Item2()

            def parse_item3(self, response):
                return Item3()

            def parse_request1(self, response):
                return Request('http://example.org/request1')

            def parse_request2(self, response):
                return Request('http://example.org/request2')

        Spider.start_urls = start_urls
        Spider.rules = rules
        Spider.request_extractors = extractors
        Spider.request_processors = processors

        return Spider('foo')

    def test_start_url_auto_rule(self):
        spider = self.spider_factory()
        # zero spider rules
        self.failUnlessEqual(len(spider.rules), 0)
        self.failUnlessEqual(len(spider._rulesman._rules), 0)

        spider = self.spider_factory(start_urls=['http://example.org'])

        self.failUnlessEqual(len(spider.rules), 0)
        self.failUnlessEqual(len(spider._rulesman._rules), 1)

    def test_start_url_matcher(self):
        url = 'http://example.org'
        spider = self.spider_factory(start_urls=[url])

        response = HtmlResponse(url)

        rule = spider._rulesman.get_rule_from_response(response)
        self.failUnless(isinstance(rule.matcher, UrlListMatcher))

        response = HtmlResponse(url + '/item.html')

        rule = spider._rulesman.get_rule_from_response(response)
        self.failUnless(rule is None)

        # TODO: remove this block
        # in a previous version get_rule returned the rule from response.request
        response.request = Request(url)
        rule = spider._rulesman.get_rule_from_response(response.request)
        self.failUnless(isinstance(rule.matcher, UrlListMatcher))
        self.failUnlessEqual(rule.follow, True)

    def test_parse_callback(self):
        response = HtmlResponse('http://example.org')
        rules = (
            Rule(BaseMatcher(), 'parse_item1'),
        )
        spider = self.spider_factory(rules)

        result = list(spider.parse(response))
        self.failUnlessEqual(len(result), 1)
        self.failUnless(isinstance(result[0], Item1))

    def test_crawling_start_url(self):
        url = 'http://example.org/'
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/" /></p></body></html>"""
        response = HtmlResponse(url, body=html)

        extractors = (SgmlRequestExtractor(), )
        spider = self.spider_factory(start_urls=[url],
                                     extractors=extractors)
        result = list(spider.parse(response))

        # 1 request extracted: example.org/
        # because only matching requests are returned
        self.failUnlessEqual(len(result), 1)

        # add a catch-all rule to extract everything
        callback = lambda x: None
        rules = [Rule(r'\.html$', callback=callback)]
        spider = self.spider_factory(rules, start_urls=[url],
                                     extractors=extractors)
        result = list(spider.parse(response))

        # 4 requests extracted:
        # 3 matching the .html pattern
        # 1 matching the start url pattern
        self.failUnlessEqual(len(result), 4)

    def test_crawling_simple_rule(self):
        url = 'http://example.org/somepage/index.html'
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/" /></p></body></html>"""

        response = HtmlResponse(url, body=html)

        rules = (
            # first response callback
            Rule(r'index\.html', 'parse_item1'),
        )
        spider = self.spider_factory(rules)
        result = list(spider.parse(response))

        # should return Item1
        self.failUnlessEqual(len(result), 1)
        self.failUnless(isinstance(result[0], Item1))

        # test request generation
        rules = (
            # first rule without callback, only the follow flag
            Rule(r'index\.html', follow=True),
            Rule(r'(\.html|/)$', 'parse_item1'),
        )
        spider = self.spider_factory(rules)
        result = list(spider.parse(response))

        # 0 because the spider has no extractors
        self.failUnlessEqual(len(result), 0)

        extractors = (SgmlRequestExtractor(), )

        # instantiate spider with extractor
        spider = self.spider_factory(rules, extractors)
        result = list(spider.parse(response))
        # 4 requests extracted
        self.failUnlessEqual(len(result), 4)

    def test_crawling_multiple_rules(self):
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/" /></p></body></html>"""

        response = HtmlResponse('http://example.org/index.html', body=html)
        response1 = HtmlResponse('http://example.org/1.html')
        response2 = HtmlResponse('http://example.org/othercat.html')

        rules = (
            Rule(r'\d+\.html$', 'parse_item1'),
            Rule(r'othercat\.html$', 'parse_item2'),
            # follow-only rule
            Rule(r'index\.html', 'parse_item3', follow=True)
        )
        extractors = [SgmlRequestExtractor()]
        spider = self.spider_factory(rules, extractors)

        result = list(spider.parse(response))
        # 1 Item, 2 Requests
        self.failUnlessEqual(len(result), 3)
        # parse_item3
        self.failUnless(isinstance(result[0], Item3))
        only_requests = lambda r: isinstance(r, Request)
        requests = filter(only_requests, result[1:])
        self.failUnlessEqual(len(requests), 2)
        self.failUnless(all(requests))

        result1 = list(spider.parse(response1))
        # parse_item1
        self.failUnlessEqual(len(result1), 1)
        self.failUnless(isinstance(result1[0], Item1))

        result2 = list(spider.parse(response2))
        # parse_item2
        self.failUnlessEqual(len(result2), 1)
        self.failUnless(isinstance(result2[0], Item2))