
removed crawlspider v2 tests

Pablo Hoffman 2011-06-03 18:26:17 -03:00
parent 03ae481cad
commit 61cc95df7c
6 changed files with 0 additions and 1002 deletions

@@ -1,94 +0,0 @@
from twisted.trial import unittest

from scrapy.http import Request
from scrapy.http import Response

from scrapy.contrib_exp.crawlspider.matchers import BaseMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlRegexMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlListMatcher

import re


class MatchersTest(unittest.TestCase):

    def setUp(self):
        pass

    def test_base_matcher(self):
        matcher = BaseMatcher()
        request = Request('http://example.com')
        response = Response('http://example.com')
        self.assertTrue(matcher.matches_request(request))
        self.assertTrue(matcher.matches_response(response))

    def test_url_matcher(self):
        matcher = UrlMatcher('http://example.com')
        request = Request('http://example.com')
        response = Response('http://example.com')
        self.failUnless(matcher.matches_request(request))
        self.failUnless(matcher.matches_response(response))

        request = Request('http://example2.com')
        response = Response('http://example2.com')
        self.failIf(matcher.matches_request(request))
        self.failIf(matcher.matches_response(response))

    def test_url_regex_matcher(self):
        matcher = UrlRegexMatcher(r'sample')
        urls = (
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
            'http://example.com/sample4.html',
            )

        for url in urls:
            request, response = Request(url), Response(url)
            self.failUnless(matcher.matches_request(request))
            self.failUnless(matcher.matches_response(response))

        matcher = UrlRegexMatcher(r'sample_fail')
        for url in urls:
            request, response = Request(url), Response(url)
            self.failIf(matcher.matches_request(request))
            self.failIf(matcher.matches_response(response))

        matcher = UrlRegexMatcher(r'SAMPLE\d+', re.IGNORECASE)
        for url in urls:
            request, response = Request(url), Response(url)
            self.failUnless(matcher.matches_request(request))
            self.failUnless(matcher.matches_response(response))

    def test_url_list_matcher(self):
        urls = (
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
            'http://example.com/sample4.html',
            )
        urls2 = (
            'http://example.com/sample5.html',
            'http://example.com/sample6.html',
            'http://example.com/sample7.html',
            'http://example.com/sample8.html',
            'http://example.com/',
            )

        matcher = UrlListMatcher(urls)

        # matching urls
        for url in urls:
            request, response = Request(url), Response(url)
            self.failUnless(matcher.matches_request(request))
            self.failUnless(matcher.matches_response(response))

        # non-matching urls
        for url in urls2:
            request, response = Request(url), Response(url)
            self.failIf(matcher.matches_request(request))
            self.failIf(matcher.matches_response(response))

@@ -1,156 +0,0 @@
from twisted.trial import unittest

from scrapy.http import Request
from scrapy.http import HtmlResponse
from scrapy.tests import get_testdata

from scrapy.contrib_exp.crawlspider.reqext import BaseSgmlRequestExtractor
from scrapy.contrib_exp.crawlspider.reqext import SgmlRequestExtractor
from scrapy.contrib_exp.crawlspider.reqext import XPathRequestExtractor


class AbstractRequestExtractorTest(unittest.TestCase):

    def _requests_equals(self, list1, list2):
        """Compares request's urls and link_text"""
        for (r1, r2) in zip(list1, list2):
            if r1.url != r2.url:
                return False
            if r1.meta['link_text'] != r2.meta['link_text']:
                return False
        # all equal
        return True


class RequestExtractorTest(AbstractRequestExtractorTest):

    def test_basic(self):
        base_url = 'http://example.org/somepage/index.html'
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/" /></p></body></html>"""

        requests = [
            Request('http://example.org/somepage/item/12.html',
                    meta={'link_text': 'Item 12'}),
            Request('http://example.org/about.html',
                    meta={'link_text': 'About us'}),
            Request('http://example.org/othercat.html',
                    meta={'link_text': 'Other category'}),
            Request('http://example.org/',
                    meta={'link_text': ''}),
            ]

        response = HtmlResponse(base_url, body=html)

        reqx = BaseSgmlRequestExtractor()  # default: tag=a, attr=href
        self.failUnless(
            self._requests_equals(requests, reqx.extract_requests(response))
            )

    def test_base_url(self):
        reqx = BaseSgmlRequestExtractor()

        html = """<html><head><title>Page title<title>
        <base href="http://otherdomain.com/base/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>"""
        response = HtmlResponse("https://example.org/p/index.html", body=html)
        reqs = reqx.extract_requests(response)
        self.failUnless(self._requests_equals(
            [Request('http://otherdomain.com/base/item/12.html',
                     meta={'link_text': 'Item 12'})], reqs), reqs)

        # base url is an absolute path and relative to host
        html = """<html><head><title>Page title<title>
        <base href="/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>"""
        response = HtmlResponse("https://example.org/p/index.html", body=html)
        reqs = reqx.extract_requests(response)
        self.failUnless(self._requests_equals(
            [Request('https://example.org/item/12.html',
                     meta={'link_text': 'Item 12'})], reqs), reqs)

        # base url has no scheme
        html = """<html><head><title>Page title<title>
        <base href="//noscheme.com/base/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>"""
        response = HtmlResponse("https://example.org/p/index.html", body=html)
        reqs = reqx.extract_requests(response)
        self.failUnless(self._requests_equals(
            [Request('https://noscheme.com/base/item/12.html',
                     meta={'link_text': 'Item 12'})], reqs), reqs)

    def test_extraction_encoding(self):
        # TODO: use own fixtures
        body = get_testdata('link_extractor', 'linkextractor_noenc.html')
        response_utf8 = HtmlResponse(url='http://example.com/utf8', body=body,
            headers={'Content-Type': ['text/html; charset=utf-8']})
        response_noenc = HtmlResponse(url='http://example.com/noenc',
                                      body=body)
        body = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(url='http://example.com/latin1',
                                       body=body)

        reqx = BaseSgmlRequestExtractor()

        self.failUnless(
            self._requests_equals(
                reqx.extract_requests(response_utf8),
                [Request(url='http://example.com/sample_%C3%B1.html',
                         meta={'link_text': ''}),
                 Request(url='http://example.com/sample_%E2%82%AC.html',
                         meta={'link_text':
                               'sample \xe2\x82\xac text'.decode('utf-8')})]
                )
            )

        self.failUnless(
            self._requests_equals(
                reqx.extract_requests(response_noenc),
                [Request(url='http://example.com/sample_%C3%B1.html',
                         meta={'link_text': ''}),
                 Request(url='http://example.com/sample_%E2%82%AC.html',
                         meta={'link_text':
                               'sample \xe2\x82\xac text'.decode('utf-8')})]
                )
            )

        self.failUnless(
            self._requests_equals(
                reqx.extract_requests(response_latin1),
                [Request(url='http://example.com/sample_%F1.html',
                         meta={'link_text': ''}),
                 Request(url='http://example.com/sample_%E1.html',
                         meta={'link_text':
                               'sample \xe1 text'.decode('latin1')})]
                )
            )


class SgmlRequestExtractorTest(AbstractRequestExtractorTest):
    pass


class XPathRequestExtractorTest(AbstractRequestExtractorTest):

    def setUp(self):
        # TODO: use own fixtures
        body = get_testdata('link_extractor', 'sgml_linkextractor.html')
        self.response = HtmlResponse(url='http://example.com/index', body=body)

    def test_restrict_xpaths(self):
        reqx = XPathRequestExtractor('//div[@id="subwrapper"]')
        self.failUnless(
            self._requests_equals(
                reqx.extract_requests(self.response),
                [Request(url='http://example.com/sample1.html',
                         meta={'link_text': ''}),
                 Request(url='http://example.com/sample2.html',
                         meta={'link_text': 'sample 2'})]
                )
            )

@@ -1,124 +0,0 @@
from twisted.internet import defer
from twisted.trial import unittest

from scrapy.http import Request
from scrapy.http import HtmlResponse
from scrapy.utils.python import equal_attributes

from scrapy.contrib_exp.crawlspider.reqext import SgmlRequestExtractor
from scrapy.contrib_exp.crawlspider.reqgen import RequestGenerator
from scrapy.contrib_exp.crawlspider.reqproc import Canonicalize
from scrapy.contrib_exp.crawlspider.reqproc import FilterDomain
from scrapy.contrib_exp.crawlspider.reqproc import FilterUrl
from scrapy.contrib_exp.crawlspider.reqproc import FilterDupes


class RequestGeneratorTest(unittest.TestCase):

    def setUp(self):
        url = 'http://example.org/somepage/index.html'
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/" /></p></body></html>"""
        self.response = HtmlResponse(url, body=html)
        self.deferred = defer.Deferred()
        self.requests = [
            Request('http://example.org/somepage/item/12.html',
                    meta={'link_text': 'Item 12'}),
            Request('http://example.org/about.html',
                    meta={'link_text': 'About us'}),
            Request('http://example.org/othercat.html',
                    meta={'link_text': 'Other category'}),
            Request('http://example.org/',
                    meta={'link_text': ''}),
            ]

    def _equal_requests_list(self, list1, list2):
        list1 = list(list1)
        list2 = list(list2)
        if not len(list1) == len(list2):
            return False
        for (req1, req2) in zip(list1, list2):
            if not equal_attributes(req1, req2, ['url']):
                return False
        return True

    def test_basic(self):
        reqgen = RequestGenerator([], [], callback=self.deferred)
        # returns generator
        requests = reqgen.generate_requests(self.response)
        self.failUnlessEqual(list(requests), [])

    def test_request_extractor(self):
        extractors = [
            SgmlRequestExtractor()
            ]
        # extract all requests
        reqgen = RequestGenerator(extractors, [], callback=self.deferred)
        requests = reqgen.generate_requests(self.response)
        self.failUnless(self._equal_requests_list(requests, self.requests))

    def test_request_processor(self):
        extractors = [
            SgmlRequestExtractor()
            ]
        processors = [
            Canonicalize(),
            FilterDupes(),
            ]
        reqgen = RequestGenerator(extractors, processors, callback=self.deferred)
        requests = reqgen.generate_requests(self.response)
        self.failUnless(self._equal_requests_list(requests, self.requests))

        # filter domain
        processors = [
            Canonicalize(),
            FilterDupes(),
            FilterDomain(deny='example.org'),
            ]
        reqgen = RequestGenerator(extractors, processors, callback=self.deferred)
        requests = reqgen.generate_requests(self.response)
        self.failUnlessEqual(list(requests), [])

        # filter url
        processors = [
            Canonicalize(),
            FilterDupes(),
            FilterUrl(deny=(r'about', r'othercat')),
            ]
        reqgen = RequestGenerator(extractors, processors, callback=self.deferred)
        requests = reqgen.generate_requests(self.response)
        self.failUnless(self._equal_requests_list(requests, [
            Request('http://example.org/somepage/item/12.html',
                    meta={'link_text': 'Item 12'}),
            Request('http://example.org/',
                    meta={'link_text': ''}),
            ]))

        processors = [
            Canonicalize(),
            FilterDupes(),
            FilterUrl(allow=r'/somepage/'),
            ]
        reqgen = RequestGenerator(extractors, processors, callback=self.deferred)
        requests = reqgen.generate_requests(self.response)
        self.failUnless(self._equal_requests_list(requests, [
            Request('http://example.org/somepage/item/12.html',
                    meta={'link_text': 'Item 12'}),
            ]))

@@ -1,144 +0,0 @@
from twisted.trial import unittest

from scrapy.http import Request

from scrapy.contrib_exp.crawlspider.reqproc import Canonicalize
from scrapy.contrib_exp.crawlspider.reqproc import FilterDomain
from scrapy.contrib_exp.crawlspider.reqproc import FilterUrl
from scrapy.contrib_exp.crawlspider.reqproc import FilterDupes

import copy


class RequestProcessorsTest(unittest.TestCase):

    def test_canonicalize_requests(self):
        urls = [
            'http://example.com/do?&b=1&a=2&c=3',
            'http://example.com/do?123,&q=a space',
            ]
        urls_after = [
            'http://example.com/do?a=2&b=1&c=3',
            'http://example.com/do?123%2C=&q=a+space',
            ]
        proc = Canonicalize()
        results = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(results, urls_after)

    def test_unique_requests(self):
        urls = [
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            ]
        urls_unique = [
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
            ]
        proc = FilterDupes()
        results = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(results, urls_unique)

        # Check custom attributes
        requests = [
            Request('http://example.com', method='GET'),
            Request('http://example.com', method='POST'),
            ]
        proc = FilterDupes('url', 'method')
        self.failUnlessEqual(len(list(proc(requests))), 2)

        proc = FilterDupes('url')
        self.failUnlessEqual(len(list(proc(requests))), 1)

    def test_filter_domain(self):
        urls = [
            'http://blah1.com/index',
            'http://blah2.com/index',
            'http://blah1.com/section',
            'http://blah2.com/section',
            ]

        proc = FilterDomain(allow=('blah1.com'), deny=('blah2.com'))
        filtered = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(filtered, [
            'http://blah1.com/index',
            'http://blah1.com/section',
            ])

        proc = FilterDomain(deny=('blah1.com', 'blah2.com'))
        filtered = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(filtered, [])

        proc = FilterDomain(allow=('blah1.com', 'blah2.com'))
        filtered = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(filtered, urls)

    def test_filter_url(self):
        urls = [
            'http://blah1.com/index',
            'http://blah2.com/index',
            'http://blah1.com/section',
            'http://blah2.com/section',
            ]

        proc = FilterUrl(allow=(r'blah1'), deny=(r'blah2'))
        filtered = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(filtered, [
            'http://blah1.com/index',
            'http://blah1.com/section',
            ])

        proc = FilterUrl(deny=('blah1', 'blah2'))
        filtered = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(filtered, [])

        proc = FilterUrl(allow=('index$', 'section$'))
        filtered = [req.url for req in proc(Request(url) for url in urls)]
        self.failUnlessEquals(filtered, urls)

    def test_all_processors(self):
        urls = [
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/do?&b=1&a=2&c=3',
            'http://example.com/do?123,&q=a space',
            ]
        urls_processed = [
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
            'http://example.com/do?a=2&b=1&c=3',
            'http://example.com/do?123%2C=&q=a+space',
            ]
        processors = [
            Canonicalize(),
            FilterDupes(),
            ]

        def _process(requests):
            """Apply all processors"""
            # copy list
            processed = [copy.copy(req) for req in requests]
            for proc in processors:
                processed = proc(processed)
            return processed

        # empty requests
        results1 = [r.url for r in _process([])]
        self.failUnlessEquals(results1, [])

        # try urls
        requests = (Request(url) for url in urls)
        results2 = [r.url for r in _process(requests)]
        self.failUnlessEquals(results2, urls_processed)

@@ -1,262 +0,0 @@
from twisted.trial import unittest

from scrapy.http import HtmlResponse
from scrapy.spider import BaseSpider

from scrapy.contrib_exp.crawlspider.matchers import BaseMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlRegexMatcher
from scrapy.contrib_exp.crawlspider.rules import CompiledRule
from scrapy.contrib_exp.crawlspider.rules import Rule
from scrapy.contrib_exp.crawlspider.rules import RulesManager

from functools import partial


class RuleInitializationTest(unittest.TestCase):

    def test_fail_if_rule_null(self):
        # fail on empty rule
        self.failUnlessRaises(ValueError, Rule)
        self.failUnlessRaises(ValueError, Rule,
                              **dict(callback=None, follow=None))
        self.failUnlessRaises(ValueError, Rule,
                              **dict(callback=None, follow=False))

    def test_minimal_arguments_to_instantiation(self):
        # does not fail if callback is set
        self.failUnless(Rule(callback=lambda: True))
        # does not fail if follow is set
        self.failUnless(Rule(follow=True))

    def test_validate_default_attributes(self):
        # test null Rule
        rule = Rule(follow=True)
        self.failUnlessEqual(None, rule.matcher)
        self.failUnlessEqual(None, rule.callback)
        self.failUnlessEqual({}, rule.cb_kwargs)
        # follow defaults to False, but was set to True here
        self.failUnlessEqual(True, rule.follow)

    def test_validate_attributes_set(self):
        matcher = BaseMatcher()
        callback = lambda: True
        rule = Rule(matcher, callback, True, a=1)
        # test attributes
        self.failUnlessEqual(matcher, rule.matcher)
        self.failUnlessEqual(callback, rule.callback)
        self.failUnlessEqual({'a': 1}, rule.cb_kwargs)
        self.failUnlessEqual(True, rule.follow)


class CompiledRuleInitializationTest(unittest.TestCase):

    def test_fail_on_invalid_matcher(self):
        # pass with valid matcher
        self.failUnless(CompiledRule(BaseMatcher()),
                        "Failed CompiledRule instantiation")
        # at least needs valid matcher
        self.assertRaises(AssertionError, CompiledRule, None)
        self.assertRaises(AssertionError, CompiledRule, False)
        self.assertRaises(AssertionError, CompiledRule, True)

    def test_fail_on_invalid_callback(self):
        # pass with valid callback
        callback = lambda: True
        self.failUnless(CompiledRule(BaseMatcher(), callback))
        # pass with callback None
        self.failUnless(CompiledRule(BaseMatcher(), None))

        # assert on invalid callback
        self.assertRaises(AssertionError, CompiledRule, BaseMatcher(),
                          'myfunc')
        # numeric variable
        var = 123
        self.assertRaises(AssertionError, CompiledRule, BaseMatcher(),
                          var)

        class A:
            pass

        # random instance
        self.assertRaises(AssertionError, CompiledRule, BaseMatcher(),
                          A())

    def test_fail_on_invalid_follow_value(self):
        callback = lambda: True
        matcher = BaseMatcher()
        # pass bool
        self.failUnless(CompiledRule(matcher, callback, True))
        self.failUnless(CompiledRule(matcher, callback, False))
        # assert with non-bool
        self.assertRaises(AssertionError, CompiledRule, matcher,
                          callback, None)
        self.assertRaises(AssertionError, CompiledRule, matcher,
                          callback, 1)

    def test_validate_default_attributes(self):
        callback = lambda: True
        matcher = BaseMatcher()
        rule = CompiledRule(matcher, callback, True)
        # test attributes
        self.failUnlessEqual(matcher, rule.matcher)
        self.failUnlessEqual(callback, rule.callback)
        self.failUnlessEqual(True, rule.follow)


class RulesTest(unittest.TestCase):

    def test_rules_manager_basic(self):
        spider = BaseSpider('foo')
        response1 = HtmlResponse('http://example.org')
        response2 = HtmlResponse('http://othersite.org')

        rulesman = RulesManager([], spider)
        # should return None
        self.failIf(rulesman.get_rule_from_response(response1))
        self.failIf(rulesman.get_rule_from_response(response2))

        # rules manager with match-all rule
        rulesman = RulesManager([
            Rule(BaseMatcher(), follow=True),
            ], spider)
        # returns CompiledRule
        rule1 = rulesman.get_rule_from_response(response1)
        rule2 = rulesman.get_rule_from_response(response2)
        self.failUnless(isinstance(rule1, CompiledRule))
        self.failUnless(isinstance(rule2, CompiledRule))
        self.assert_(rule1 is rule2)
        self.failUnlessEqual(rule1.callback, None)
        self.failUnlessEqual(rule1.follow, True)

    def test_rules_manager_empty_rule(self):
        spider = BaseSpider('foo')
        response = HtmlResponse('http://example.org')
        rulesman = RulesManager([Rule(follow=True)], spider)
        rule = rulesman.get_rule_from_response(response)
        # default matcher if None: BaseMatcher
        self.failUnless(isinstance(rule.matcher, BaseMatcher))

    def test_rules_manager_default_matcher(self):
        spider = BaseSpider('foo')
        response = HtmlResponse('http://example.org')
        callback = lambda x: None
        rulesman = RulesManager([
            Rule('http://example.org', callback),
            ], spider, default_matcher=UrlMatcher)
        rule = rulesman.get_rule_from_response(response)
        self.failUnless(isinstance(rule.matcher, UrlMatcher))

    def test_rules_manager_matchers(self):
        spider = BaseSpider('foo')
        response1 = HtmlResponse('http://example.org')
        response2 = HtmlResponse('http://othersite.org')
        urlmatcher = UrlMatcher('http://example.org')
        basematcher = BaseMatcher()
        # callback needed for Rule
        callback = lambda x: None

        # fail when the matcher cannot be resolved
        self.assertRaises(ValueError, RulesManager,
                          [Rule(False, callback)], spider)
        self.assertRaises(ValueError, RulesManager,
                          [Rule(spider, callback)], spider)

        rulesman = RulesManager([
            Rule(urlmatcher, callback),
            Rule(basematcher, callback),
            ], spider)
        # response1 matches example.org
        rule1 = rulesman.get_rule_from_response(response1)
        # response2 is caught by BaseMatcher()
        rule2 = rulesman.get_rule_from_response(response2)
        self.failUnlessEqual(rule1.matcher, urlmatcher)
        self.failUnlessEqual(rule2.matcher, basematcher)

        # reverse order: BaseMatcher should match all
        rulesman = RulesManager([
            Rule(basematcher, callback),
            Rule(urlmatcher, callback),
            ], spider)
        rule1 = rulesman.get_rule_from_response(response1)
        rule2 = rulesman.get_rule_from_response(response2)
        self.failUnlessEqual(rule1.matcher, basematcher)
        self.failUnlessEqual(rule2.matcher, basematcher)
        self.failUnless(rule1 is rule2)

    def test_rules_manager_callbacks(self):
        mycallback = lambda: True
        spider = BaseSpider('foo')
        spider.parse_item = lambda: True
        response1 = HtmlResponse('http://example.org')
        response2 = HtmlResponse('http://othersite.org')

        rulesman = RulesManager([
            Rule('example', mycallback),
            Rule('othersite', 'parse_item'),
            ], spider, default_matcher=UrlRegexMatcher)
        rule1 = rulesman.get_rule_from_response(response1)
        rule2 = rulesman.get_rule_from_response(response2)
        self.failUnlessEqual(rule1.callback, mycallback)
        self.failUnlessEqual(rule2.callback, spider.parse_item)

        # fail on unknown callback
        self.assertRaises(AttributeError, RulesManager, [
            Rule(BaseMatcher(), 'mycallback')
            ], spider)
        # fail when the callback is not callable
        spider.not_callable = True
        self.assertRaises(AttributeError, RulesManager, [
            Rule(BaseMatcher(), 'not_callable')
            ], spider)

    def test_rules_manager_callback_with_arguments(self):
        spider = BaseSpider('foo')
        response = HtmlResponse('http://example.org')
        kwargs = {'a': 1}

        def myfunc(**mykwargs):
            return mykwargs

        # verify return validation
        self.failUnlessEquals(kwargs, myfunc(**kwargs))

        # test callback w/o arguments
        rulesman = RulesManager([
            Rule(BaseMatcher(), myfunc),
            ], spider)
        rule = rulesman.get_rule_from_response(response)
        # without arguments should return the same callback
        self.failUnlessEqual(rule.callback, myfunc)

        # test callback w/ arguments
        rulesman = RulesManager([
            Rule(BaseMatcher(), myfunc, **kwargs),
            ], spider)
        rule = rulesman.get_rule_from_response(response)
        # with arguments should return a partially applied callback
        self.failUnless(isinstance(rule.callback, partial))
        self.failUnlessEquals(kwargs, rule.callback())

@@ -1,222 +0,0 @@
from twisted.trial import unittest

from scrapy.http import Request
from scrapy.http import HtmlResponse
from scrapy.item import BaseItem
from scrapy.utils.spider import iterate_spider_output

# basics
from scrapy.contrib_exp.crawlspider import CrawlSpider
from scrapy.contrib_exp.crawlspider import Rule
# matchers
from scrapy.contrib_exp.crawlspider.matchers import BaseMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlRegexMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlListMatcher
# extractors
from scrapy.contrib_exp.crawlspider.reqext import SgmlRequestExtractor
# processors
from scrapy.contrib_exp.crawlspider.reqproc import Canonicalize
from scrapy.contrib_exp.crawlspider.reqproc import FilterDupes


# mock items
class Item1(BaseItem):
    pass


class Item2(BaseItem):
    pass


class Item3(BaseItem):
    pass


class CrawlSpiderTest(unittest.TestCase):

    def spider_factory(self, rules=[],
                       extractors=[], processors=[],
                       start_urls=[]):
        # mock spider
        class Spider(CrawlSpider):
            def parse_item1(self, response):
                return Item1()

            def parse_item2(self, response):
                return Item2()

            def parse_item3(self, response):
                return Item3()

            def parse_request1(self, response):
                return Request('http://example.org/request1')

            def parse_request2(self, response):
                return Request('http://example.org/request2')

        Spider.start_urls = start_urls
        Spider.rules = rules
        Spider.request_extractors = extractors
        Spider.request_processors = processors

        return Spider('foo')

    def test_start_url_auto_rule(self):
        spider = self.spider_factory()
        # zero spider rules
        self.failUnlessEqual(len(spider.rules), 0)
        self.failUnlessEqual(len(spider._rulesman._rules), 0)

        spider = self.spider_factory(start_urls=['http://example.org'])
        self.failUnlessEqual(len(spider.rules), 0)
        self.failUnlessEqual(len(spider._rulesman._rules), 1)

    def test_start_url_matcher(self):
        url = 'http://example.org'
        spider = self.spider_factory(start_urls=[url])

        response = HtmlResponse(url)
        rule = spider._rulesman.get_rule_from_response(response)
        self.failUnless(isinstance(rule.matcher, UrlListMatcher))

        response = HtmlResponse(url + '/item.html')
        rule = spider._rulesman.get_rule_from_response(response)
        self.failUnless(rule is None)

        # TODO: remove this block
        # in a previous version, get_rule returned the rule from response.request
        response.request = Request(url)
        rule = spider._rulesman.get_rule_from_response(response.request)
        self.failUnless(isinstance(rule.matcher, UrlListMatcher))
        self.failUnlessEqual(rule.follow, True)

    def test_parse_callback(self):
        response = HtmlResponse('http://example.org')
        rules = (
            Rule(BaseMatcher(), 'parse_item1'),
            )
        spider = self.spider_factory(rules)
        result = list(spider.parse(response))
        self.failUnlessEqual(len(result), 1)
        self.failUnless(isinstance(result[0], Item1))

    def test_crawling_start_url(self):
        url = 'http://example.org/'
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/" /></p></body></html>"""
        response = HtmlResponse(url, body=html)

        extractors = (SgmlRequestExtractor(), )
        spider = self.spider_factory(start_urls=[url],
                                     extractors=extractors)
        result = list(spider.parse(response))
        # 1 request extracted: example.org/
        # because only matching requests are returned
        self.failUnlessEqual(len(result), 1)

        # add a catch-all rule to extract all links
        callback = lambda x: None
        rules = [Rule(r'\.html$', callback=callback)]
        spider = self.spider_factory(rules, start_urls=[url],
                                     extractors=extractors)
        result = list(spider.parse(response))
        # 4 requests extracted:
        #   3 matching the .html pattern
        #   1 matching the start url pattern
        self.failUnlessEqual(len(result), 4)

    def test_crawling_simple_rule(self):
        url = 'http://example.org/somepage/index.html'
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/" /></p></body></html>"""
        response = HtmlResponse(url, body=html)

        rules = (
            # first response callback
            Rule(r'index\.html', 'parse_item1'),
            )
        spider = self.spider_factory(rules)
        result = list(spider.parse(response))
        # should return Item1
        self.failUnlessEqual(len(result), 1)
        self.failUnless(isinstance(result[0], Item1))

        # test request generation
        rules = (
            # first response without callback and follow flag
            Rule(r'index\.html', follow=True),
            Rule(r'(\.html|/)$', 'parse_item1'),
            )
        spider = self.spider_factory(rules)
        result = list(spider.parse(response))
        # 0 because the spider has no extractors
        self.failUnlessEqual(len(result), 0)

        extractors = (SgmlRequestExtractor(), )
        # instantiate spider with extractor
        spider = self.spider_factory(rules, extractors)
        result = list(spider.parse(response))
        # 4 requests extracted
        self.failUnlessEqual(len(result), 4)

    def test_crawling_multiple_rules(self):
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/" /></p></body></html>"""
        response = HtmlResponse('http://example.org/index.html', body=html)
        response1 = HtmlResponse('http://example.org/1.html')
        response2 = HtmlResponse('http://example.org/othercat.html')

        rules = (
            Rule(r'\d+\.html$', 'parse_item1'),
            Rule(r'othercat\.html$', 'parse_item2'),
            # follow-only rule
            Rule(r'index\.html', 'parse_item3', follow=True),
            )
        extractors = [SgmlRequestExtractor()]
        spider = self.spider_factory(rules, extractors)

        result = list(spider.parse(response))
        # 1 item, 2 requests
        self.failUnlessEqual(len(result), 3)
        # parse_item3
        self.failUnless(isinstance(result[0], Item3))

        only_requests = lambda r: isinstance(r, Request)
        requests = filter(only_requests, result[1:])
        self.failUnlessEqual(len(requests), 2)
        self.failUnless(all(requests))

        result1 = list(spider.parse(response1))
        # parse_item1
        self.failUnlessEqual(len(result1), 1)
        self.failUnless(isinstance(result1[0], Item1))

        result2 = list(spider.parse(response2))
        # parse_item2
        self.failUnlessEqual(len(result2), 1)
        self.failUnless(isinstance(result2[0], Item2))