1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-23 06:44:06 +00:00
scrapy/tests/test_spidermiddleware_offsite.py

83 lines
3.1 KiB
Python

from unittest import TestCase
from six.moves.urllib.parse import urlparse
from scrapy.http import Response, Request
from scrapy.spiders import Spider
from scrapy.spidermiddlewares.offsite import OffsiteMiddleware
from scrapy.spidermiddlewares.offsite import URLWarning
from scrapy.utils.test import get_crawler
import warnings
class TestOffsiteMiddleware(TestCase):
def setUp(self):
crawler = get_crawler(Spider)
self.spider = crawler._create_spider(**self._get_spiderargs())
self.mw = OffsiteMiddleware.from_crawler(crawler)
self.mw.spider_opened(self.spider)
def _get_spiderargs(self):
return dict(name='foo', allowed_domains=['scrapytest.org', 'scrapy.org', 'scrapy.test.org'])
def test_process_spider_output(self):
res = Response('http://scrapytest.org')
onsite_reqs = [Request('http://scrapytest.org/1'),
Request('http://scrapy.org/1'),
Request('http://sub.scrapy.org/1'),
Request('http://offsite.tld/letmepass', dont_filter=True),
Request('http://scrapy.test.org/')]
offsite_reqs = [Request('http://scrapy2.org'),
Request('http://offsite.tld/'),
Request('http://offsite.tld/scrapytest.org'),
Request('http://offsite.tld/rogue.scrapytest.org'),
Request('http://rogue.scrapytest.org.haha.com'),
Request('http://roguescrapytest.org'),
Request('http://test.org/'),
Request('http://notscrapy.test.org/')]
reqs = onsite_reqs + offsite_reqs
out = list(self.mw.process_spider_output(res, reqs, self.spider))
self.assertEqual(out, onsite_reqs)
class TestOffsiteMiddleware2(TestOffsiteMiddleware):
def _get_spiderargs(self):
return dict(name='foo', allowed_domains=None)
def test_process_spider_output(self):
res = Response('http://scrapytest.org')
reqs = [Request('http://a.com/b.html'), Request('http://b.com/1')]
out = list(self.mw.process_spider_output(res, reqs, self.spider))
self.assertEqual(out, reqs)
class TestOffsiteMiddleware3(TestOffsiteMiddleware2):
def _get_spider(self):
return Spider('foo')
class TestOffsiteMiddleware4(TestOffsiteMiddleware3):
def _get_spider(self):
bad_hostname = urlparse('http:////scrapytest.org').hostname
return dict(name='foo', allowed_domains=['scrapytest.org', None, bad_hostname])
def test_process_spider_output(self):
res = Response('http://scrapytest.org')
reqs = [Request('http://scrapytest.org/1')]
out = list(self.mw.process_spider_output(res, reqs, self.spider))
self.assertEqual(out, reqs)
class TestOffsiteMiddleware5(TestOffsiteMiddleware4):
def test_get_host_regex(self):
self.spider.allowed_domains = ['http://scrapytest.org', 'scrapy.org', 'scrapy.test.org']
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
self.mw.get_host_regex(self.spider)
assert issubclass(w[-1].category, URLWarning)