from unittest import TestCase

from six.moves.urllib.parse import urlparse

from scrapy.http import Response, Request
from scrapy.spiders import Spider
from scrapy.spidermiddlewares.offsite import OffsiteMiddleware
from scrapy.utils.test import get_crawler
class TestOffsiteMiddleware(TestCase):
    """Base test case for OffsiteMiddleware: a spider with several
    allowed_domains must see on-site requests pass through and off-site
    requests filtered out."""

    def setUp(self):
        # Build a real crawler/spider pair so the middleware is wired up
        # exactly as scrapy would do it, then open the spider on the
        # middleware so its host regex is compiled.
        crawler = get_crawler(Spider)
        self.spider = crawler._create_spider(**self._get_spiderargs())
        self.mw = OffsiteMiddleware.from_crawler(crawler)
        self.mw.spider_opened(self.spider)

    def _get_spiderargs(self):
        # Hook overridden by subclasses to vary the spider configuration.
        return dict(name='foo', allowed_domains=['scrapytest.org', 'scrapy.org', 'scrapy.test.org'])

    def test_process_spider_output(self):
        res = Response('http://scrapytest.org')

        # Requests that must survive filtering: exact allowed domains,
        # subdomains of them, and an off-site request explicitly marked
        # with dont_filter=True.
        onsite_reqs = [Request('http://scrapytest.org/1'),
                       Request('http://scrapy.org/1'),
                       Request('http://sub.scrapy.org/1'),
                       Request('http://offsite.tld/letmepass', dont_filter=True),
                       Request('http://scrapy.test.org/')]
        # Requests that must be dropped: unrelated hosts, hosts that merely
        # contain an allowed domain in their path or as a non-suffix
        # substring, and parent domains of an allowed subdomain.
        offsite_reqs = [Request('http://scrapy2.org'),
                        Request('http://offsite.tld/'),
                        Request('http://offsite.tld/scrapytest.org'),
                        Request('http://offsite.tld/rogue.scrapytest.org'),
                        Request('http://rogue.scrapytest.org.haha.com'),
                        Request('http://roguescrapytest.org'),
                        Request('http://test.org/'),
                        Request('http://notscrapy.test.org/')]
        reqs = onsite_reqs + offsite_reqs

        out = list(self.mw.process_spider_output(res, reqs, self.spider))
        # assertEquals is a deprecated alias of assertEqual.
        self.assertEqual(out, onsite_reqs)
class TestOffsiteMiddleware2(TestOffsiteMiddleware):
    """With allowed_domains unset (None), the middleware must let every
    request through."""

    def _get_spiderargs(self):
        return dict(name='foo', allowed_domains=None)

    def test_process_spider_output(self):
        res = Response('http://scrapytest.org')
        reqs = [Request('http://a.com/b.html'), Request('http://b.com/1')]
        out = list(self.mw.process_spider_output(res, reqs, self.spider))
        # No allowed_domains -> nothing is filtered.
        self.assertEqual(out, reqs)
class TestOffsiteMiddleware3(TestOffsiteMiddleware2):
    """Same expectations as TestOffsiteMiddleware2 (inherits its test and
    _get_spiderargs)."""

    def _get_spider(self):
        # NOTE(review): this hook appears to be dead code — setUp() only
        # calls _get_spiderargs(), never _get_spider(), so this class runs
        # with the args inherited from TestOffsiteMiddleware2. Confirm
        # whether this was meant to override _get_spiderargs.
        return Spider('foo')
class TestOffsiteMiddleware4(TestOffsiteMiddleware3):
    """allowed_domains containing None entries (an explicit None and a
    hostname that urlparse fails to extract) must not break the middleware:
    the valid domain still works and nothing allowed gets filtered."""

    def _get_spiderargs(self):
        # urlparse('http:////...') yields an empty netloc, so .hostname is
        # None — a realistic "bad" entry for allowed_domains.
        bad_hostname = urlparse('http:////scrapytest.org').hostname
        # Fixed: this override was previously named _get_spider, which
        # setUp() never calls, so these args were silently ignored and the
        # class ran with allowed_domains=None from TestOffsiteMiddleware2.
        return dict(name='foo', allowed_domains=['scrapytest.org', None, bad_hostname])

    def test_process_spider_output(self):
        res = Response('http://scrapytest.org')
        reqs = [Request('http://scrapytest.org/1')]
        out = list(self.mw.process_spider_output(res, reqs, self.spider))
        # assertEquals is a deprecated alias of assertEqual.
        self.assertEqual(out, reqs)