1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-06 10:24:24 +00:00
scrapy/tests/test_spidermiddleware_offsite.py
Kevin Lloyd Bernal 1fc91bb462
new allow_offsite parameter in OffsiteMiddleware (#6151)
* new 'allow_offsite' parameter in OffsiteMiddleware

* document deprecated dont_filter flag in OffsiteMiddleware

* avoid deprecating dont_filter in OffsiteMiddleware

* Copy the code to the downloader mw.

* Add tests for allow_offsite in the downloader mw.

* Mark allow_offsite with reqmeta.

---------

Co-authored-by: Andrey Rakhmatullin <wrar@wrar.name>
2025-01-08 21:28:51 +05:00

107 lines
3.7 KiB
Python

import warnings
from unittest import TestCase
from urllib.parse import urlparse
from scrapy.http import Request, Response
from scrapy.spidermiddlewares.offsite import OffsiteMiddleware, PortWarning, URLWarning
from scrapy.spiders import Spider
from scrapy.utils.test import get_crawler
class TestOffsiteMiddleware(TestCase):
    """Check that OffsiteMiddleware keeps on-domain requests and drops
    off-domain ones for a spider with a populated ``allowed_domains``."""

    def setUp(self):
        crawler = get_crawler(Spider)
        self.spider = crawler._create_spider(**self._get_spiderargs())
        self.mw = OffsiteMiddleware.from_crawler(crawler)
        self.mw.spider_opened(self.spider)

    def _get_spiderargs(self):
        # Subclasses override this to exercise other allowed_domains setups.
        return {
            "name": "foo",
            "allowed_domains": ["scrapytest.org", "scrapy.org", "scrapy.test.org"],
        }

    def test_process_spider_output(self):
        response = Response("http://scrapytest.org")
        # Requests expected to pass: on-domain URLs plus the two explicit
        # bypass mechanisms (dont_filter and the allow_offsite meta key).
        kept = [
            Request("http://scrapytest.org/1"),
            Request("http://scrapy.org/1"),
            Request("http://sub.scrapy.org/1"),
            Request("http://offsite.tld/letmepass", dont_filter=True),
            Request("http://offsite-2.tld/allow", meta={"allow_offsite": True}),
            Request("http://scrapy.test.org/"),
            Request("http://scrapy.test.org:8000/"),
        ]
        # Requests expected to be filtered out: wrong domains, lookalike
        # hostnames, and parent domains of an allowed subdomain.
        dropped = [
            Request("http://scrapy2.org"),
            Request("http://offsite.tld/"),
            Request("http://offsite.tld/scrapytest.org"),
            Request("http://offsite.tld/rogue.scrapytest.org"),
            Request("http://rogue.scrapytest.org.haha.com"),
            Request("http://roguescrapytest.org"),
            Request("http://test.org/"),
            Request("http://notscrapy.test.org/"),
        ]
        result = list(
            self.mw.process_spider_output(response, kept + dropped, self.spider)
        )
        self.assertEqual(result, kept)
class TestOffsiteMiddleware2(TestOffsiteMiddleware):
    """With allowed_domains explicitly None, nothing is filtered."""

    def _get_spiderargs(self):
        return {"name": "foo", "allowed_domains": None}

    def test_process_spider_output(self):
        response = Response("http://scrapytest.org")
        requests = [Request("http://a.com/b.html"), Request("http://b.com/1")]
        # Every request must come through untouched.
        result = list(
            self.mw.process_spider_output(response, requests, self.spider)
        )
        self.assertEqual(result, requests)
class TestOffsiteMiddleware3(TestOffsiteMiddleware2):
    """With allowed_domains missing entirely, filtering is also disabled;
    inherits the pass-through assertions from TestOffsiteMiddleware2."""

    def _get_spiderargs(self):
        return dict(name="foo")
class TestOffsiteMiddleware4(TestOffsiteMiddleware3):
    """Invalid entries (None, a hostname parsed from a malformed URL) in
    allowed_domains must not break filtering of valid domains."""

    def _get_spiderargs(self):
        # urlparse on "http:////..." yields a degenerate hostname on purpose.
        bad_hostname = urlparse("http:////scrapytest.org").hostname
        return {
            "name": "foo",
            "allowed_domains": ["scrapytest.org", None, bad_hostname],
        }

    def test_process_spider_output(self):
        response = Response("http://scrapytest.org")
        requests = [Request("http://scrapytest.org/1")]
        # The one valid domain still admits its request.
        result = list(
            self.mw.process_spider_output(response, requests, self.spider)
        )
        self.assertEqual(result, requests)
class TestOffsiteMiddleware5(TestOffsiteMiddleware4):
    """A full URL inside allowed_domains should trigger URLWarning."""

    def test_get_host_regex(self):
        self.spider.allowed_domains = [
            "http://scrapytest.org",
            "scrapy.org",
            "scrapy.test.org",
        ]
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            self.mw.get_host_regex(self.spider)
        # The most recent warning must be the URL complaint.
        assert issubclass(caught[-1].category, URLWarning)
class TestOffsiteMiddleware6(TestOffsiteMiddleware4):
    """A domain carrying a port inside allowed_domains should trigger PortWarning."""

    def test_get_host_regex(self):
        self.spider.allowed_domains = [
            "scrapytest.org:8000",
            "scrapy.org",
            "scrapy.test.org",
        ]
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            self.mw.get_host_regex(self.spider)
        # The most recent warning must be the port complaint.
        assert issubclass(caught[-1].category, PortWarning)