# scrapy/tests/test_downloadermiddleware_offsite.py
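
"""Tests for the downloader-side OffsiteMiddleware
(scrapy.downloadermiddlewares.offsite): allowed_domains matching, the
``dont_filter`` request flag, the ``allow_offsite`` request.meta key, and
handling of empty or invalid ``allowed_domains`` values."""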

import warnings

import pytest

from scrapy import Request, Spider
from scrapy.downloadermiddlewares.offsite import OffsiteMiddleware
from scrapy.exceptions import IgnoreRequest
from scrapy.utils.test import get_crawler

UNSET = object()
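# (The UNSET sentinel lets the parametrized tests distinguish "argument not
# passed at all" from explicitly passing None or False.)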


@pytest.mark.parametrize(
    ("allowed_domain", "url", "allowed"),
    (
        ("example.com", "http://example.com/1", True),
        ("example.com", "http://example.org/1", False),
        ("example.com", "http://sub.example.com/1", True),
        ("sub.example.com", "http://sub.example.com/1", True),
        ("sub.example.com", "http://example.com/1", False),
        ("example.com", "http://example.com:8000/1", True),
        ("example.com", "http://example.org/example.com", False),
        ("example.com", "http://example.org/foo.example.com", False),
        ("example.com", "http://example.com.example", False),
        ("a.example", "http://nota.example", False),
        ("b.a.example", "http://notb.a.example", False),
    ),
)
def test_process_request_domain_filtering(allowed_domain, url, allowed):
    crawler = get_crawler(Spider)
    spider = crawler._create_spider(name="a", allowed_domains=[allowed_domain])
    mw = OffsiteMiddleware.from_crawler(crawler)
    mw.spider_opened(spider)
    request = Request(url)
    if allowed:
        assert mw.process_request(request, spider) is None
    else:
        with pytest.raises(IgnoreRequest):
            mw.process_request(request, spider)


@pytest.mark.parametrize(
    ("value", "filtered"),
    (
        (UNSET, True),
        (None, True),
        (False, True),
        (True, False),
    ),
)
def test_process_request_dont_filter(value, filtered):
    """dont_filter=True bypasses the offsite filter; any other value keeps it."""
    crawler = get_crawler(Spider)
    spider = crawler._create_spider(name="a", allowed_domains=["a.example"])
    mw = OffsiteMiddleware.from_crawler(crawler)
    mw.spider_opened(spider)
    kwargs = {}
    if value is not UNSET:
        kwargs["dont_filter"] = value
    request = Request("https://b.example", **kwargs)
    if filtered:
        with pytest.raises(IgnoreRequest):
            mw.process_request(request, spider)
    else:
        assert mw.process_request(request, spider) is None


@pytest.mark.parametrize(
    ("allow_offsite", "dont_filter", "filtered"),
    (
        (True, UNSET, False),
        (True, None, False),
        (True, False, False),
        (True, True, False),
        (False, UNSET, True),
        (False, None, True),
        (False, False, True),
        (False, True, False),
    ),
)
def test_process_request_allow_offsite(allow_offsite, dont_filter, filtered):
    """allow_offsite=True in request.meta disables offsite filtering on its
    own; with allow_offsite=False, only dont_filter=True lets the request
    through."""
    crawler = get_crawler(Spider)
    spider = crawler._create_spider(name="a", allowed_domains=["a.example"])
    mw = OffsiteMiddleware.from_crawler(crawler)
    mw.spider_opened(spider)
    kwargs = {"meta": {}}
    if allow_offsite is not UNSET:
        kwargs["meta"]["allow_offsite"] = allow_offsite
    if dont_filter is not UNSET:
        kwargs["dont_filter"] = dont_filter
    request = Request("https://b.example", **kwargs)
    if filtered:
        with pytest.raises(IgnoreRequest):
            mw.process_request(request, spider)
    else:
        assert mw.process_request(request, spider) is None
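
# For illustration only (not exercised by these tests): in a spider callback
# the same meta flag would typically be set per request, e.g.
#     yield Request("https://other.example/page", meta={"allow_offsite": True})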


@pytest.mark.parametrize(
    "value",
    (
        UNSET,
        None,
        [],
    ),
)
def test_process_request_no_allowed_domains(value):
    crawler = get_crawler(Spider)
    kwargs = {}
    if value is not UNSET:
        kwargs["allowed_domains"] = value
    spider = crawler._create_spider(name="a", **kwargs)
    mw = OffsiteMiddleware.from_crawler(crawler)
    mw.spider_opened(spider)
    request = Request("https://example.com")
    assert mw.process_request(request, spider) is None


def test_process_request_invalid_domains():
    """URL-like allowed_domains entries trigger a UserWarning and are ignored,
    while valid entries in the same list keep working."""
    crawler = get_crawler(Spider)
    allowed_domains = ["a.example", None, "http:////b.example", "//c.example"]
    spider = crawler._create_spider(name="a", allowed_domains=allowed_domains)
    mw = OffsiteMiddleware.from_crawler(crawler)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        mw.spider_opened(spider)
    request = Request("https://a.example")
    assert mw.process_request(request, spider) is None
    for letter in ("b", "c"):
        request = Request(f"https://{letter}.example")
        with pytest.raises(IgnoreRequest):
            mw.process_request(request, spider)
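
# The request_scheduled tests below mirror the process_request ones: the
# middleware also exposes a request_scheduled(request, spider) handler
# (connected to the request_scheduled signal in from_crawler), so offsite
# requests can be dropped at scheduling time as well as at download time.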


@pytest.mark.parametrize(
    ("allowed_domain", "url", "allowed"),
    (
        ("example.com", "http://example.com/1", True),
        ("example.com", "http://example.org/1", False),
        ("example.com", "http://sub.example.com/1", True),
        ("sub.example.com", "http://sub.example.com/1", True),
        ("sub.example.com", "http://example.com/1", False),
        ("example.com", "http://example.com:8000/1", True),
        ("example.com", "http://example.org/example.com", False),
        ("example.com", "http://example.org/foo.example.com", False),
        ("example.com", "http://example.com.example", False),
        ("a.example", "http://nota.example", False),
        ("b.a.example", "http://notb.a.example", False),
    ),
)
def test_request_scheduled_domain_filtering(allowed_domain, url, allowed):
    crawler = get_crawler(Spider)
    spider = crawler._create_spider(name="a", allowed_domains=[allowed_domain])
    mw = OffsiteMiddleware.from_crawler(crawler)
    mw.spider_opened(spider)
    request = Request(url)
    if allowed:
        assert mw.request_scheduled(request, spider) is None
    else:
        with pytest.raises(IgnoreRequest):
            mw.request_scheduled(request, spider)


@pytest.mark.parametrize(
    ("value", "filtered"),
    (
        (UNSET, True),
        (None, True),
        (False, True),
        (True, False),
    ),
)
def test_request_scheduled_dont_filter(value, filtered):
    crawler = get_crawler(Spider)
    spider = crawler._create_spider(name="a", allowed_domains=["a.example"])
    mw = OffsiteMiddleware.from_crawler(crawler)
    mw.spider_opened(spider)
    kwargs = {}
    if value is not UNSET:
        kwargs["dont_filter"] = value
    request = Request("https://b.example", **kwargs)
    if filtered:
        with pytest.raises(IgnoreRequest):
            mw.request_scheduled(request, spider)
    else:
        assert mw.request_scheduled(request, spider) is None


@pytest.mark.parametrize(
    "value",
    (
        UNSET,
        None,
        [],
    ),
)
def test_request_scheduled_no_allowed_domains(value):
    crawler = get_crawler(Spider)
    kwargs = {}
    if value is not UNSET:
        kwargs["allowed_domains"] = value
    spider = crawler._create_spider(name="a", **kwargs)
    mw = OffsiteMiddleware.from_crawler(crawler)
    mw.spider_opened(spider)
    request = Request("https://example.com")
    assert mw.request_scheduled(request, spider) is None


def test_request_scheduled_invalid_domains():
    crawler = get_crawler(Spider)
    allowed_domains = ["a.example", None, "http:////b.example", "//c.example"]
    spider = crawler._create_spider(name="a", allowed_domains=allowed_domains)
    mw = OffsiteMiddleware.from_crawler(crawler)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        mw.spider_opened(spider)
    request = Request("https://a.example")
    assert mw.request_scheduled(request, spider) is None
    for letter in ("b", "c"):
        request = Request(f"https://{letter}.example")
        with pytest.raises(IgnoreRequest):
            mw.request_scheduled(request, spider)