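"""Tests for scrapy.downloadermiddlewares.offsite.OffsiteMiddleware."""
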
import warnings

import pytest

from scrapy import Request, Spider
from scrapy.downloadermiddlewares.offsite import OffsiteMiddleware
from scrapy.exceptions import IgnoreRequest
from scrapy.utils.test import get_crawler

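# Sentinel to distinguish "argument not passed" from an explicit None/False.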
UNSET = object()


@pytest.mark.parametrize(
    ("allowed_domain", "url", "allowed"),
    (
        ("example.com", "http://example.com/1", True),
        ("example.com", "http://example.org/1", False),
        ("example.com", "http://sub.example.com/1", True),
        ("sub.example.com", "http://sub.example.com/1", True),
        ("sub.example.com", "http://example.com/1", False),
        ("example.com", "http://example.com:8000/1", True),
        ("example.com", "http://example.org/example.com", False),
        ("example.com", "http://example.org/foo.example.com", False),
        ("example.com", "http://example.com.example", False),
        ("a.example", "http://nota.example", False),
        ("b.a.example", "http://notb.a.example", False),
    ),
)
def test_process_request_domain_filtering(allowed_domain, url, allowed):
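    """Requests to an allowed domain or its subdomains pass, while offsite
    requests make process_request() raise IgnoreRequest."""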
    crawler = get_crawler(Spider)
    spider = crawler._create_spider(name="a", allowed_domains=[allowed_domain])
    mw = OffsiteMiddleware.from_crawler(crawler)
    mw.spider_opened(spider)
    request = Request(url)
    if allowed:
        assert mw.process_request(request, spider) is None
    else:
        with pytest.raises(IgnoreRequest):
            mw.process_request(request, spider)


@pytest.mark.parametrize(
    ("value", "filtered"),
    (
        (UNSET, True),
        (None, True),
        (False, True),
        (True, False),
    ),
)
def test_process_request_dont_filter(value, filtered):
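    """dont_filter=True exempts a request from offsite filtering in
    process_request(); leaving it unset, None or False does not."""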
    crawler = get_crawler(Spider)
    spider = crawler._create_spider(name="a", allowed_domains=["a.example"])
    mw = OffsiteMiddleware.from_crawler(crawler)
    mw.spider_opened(spider)
    kwargs = {}
    if value is not UNSET:
        kwargs["dont_filter"] = value
    request = Request("https://b.example", **kwargs)
    if filtered:
        with pytest.raises(IgnoreRequest):
            mw.process_request(request, spider)
    else:
        assert mw.process_request(request, spider) is None


@pytest.mark.parametrize(
    "value",
    (
        UNSET,
        None,
        [],
    ),
)
def test_process_request_no_allowed_domains(value):
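    """When allowed_domains is unset, None or empty, process_request() lets
    every request through."""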
    crawler = get_crawler(Spider)
    kwargs = {}
    if value is not UNSET:
        kwargs["allowed_domains"] = value
    spider = crawler._create_spider(name="a", **kwargs)
    mw = OffsiteMiddleware.from_crawler(crawler)
    mw.spider_opened(spider)
    request = Request("https://example.com")
    assert mw.process_request(request, spider) is None


def test_process_request_invalid_domains():
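    """Invalid allowed_domains entries are discarded with a warning, so
    process_request() still filters out requests to those domains."""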
    crawler = get_crawler(Spider)
    allowed_domains = ["a.example", None, "http:////b.example", "//c.example"]
    spider = crawler._create_spider(name="a", allowed_domains=allowed_domains)
    mw = OffsiteMiddleware.from_crawler(crawler)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        mw.spider_opened(spider)
    request = Request("https://a.example")
    assert mw.process_request(request, spider) is None
    for letter in ("b", "c"):
        request = Request(f"https://{letter}.example")
        with pytest.raises(IgnoreRequest):
            mw.process_request(request, spider)


@pytest.mark.parametrize(
    ("allowed_domain", "url", "allowed"),
    (
        ("example.com", "http://example.com/1", True),
        ("example.com", "http://example.org/1", False),
        ("example.com", "http://sub.example.com/1", True),
        ("sub.example.com", "http://sub.example.com/1", True),
        ("sub.example.com", "http://example.com/1", False),
        ("example.com", "http://example.com:8000/1", True),
        ("example.com", "http://example.org/example.com", False),
        ("example.com", "http://example.org/foo.example.com", False),
        ("example.com", "http://example.com.example", False),
        ("a.example", "http://nota.example", False),
        ("b.a.example", "http://notb.a.example", False),
    ),
)
def test_request_scheduled_domain_filtering(allowed_domain, url, allowed):
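    """Requests to an allowed domain or its subdomains pass, while offsite
    requests make the request_scheduled() handler raise IgnoreRequest."""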
    crawler = get_crawler(Spider)
    spider = crawler._create_spider(name="a", allowed_domains=[allowed_domain])
    mw = OffsiteMiddleware.from_crawler(crawler)
    mw.spider_opened(spider)
    request = Request(url)
    if allowed:
        assert mw.request_scheduled(request, spider) is None
    else:
        with pytest.raises(IgnoreRequest):
            mw.request_scheduled(request, spider)


@pytest.mark.parametrize(
    ("value", "filtered"),
    (
        (UNSET, True),
        (None, True),
        (False, True),
        (True, False),
    ),
)
def test_request_scheduled_dont_filter(value, filtered):
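    """dont_filter=True exempts a request from offsite filtering in
    request_scheduled(); leaving it unset, None or False does not."""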
    crawler = get_crawler(Spider)
    spider = crawler._create_spider(name="a", allowed_domains=["a.example"])
    mw = OffsiteMiddleware.from_crawler(crawler)
    mw.spider_opened(spider)
    kwargs = {}
    if value is not UNSET:
        kwargs["dont_filter"] = value
    request = Request("https://b.example", **kwargs)
    if filtered:
        with pytest.raises(IgnoreRequest):
            mw.request_scheduled(request, spider)
    else:
        assert mw.request_scheduled(request, spider) is None


@pytest.mark.parametrize(
    "value",
    (
        UNSET,
        None,
        [],
    ),
)
def test_request_scheduled_no_allowed_domains(value):
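    """When allowed_domains is unset, None or empty, request_scheduled() lets
    every request through."""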
    crawler = get_crawler(Spider)
    kwargs = {}
    if value is not UNSET:
        kwargs["allowed_domains"] = value
    spider = crawler._create_spider(name="a", **kwargs)
    mw = OffsiteMiddleware.from_crawler(crawler)
    mw.spider_opened(spider)
    request = Request("https://example.com")
    assert mw.request_scheduled(request, spider) is None


def test_request_scheduled_invalid_domains():
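    """Invalid allowed_domains entries are discarded with a warning, so
    request_scheduled() still filters out requests to those domains."""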
    crawler = get_crawler(Spider)
    allowed_domains = ["a.example", None, "http:////b.example", "//c.example"]
    spider = crawler._create_spider(name="a", allowed_domains=allowed_domains)
    mw = OffsiteMiddleware.from_crawler(crawler)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        mw.spider_opened(spider)
    request = Request("https://a.example")
    assert mw.request_scheduled(request, spider) is None
    for letter in ("b", "c"):
        request = Request(f"https://{letter}.example")
        with pytest.raises(IgnoreRequest):
            mw.request_scheduled(request, spider)