1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-06 11:00:46 +00:00

new allow_offsite parameter in OffsiteMiddleware (#6151)

* new 'allow_offsite' parameter in OffsiteMiddleware

* document deprecated dont_filter flag in OffsiteMiddleware

* avoid deprecating dont_filter in OffsiteMiddleware

* Copy the code to the downloader mw.

* Add tests for allow_offsite in the downloader mw.

* Mark allow_offsite with reqmeta.

---------

Co-authored-by: Andrey Rakhmatullin <wrar@wrar.name>
This commit is contained in:
Kevin Lloyd Bernal 2025-01-09 03:28:51 +11:00 committed by GitHub
parent b6d69e3895
commit 1fc91bb462
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 52 additions and 8 deletions

View File

@@ -797,9 +797,12 @@ OffsiteMiddleware
:attr:`~scrapy.Spider.allowed_domains` attribute, or the
attribute is empty, the offsite middleware will allow all requests.
If the request has the :attr:`~scrapy.Request.dont_filter` attribute
set, the offsite middleware will allow the request even if its domain is not
listed in allowed domains.
.. reqmeta:: allow_offsite
If the request has the :attr:`~scrapy.Request.dont_filter` attribute set to
``True`` or :attr:`Request.meta` has ``allow_offsite`` set to ``True``, then
the OffsiteMiddleware will allow the request even if its domain is not listed
in allowed domains.
RedirectMiddleware
------------------

View File

@@ -145,9 +145,9 @@ Request objects
:type priority: int
:param dont_filter: indicates that this request should not be filtered by
the scheduler. This is used when you want to perform an identical
request multiple times, to ignore the duplicates filter. Use it with
care, or you will get into crawling loops. Default to ``False``.
the scheduler or some middlewares. This is used when you want to perform
an identical request multiple times, to ignore the duplicates filter.
Use it with care, or you will get into crawling loops. Defaults to ``False``.
:type dont_filter: bool
:param errback: a function that will be called if any exception was
@@ -660,6 +660,7 @@ are some special keys recognized by Scrapy and its built-in extensions.
Those are:
* :reqmeta:`allow_offsite`
* :reqmeta:`autothrottle_dont_adjust_delay`
* :reqmeta:`bindaddress`
* :reqmeta:`cookiejar`

View File

@@ -40,7 +40,11 @@ class OffsiteMiddleware:
self.process_request(request, spider)
def process_request(self, request: Request, spider: Spider) -> None:
if request.dont_filter or self.should_follow(request, spider):
if (
request.dont_filter
or request.meta.get("allow_offsite")
or self.should_follow(request, spider)
):
return
domain = urlparse_cached(request).hostname
if domain and domain not in self.domains_seen:

View File

@@ -61,7 +61,11 @@ class OffsiteMiddleware:
def _filter(self, request: Any, spider: Spider) -> bool:
if not isinstance(request, Request):
return True
if request.dont_filter or self.should_follow(request, spider):
if (
request.dont_filter
or request.meta.get("allow_offsite")
or self.should_follow(request, spider)
):
return True
domain = urlparse_cached(request).hostname
if domain and domain not in self.domains_seen:

View File

@@ -64,6 +64,37 @@ def test_process_request_dont_filter(value, filtered):
assert mw.process_request(request, spider) is None
@pytest.mark.parametrize(
    ("allow_offsite", "dont_filter", "filtered"),
    (
        (True, UNSET, False),
        (True, None, False),
        (True, False, False),
        (True, True, False),
        (False, UNSET, True),
        (False, None, True),
        (False, False, True),
        (False, True, False),
    ),
)
def test_process_request_allow_offsite(allow_offsite, dont_filter, filtered):
    """A truthy ``allow_offsite`` in Request.meta bypasses offsite filtering,
    independently of ``dont_filter`` (either flag alone is enough)."""
    crawler = get_crawler(Spider)
    spider = crawler._create_spider(name="a", allowed_domains=["a.example"])
    middleware = OffsiteMiddleware.from_crawler(crawler)
    middleware.spider_opened(spider)
    # Build the Request kwargs, omitting any parameter left UNSET so the
    # middleware sees the Request's real defaults for that case.
    meta = {} if allow_offsite is UNSET else {"allow_offsite": allow_offsite}
    request_kwargs = {"meta": meta}
    if dont_filter is not UNSET:
        request_kwargs["dont_filter"] = dont_filter
    request = Request("https://b.example", **request_kwargs)
    if not filtered:
        # Allowed through: process_request returns None (request proceeds).
        assert middleware.process_request(request, spider) is None
    else:
        # Off-site and not exempted: the middleware drops the request.
        with pytest.raises(IgnoreRequest):
            middleware.process_request(request, spider)
@pytest.mark.parametrize(
"value",
(

View File

@@ -29,6 +29,7 @@ class TestOffsiteMiddleware(TestCase):
Request("http://scrapy.org/1"),
Request("http://sub.scrapy.org/1"),
Request("http://offsite.tld/letmepass", dont_filter=True),
Request("http://offsite-2.tld/allow", meta={"allow_offsite": True}),
Request("http://scrapy.test.org/"),
Request("http://scrapy.test.org:8000/"),
]