mirror of https://github.com/scrapy/scrapy.git
new allow_offsite parameter in OffsiteMiddleware (#6151)
* new 'allow_offsite' parameter in OffsiteMiddleware
* document deprecated dont_filter flag in OffsiteMiddleware
* avoid deprecating dont_filter in OffsiteMiddleware
* Copy the code to the downloader mw.
* Add tests for allow_offsite in the downloader mw.
* Mark allow_offsite with reqmeta.

---------

Co-authored-by: Andrey Rakhmatullin <wrar@wrar.name>
This commit is contained in:
parent b6d69e3895
commit 1fc91bb462
@@ -797,9 +797,12 @@ OffsiteMiddleware
     :attr:`~scrapy.Spider.allowed_domains` attribute, or the
     attribute is empty, the offsite middleware will allow all requests.
 
-    If the request has the :attr:`~scrapy.Request.dont_filter` attribute
-    set, the offsite middleware will allow the request even if its domain is not
-    listed in allowed domains.
+    .. reqmeta:: allow_offsite
+
+    If the request has the :attr:`~scrapy.Request.dont_filter` attribute set to
+    ``True`` or :attr:`Request.meta` has ``allow_offsite`` set to ``True``, then
+    the OffsiteMiddleware will allow the request even if its domain is not listed
+    in allowed domains.
 
 RedirectMiddleware
 ------------------
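For context, a minimal usage sketch of the new meta key (spider name, domains, URLs, and callback here are hypothetical, not from the commit). Unlike dont_filter=True, allow_offsite bypasses only the offsite check, so the scheduler's duplicates filter still applies:

import scrapy


class PartnerSpider(scrapy.Spider):
    # Hypothetical spider illustrating the new allow_offsite meta key.
    name = "partner"
    allowed_domains = ["a.example"]
    start_urls = ["https://a.example/"]

    def parse(self, response):
        # allow_offsite lets this one request leave allowed_domains
        # without also disabling duplicate filtering, as dont_filter would.
        yield scrapy.Request(
            "https://b.example/partner",
            meta={"allow_offsite": True},
            callback=self.parse_partner,
        )

    def parse_partner(self, response):
        yield {"url": response.url}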
@@ -145,9 +145,9 @@ Request objects
     :type priority: int
 
     :param dont_filter: indicates that this request should not be filtered by
-        the scheduler. This is used when you want to perform an identical
-        request multiple times, to ignore the duplicates filter. Use it with
-        care, or you will get into crawling loops. Default to ``False``.
+        the scheduler or some middlewares. This is used when you want to perform
+        an identical request multiple times, to ignore the duplicates filter.
+        Use it with care, or you will get into crawling loops. Default to ``False``.
     :type dont_filter: bool
 
     :param errback: a function that will be called if any exception was
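The reworded dont_filter docs above now cover both of the flag's roles. A short sketch of the classic use, re-issuing an identical request past the duplicates filter (URL and selectors are hypothetical):

import scrapy


class PollSpider(scrapy.Spider):
    # Hypothetical spider that polls the same page until it is ready.
    name = "poll"
    start_urls = ["https://a.example/status"]

    def parse(self, response):
        if response.css(".pending"):
            # Without dont_filter=True the scheduler would drop this
            # identical request as a duplicate. Use with care: an
            # unconditional re-request like this can loop forever.
            yield response.request.replace(dont_filter=True)
        else:
            yield {"status": response.css(".status::text").get()}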
@@ -660,6 +660,7 @@ are some special keys recognized by Scrapy and its built-in extensions.
 
 Those are:
 
+* :reqmeta:`allow_offsite`
 * :reqmeta:`autothrottle_dont_adjust_delay`
 * :reqmeta:`bindaddress`
 * :reqmeta:`cookiejar`
@@ -40,7 +40,11 @@ class OffsiteMiddleware:
         self.process_request(request, spider)
 
     def process_request(self, request: Request, spider: Spider) -> None:
-        if request.dont_filter or self.should_follow(request, spider):
+        if (
+            request.dont_filter
+            or request.meta.get("allow_offsite")
+            or self.should_follow(request, spider)
+        ):
             return
         domain = urlparse_cached(request).hostname
         if domain and domain not in self.domains_seen:
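The rewritten condition short-circuits left to right: dont_filter wins first, then the new meta key, then the normal domain check. Extracted as a standalone predicate purely for illustration (this helper is not part of the middleware API):

def request_is_allowed(dont_filter: bool, allow_offsite: bool, onsite: bool) -> bool:
    # Any one of the three escape hatches is enough for the request to pass.
    return dont_filter or allow_offsite or onsite


# Offsite requests (onsite=False) pass only with an explicit opt-out,
# matching the parametrized expectations in the new tests below.
assert request_is_allowed(True, False, False)
assert request_is_allowed(False, True, False)
assert not request_is_allowed(False, False, False)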
@@ -61,7 +61,11 @@ class OffsiteMiddleware:
     def _filter(self, request: Any, spider: Spider) -> bool:
         if not isinstance(request, Request):
             return True
-        if request.dont_filter or self.should_follow(request, spider):
+        if (
+            request.dont_filter
+            or request.meta.get("allow_offsite")
+            or self.should_follow(request, spider)
+        ):
             return True
         domain = urlparse_cached(request).hostname
         if domain and domain not in self.domains_seen:
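Per the commit message, the same three-way condition now lives in both places: the spider middleware's _filter (above) silently drops offsite requests coming out of the spider, while the downloader middleware's process_request raises IgnoreRequest. If every request should be allowed offsite, the blunt alternative is disabling both middlewares; allow_offsite is the per-request escape hatch. A sketch, assuming the standard middleware paths (spider name is hypothetical):

import scrapy


class EverywhereSpider(scrapy.Spider):
    # Hypothetical spider that disables offsite filtering globally
    # instead of opting out per request with allow_offsite.
    name = "everywhere"
    custom_settings = {
        "SPIDER_MIDDLEWARES": {
            "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": None,
        },
        "DOWNLOADER_MIDDLEWARES": {
            "scrapy.downloadermiddlewares.offsite.OffsiteMiddleware": None,
        },
    }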
@@ -64,6 +64,37 @@ def test_process_request_dont_filter(value, filtered):
     assert mw.process_request(request, spider) is None
 
 
+@pytest.mark.parametrize(
+    ("allow_offsite", "dont_filter", "filtered"),
+    (
+        (True, UNSET, False),
+        (True, None, False),
+        (True, False, False),
+        (True, True, False),
+        (False, UNSET, True),
+        (False, None, True),
+        (False, False, True),
+        (False, True, False),
+    ),
+)
+def test_process_request_allow_offsite(allow_offsite, dont_filter, filtered):
+    crawler = get_crawler(Spider)
+    spider = crawler._create_spider(name="a", allowed_domains=["a.example"])
+    mw = OffsiteMiddleware.from_crawler(crawler)
+    mw.spider_opened(spider)
+    kwargs = {"meta": {}}
+    if allow_offsite is not UNSET:
+        kwargs["meta"]["allow_offsite"] = allow_offsite
+    if dont_filter is not UNSET:
+        kwargs["dont_filter"] = dont_filter
+    request = Request("https://b.example", **kwargs)
+    if filtered:
+        with pytest.raises(IgnoreRequest):
+            mw.process_request(request, spider)
+    else:
+        assert mw.process_request(request, spider) is None
+
+
 @pytest.mark.parametrize(
     "value",
     (
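The UNSET marker used above distinguishes "argument not passed at all" from an explicit None or False, so the test covers truthiness edge cases of both flags. It is presumably a module-level sentinel in the test file, along these lines:

# Assumed definition: sentinel meaning "parameter was not passed",
# as opposed to an explicit None or False.
UNSET = object()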
@@ -29,6 +29,7 @@ class TestOffsiteMiddleware(TestCase):
             Request("http://scrapy.org/1"),
             Request("http://sub.scrapy.org/1"),
             Request("http://offsite.tld/letmepass", dont_filter=True),
+            Request("http://offsite-2.tld/allow", meta={"allow_offsite": True}),
             Request("http://scrapy.test.org/"),
             Request("http://scrapy.test.org:8000/"),
         ]
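Here the existing spider middleware test gains a request that must pass the offsite filter via the new meta key, alongside the dont_filter case. To run just the affected tests locally, something like the following should work (the file paths are assumptions inferred from the test classes and functions shown in this diff, not stated in the commit):

# Paths are assumed; adjust to the actual test modules in the checkout.
# pytest tests/test_downloadermiddleware_offsite.py \
#        tests/test_spidermiddleware_offsite.py -q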