diff --git a/docs/topics/downloader-middleware.rst b/docs/topics/downloader-middleware.rst index af7885a45..1ab8f588f 100644 --- a/docs/topics/downloader-middleware.rst +++ b/docs/topics/downloader-middleware.rst @@ -797,9 +797,12 @@ OffsiteMiddleware :attr:`~scrapy.Spider.allowed_domains` attribute, or the attribute is empty, the offsite middleware will allow all requests. - If the request has the :attr:`~scrapy.Request.dont_filter` attribute - set, the offsite middleware will allow the request even if its domain is not - listed in allowed domains. + .. reqmeta:: allow_offsite + + If the request has the :attr:`~scrapy.Request.dont_filter` attribute set to + ``True`` or :attr:`Request.meta` has ``allow_offsite`` set to ``True``, then + the OffsiteMiddleware will allow the request even if its domain is not listed + in allowed domains. RedirectMiddleware ------------------ diff --git a/docs/topics/request-response.rst b/docs/topics/request-response.rst index 1bb1a10a4..b187f3aaf 100644 --- a/docs/topics/request-response.rst +++ b/docs/topics/request-response.rst @@ -145,9 +145,9 @@ Request objects :type priority: int :param dont_filter: indicates that this request should not be filtered by - the scheduler. This is used when you want to perform an identical - request multiple times, to ignore the duplicates filter. Use it with - care, or you will get into crawling loops. Default to ``False``. + the scheduler or some middlewares. This is used when you want to perform + an identical request multiple times, to ignore the duplicates filter. + Use it with care, or you will get into crawling loops. Defaults to ``False``. :type dont_filter: bool :param errback: a function that will be called if any exception was @@ -660,6 +660,7 @@ are some special keys recognized by Scrapy and its built-in extensions. 
Those are: +* :reqmeta:`allow_offsite` * :reqmeta:`autothrottle_dont_adjust_delay` * :reqmeta:`bindaddress` * :reqmeta:`cookiejar` diff --git a/scrapy/downloadermiddlewares/offsite.py b/scrapy/downloadermiddlewares/offsite.py index a69f531a7..a2cff65e7 100644 --- a/scrapy/downloadermiddlewares/offsite.py +++ b/scrapy/downloadermiddlewares/offsite.py @@ -40,7 +40,11 @@ class OffsiteMiddleware: self.process_request(request, spider) def process_request(self, request: Request, spider: Spider) -> None: - if request.dont_filter or self.should_follow(request, spider): + if ( + request.dont_filter + or request.meta.get("allow_offsite") + or self.should_follow(request, spider) + ): return domain = urlparse_cached(request).hostname if domain and domain not in self.domains_seen: diff --git a/scrapy/spidermiddlewares/offsite.py b/scrapy/spidermiddlewares/offsite.py index d3ed64ef5..95e753830 100644 --- a/scrapy/spidermiddlewares/offsite.py +++ b/scrapy/spidermiddlewares/offsite.py @@ -61,7 +61,11 @@ class OffsiteMiddleware: def _filter(self, request: Any, spider: Spider) -> bool: if not isinstance(request, Request): return True - if request.dont_filter or self.should_follow(request, spider): + if ( + request.dont_filter + or request.meta.get("allow_offsite") + or self.should_follow(request, spider) + ): return True domain = urlparse_cached(request).hostname if domain and domain not in self.domains_seen: diff --git a/tests/test_downloadermiddleware_offsite.py b/tests/test_downloadermiddleware_offsite.py index fec56a39f..23a1d06da 100644 --- a/tests/test_downloadermiddleware_offsite.py +++ b/tests/test_downloadermiddleware_offsite.py @@ -64,6 +64,37 @@ def test_process_request_dont_filter(value, filtered): assert mw.process_request(request, spider) is None +@pytest.mark.parametrize( + ("allow_offsite", "dont_filter", "filtered"), + ( + (True, UNSET, False), + (True, None, False), + (True, False, False), + (True, True, False), + (False, UNSET, True), + (False, None, True), + 
(False, False, True), + (False, True, False), + ), +) +def test_process_request_allow_offsite(allow_offsite, dont_filter, filtered): + crawler = get_crawler(Spider) + spider = crawler._create_spider(name="a", allowed_domains=["a.example"]) + mw = OffsiteMiddleware.from_crawler(crawler) + mw.spider_opened(spider) + kwargs = {"meta": {}} + if allow_offsite is not UNSET: + kwargs["meta"]["allow_offsite"] = allow_offsite + if dont_filter is not UNSET: + kwargs["dont_filter"] = dont_filter + request = Request("https://b.example", **kwargs) + if filtered: + with pytest.raises(IgnoreRequest): + mw.process_request(request, spider) + else: + assert mw.process_request(request, spider) is None + + @pytest.mark.parametrize( "value", ( diff --git a/tests/test_spidermiddleware_offsite.py b/tests/test_spidermiddleware_offsite.py index 837f1c2c8..906928e01 100644 --- a/tests/test_spidermiddleware_offsite.py +++ b/tests/test_spidermiddleware_offsite.py @@ -29,6 +29,7 @@ class TestOffsiteMiddleware(TestCase): Request("http://scrapy.org/1"), Request("http://sub.scrapy.org/1"), Request("http://offsite.tld/letmepass", dont_filter=True), + Request("http://offsite-2.tld/allow", meta={"allow_offsite": True}), Request("http://scrapy.test.org/"), Request("http://scrapy.test.org:8000/"), ]