1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 23:23:47 +00:00

Do not filter requests with dont_filter attribute set in OffsiteMiddleware

This commit is contained in:
Daniel Grana 2011-09-08 15:18:10 -03:00
parent bff3d31469
commit 5f1b1c05f8
3 changed files with 10 additions and 4 deletions

View File

@@ -211,7 +211,7 @@ OffsiteMiddleware
----------------- -----------------
.. module:: scrapy.contrib.spidermiddleware.offsite .. module:: scrapy.contrib.spidermiddleware.offsite
:synopsis: Offiste Spider Middleware :synopsis: Offsite Spider Middleware
.. class:: OffsiteMiddleware .. class:: OffsiteMiddleware
@@ -236,6 +236,10 @@ OffsiteMiddleware
:attr:`~scrapy.spider.BaseSpider.allowed_domains` attribute, or the :attr:`~scrapy.spider.BaseSpider.allowed_domains` attribute, or the
attribute is empty, the offsite middleware will allow all requests. attribute is empty, the offsite middleware will allow all requests.
If the request has the :attr:`~scrapy.http.Request.dont_filter` attribute
set, the offsite middleware will allow the request even if its domain is not
listed in allowed domains.
RefererMiddleware RefererMiddleware
----------------- -----------------

View File

@@ -23,7 +23,7 @@ class OffsiteMiddleware(object):
def process_spider_output(self, response, result, spider): def process_spider_output(self, response, result, spider):
for x in result: for x in result:
if isinstance(x, Request): if isinstance(x, Request):
if self.should_follow(x, spider): if x.dont_filter or self.should_follow(x, spider):
yield x yield x
else: else:
domain = urlparse_cached(x).hostname domain = urlparse_cached(x).hostname

View File

@@ -20,8 +20,10 @@ class TestOffsiteMiddleware(TestCase):
onsite_reqs = [Request('http://scrapytest.org/1'), onsite_reqs = [Request('http://scrapytest.org/1'),
Request('http://scrapy.org/1'), Request('http://scrapy.org/1'),
Request('http://sub.scrapy.org/1')] Request('http://sub.scrapy.org/1'),
offsite_reqs = [Request('http://scrapy2.org')] Request('http://offsite.tld/letmepass', dont_filter=True)]
offsite_reqs = [Request('http://scrapy2.org'),
Request('http://offsite.tld/')]
reqs = onsite_reqs + offsite_reqs reqs = onsite_reqs + offsite_reqs
out = list(self.mw.process_spider_output(res, reqs, self.spider)) out = list(self.mw.process_spider_output(res, reqs, self.spider))