mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-24 20:43:50 +00:00
made offsite middleware log messages when filtering out requests
This commit is contained in:
parent
ee08d38ab6
commit
415dec4e16
12
docs/faq.rst
12
docs/faq.rst
@ -118,3 +118,15 @@ written in a ``my_spider.py`` file you can run it with::
|
||||
|
||||
scrapy-ctl.py runspider my_spider.py
|
||||
|
||||
I get "Filtered offsite request" messages. How can I fix them?
|
||||
--------------------------------------------------------------
|
||||
|
||||
Those messages (logged with ``DEBUG`` level) don't necessarily mean there is a
|
||||
problem, so you may not need to fix them.
|
||||
|
||||
Those messages are thrown by the Offsite Spider Middleware, which is a spider
|
||||
middleware (enabled by default) whose purpose is to filter out requests to
|
||||
domains outside the ones covered by the spider.
|
||||
|
||||
For more info see:
|
||||
:class:`~scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware`.
|
||||
|
@ -210,12 +210,24 @@ OffsiteMiddleware
|
||||
|
||||
Filters out Requests for URLs outside the domains covered by the spider.
|
||||
|
||||
This middleware filters out every request whose host names doesn't match
|
||||
This middleware filters out every request whose host names don't match
|
||||
:attr:`~scrapy.spider.BaseSpider.domain_name`, or the spider
|
||||
:attr:`~scrapy.spider.BaseSpider.domain_name` prefixed by "www.".
|
||||
Spiders can add more domains to exclude from filtering using the
|
||||
:attr:`~scrapy.spider.BaseSpider.extra_domain_names` attribute.
|
||||
|
||||
When your spider returns a request for a domain not belonging to those
|
||||
covered by the spider, this middleware will log a debug message similar to
|
||||
this one::
|
||||
|
||||
DEBUG: Filtered offsite request to 'www.othersite.com': <GET http://www.othersite.com/some/page.html>
|
||||
|
||||
To avoid filling the log with too much noise, it will only print one of
|
||||
these messages for each new domain filtered. So, for example, if another
|
||||
request for ``www.othersite.com`` is filtered, no log message will be
|
||||
printed. But if a request for ``someothersite.com`` is filtered, a message
|
||||
will be printed (but only for the first request filtered).
|
||||
|
||||
RefererMiddleware
|
||||
-----------------
|
||||
|
||||
|
@ -10,17 +10,29 @@ from scrapy.xlib.pydispatch import dispatcher
|
||||
from scrapy.core import signals
|
||||
from scrapy.http import Request
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy import log
|
||||
|
||||
class OffsiteMiddleware(object):
|
||||
|
||||
def __init__(self):
|
||||
self.host_regexes = {}
|
||||
self.domains_seen = {}
|
||||
dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
|
||||
dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
|
||||
|
||||
def process_spider_output(self, response, result, spider):
|
||||
return (x for x in result if not isinstance(x, Request) or \
|
||||
self.should_follow(x, spider))
|
||||
for x in result:
|
||||
if isinstance(x, Request):
|
||||
if self.should_follow(x, spider):
|
||||
yield x
|
||||
else:
|
||||
domain = urlparse_cached(x).hostname
|
||||
if domain and domain not in self.domains_seen[spider]:
|
||||
log.msg("Filtered offsite request to %r: %s" % (domain, x),
|
||||
level=log.DEBUG, spider=spider)
|
||||
self.domains_seen[spider].add(domain)
|
||||
else:
|
||||
yield x
|
||||
|
||||
def should_follow(self, request, spider):
|
||||
regex = self.host_regexes[spider]
|
||||
@ -37,6 +49,8 @@ class OffsiteMiddleware(object):
|
||||
def spider_opened(self, spider):
|
||||
domains = [spider.domain_name] + spider.extra_domain_names
|
||||
self.host_regexes[spider] = self.get_host_regex(domains)
|
||||
self.domains_seen[spider] = set()
|
||||
|
||||
def spider_closed(self, spider):
|
||||
del self.host_regexes[spider]
|
||||
del self.domains_seen[spider]
|
||||
|
Loading…
x
Reference in New Issue
Block a user