1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 20:43:50 +00:00

made offsite middleware log messages when filtering out requests

This commit is contained in:
Pablo Hoffman 2009-11-12 10:17:21 -02:00
parent ee08d38ab6
commit 415dec4e16
3 changed files with 41 additions and 3 deletions

View File

@ -118,3 +118,15 @@ written in a ``my_spider.py`` file you can run it with::
scrapy-ctl.py runspider my_spider.py
I get "Filtered offsite request" messages. How can I fix them?
--------------------------------------------------------------
Those messages (logged with ``DEBUG`` level) don't necessarily mean there is a
problem, so you may not need to fix them.
Those messages are thrown by the Offsite Spider Middleware, which is a spider
middleware (enabled by default) whose purpose is to filter out requests to
domains outside the ones covered by the spider.
For more info see:
:class:`~scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware`.

View File

@ -210,12 +210,24 @@ OffsiteMiddleware
Filters out Requests for URLs outside the domains covered by the spider.
This middleware filters out every request whose host names doesn't match
This middleware filters out every request whose host names don't match
:attr:`~scrapy.spider.BaseSpider.domain_name`, or the spider
:attr:`~scrapy.spider.BaseSpider.domain_name` prefixed by "www.".
Spiders can add more allowed domains using the
:attr:`~scrapy.spider.BaseSpider.extra_domain_names` attribute.
When your spider returns a request for a domain not belonging to those
covered by the spider, this middleware will log a debug message similar to
this one::
DEBUG: Filtered offsite request to 'www.othersite.com': <GET http://www.othersite.com/some/page.html>
To avoid filling the log with too much noise, it will only print one of
these messages for each new domain filtered. So, for example, if another
request for ``www.othersite.com`` is filtered, no log message will be
printed. But if a request for ``someothersite.com`` is filtered, a message
will be printed (but only for the first request filtered).
RefererMiddleware
-----------------

View File

@ -10,17 +10,29 @@ from scrapy.xlib.pydispatch import dispatcher
from scrapy.core import signals
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached
from scrapy import log
class OffsiteMiddleware(object):
def __init__(self):
    """Initialize per-spider state and subscribe to spider lifecycle signals."""
    # host_regexes maps each spider to its compiled allowed-host regex;
    # domains_seen maps each spider to the set of offsite domains already
    # logged, so each filtered domain is reported only once.
    self.host_regexes = {}
    self.domains_seen = {}
    for signal, handler in ((signals.spider_opened, self.spider_opened),
                            (signals.spider_closed, self.spider_closed)):
        dispatcher.connect(handler, signal=signal)
def process_spider_output(self, response, result, spider):
    """Yield everything in *result*, dropping offsite Requests.

    Non-Request items pass through untouched.  A Request whose host does
    not match the spider's allowed-host regex (see ``should_follow``) is
    silently dropped, except that the FIRST request filtered for each new
    domain is logged at DEBUG level so the log is not flooded.

    NOTE(review): the rendered diff showed both the removed one-liner
    (``return (x for x in result ...)``) and the new loop body; only the
    new implementation is kept here, since the old ``return`` would have
    made the loop unreachable.
    """
    for x in result:
        if not isinstance(x, Request):
            yield x
        elif self.should_follow(x, spider):
            yield x
        else:
            domain = urlparse_cached(x).hostname
            # Log only once per newly seen offsite domain.
            if domain and domain not in self.domains_seen[spider]:
                log.msg("Filtered offsite request to %r: %s" % (domain, x),
                        level=log.DEBUG, spider=spider)
                self.domains_seen[spider].add(domain)
def should_follow(self, request, spider):
regex = self.host_regexes[spider]
@ -37,6 +49,8 @@ class OffsiteMiddleware(object):
def spider_opened(self, spider):
    """Build the allowed-host regex for *spider* and reset its seen-domain set."""
    # The spider's own domain plus any extras it declares are all allowed.
    allowed = [spider.domain_name] + spider.extra_domain_names
    self.host_regexes[spider] = self.get_host_regex(allowed)
    self.domains_seen[spider] = set()
def spider_closed(self, spider):
    """Discard all per-spider state once *spider* has finished."""
    # Drop both the compiled regex and the logged-domain set to avoid
    # keeping closed spiders alive in these dicts.
    for table in (self.host_regexes, self.domains_seen):
        del table[spider]