1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 23:43:59 +00:00

fix Referer logging

This commit is contained in:
Mikhail Korobov 2015-08-26 02:19:33 +05:00
parent 7da769feb2
commit 642af00bb7
4 changed files with 20 additions and 17 deletions

View File

@ -16,6 +16,7 @@ from scrapy import signals
from scrapy.http import Request, Response
from scrapy.item import BaseItem
from scrapy.core.spidermw import SpiderMiddlewareManager
from scrapy.utils.request import referer_str
logger = logging.getLogger(__name__)
@ -150,10 +151,9 @@ class Scraper(object):
if isinstance(exc, CloseSpider):
self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
return
referer = request.headers.get('Referer')
logger.error(
"Spider error processing %(request)s (referer: %(referer)s)",
{'request': request, 'referer': referer},
{'request': request, 'referer': referer_str(request)},
exc_info=failure_to_exc_info(_failure),
extra={'spider': spider}
)

View File

@ -3,6 +3,7 @@ import logging
from twisted.python.failure import Failure
from scrapy.utils.request import referer_str
SCRAPEDMSG = u"Scraped from %(src)s" + os.linesep + "%(item)s"
DROPPEDMSG = u"Dropped: %(exception)s" + os.linesep + "%(item)s"
@ -38,13 +39,16 @@ class LogFormatter(object):
'args': {
'status': response.status,
'request': request,
'referer': request.headers.get('Referer'),
'referer': referer_str(request),
'flags': flags,
}
}
def scraped(self, item, response, spider):
src = response.getErrorMessage() if isinstance(response, Failure) else response
if isinstance(response, Failure):
src = response.getErrorMessage()
else:
src = response
return {
'level': logging.DEBUG,
'msg': SCRAPEDMSG,

View File

@ -26,7 +26,8 @@ from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.http import Request
from scrapy.utils.misc import md5sum
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.python import to_bytes, to_native_str
from scrapy.utils.python import to_bytes
from scrapy.utils.request import referer_str
logger = logging.getLogger(__name__)
@ -199,7 +200,7 @@ class FilesPipeline(MediaPipeline):
if age_days > self.EXPIRES:
return # returning None force download
referer = _get_referer(request)
referer = referer_str(request)
logger.debug(
'File (uptodate): Downloaded %(medianame)s from %(request)s '
'referred in <%(referer)s>',
@ -225,7 +226,7 @@ class FilesPipeline(MediaPipeline):
def media_failed(self, failure, request, info):
if not isinstance(failure.value, IgnoreRequest):
referer = _get_referer(request)
referer = referer_str(request)
logger.warning(
'File (unknown-error): Error downloading %(medianame)s from '
'%(request)s referred in <%(referer)s>: %(exception)s',
@ -237,7 +238,7 @@ class FilesPipeline(MediaPipeline):
raise FileException
def media_downloaded(self, response, request, info):
referer = _get_referer(request)
referer = referer_str(request)
if response.status != 200:
logger.warning(
@ -339,11 +340,3 @@ class FilesPipeline(MediaPipeline):
def file_key(self, url):
return self.file_path(url)
file_key._base = True
def _get_referer(request):
    """Return the request's Referer HTTP header decoded for safe logging.

    Returns None when the header is absent; otherwise decodes the raw
    header bytes to a native string, replacing undecodable bytes.
    """
    raw = request.headers.get('Referer')
    return raw if raw is None else to_native_str(raw, errors='replace')

View File

@ -8,7 +8,6 @@ import hashlib
import weakref
from six.moves.urllib.parse import urlunparse
from twisted.internet.defer import Deferred
from w3lib.http import basic_auth_header
from scrapy.utils.python import to_bytes, to_native_str
@ -86,3 +85,10 @@ def request_httprepr(request):
s += request.body
return s
def referer_str(request):
    """Return the request's Referer HTTP header decoded for safe logging.

    Returns None when the header is absent; otherwise decodes the raw
    header bytes to a native string, replacing undecodable bytes.
    """
    raw = request.headers.get('Referer')
    return raw if raw is None else to_native_str(raw, errors='replace')