1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-26 16:03:49 +00:00

Merge pull request #553 from redapple/dupefilter-verbose

DupeFilter: add settings for verbose logging and filtered requests stats
This commit is contained in:
Daniel Graña 2014-02-18 05:21:32 -08:00
commit 70a57dc730
2 changed files with 32 additions and 13 deletions

View File

@@ -410,7 +410,17 @@ The class used to detect and filter duplicate requests.
The default (``RFPDupeFilter``) filters based on request fingerprint using The default (``RFPDupeFilter``) filters based on request fingerprint using
the ``scrapy.utils.request.request_fingerprint`` function. the ``scrapy.utils.request.request_fingerprint`` function.
.. setting:: EDITOR .. setting:: DUPEFILTER_DEBUG
DUPEFILTER_DEBUG
----------------
Default: ``False``
By default, ``RFPDupeFilter`` only logs the first duplicate request.
Setting :setting:`DUPEFILTER_DEBUG` to ``True`` will make it log all duplicate requests.
.. setting:: EDITOR
EDITOR EDITOR
------ ------

View File

@@ -28,17 +28,19 @@ class BaseDupeFilter(object):
class RFPDupeFilter(BaseDupeFilter): class RFPDupeFilter(BaseDupeFilter):
"""Request Fingerprint duplicates filter""" """Request Fingerprint duplicates filter"""
def __init__(self, path=None): def __init__(self, path=None, verbose_log=False):
self.file = None self.file = None
self.fingerprints = set() self.fingerprints = set()
self.logdupes = True self.logdupes = True
self.verbose_log = verbose_log
if path: if path:
self.file = open(os.path.join(path, 'requests.seen'), 'a+') self.file = open(os.path.join(path, 'requests.seen'), 'a+')
self.fingerprints.update(x.rstrip() for x in self.file) self.fingerprints.update(x.rstrip() for x in self.file)
@classmethod @classmethod
def from_settings(cls, settings): def from_settings(cls, settings):
return cls(job_dir(settings)) verbose_log = settings.getbool('DUPEFILTER_DEBUG')
return cls(job_dir(settings), verbose_log)
def request_seen(self, request): def request_seen(self, request):
fp = self.request_fingerprint(request) fp = self.request_fingerprint(request)
@@ -56,7 +58,14 @@ class RFPDupeFilter(BaseDupeFilter):
self.file.close() self.file.close()
def log(self, request, spider): def log(self, request, spider):
if self.logdupes: if self.verbose_log:
fmt = "Filtered duplicate request: %(request)s - no more duplicates will be shown (see DUPEFILTER_CLASS)" fmt = "Filtered duplicate request: %(request)s"
log.msg(format=fmt, request=request, level=log.DEBUG, spider=spider)
elif self.logdupes:
fmt = ("Filtered duplicate request: %(request)s"
" - no more duplicates will be shown"
" (see DUPEFILTER_DEBUG to show all duplicates)")
log.msg(format=fmt, request=request, level=log.DEBUG, spider=spider) log.msg(format=fmt, request=request, level=log.DEBUG, spider=spider)
self.logdupes = False self.logdupes = False
spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)