Mirror of https://github.com/scrapy/scrapy.git
Merge pull request #553 from redapple/dupefilter-verbose
DupeFilter: add settings for verbose logging and filtered requests stats
Commit: 70a57dc730
@@ -410,7 +410,17 @@ The class used to detect and filter duplicate requests.
 The default (``RFPDupeFilter``) filters based on request fingerprint using
 the ``scrapy.utils.request.request_fingerprint`` function.
 
-.. setting:: EDITOR
+.. setting:: DUPEFILTER_DEBUG
+
+DUPEFILTER_DEBUG
+----------------
+
+Default: ``False``
+
+By default, ``RFPDupeFilter`` only logs the first duplicate request.
+Setting :setting:`DUPEFILTER_DEBUG` to ``True`` will make it log all duplicate requests.
+
+.. setting:: EDITOR
 
 EDITOR
 ------
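(Not part of the patch: with the setting documented above, turning on the verbose behaviour is a one-line change in a project's settings.py; the project itself is hypothetical.)

    # settings.py of a hypothetical Scrapy project
    # Log every filtered duplicate request instead of only the first one.
    DUPEFILTER_DEBUG = True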
@@ -28,17 +28,19 @@ class BaseDupeFilter(object):
 class RFPDupeFilter(BaseDupeFilter):
     """Request Fingerprint duplicates filter"""
 
-    def __init__(self, path=None):
+    def __init__(self, path=None, verbose_log=False):
         self.file = None
         self.fingerprints = set()
         self.logdupes = True
+        self.verbose_log = verbose_log
         if path:
             self.file = open(os.path.join(path, 'requests.seen'), 'a+')
             self.fingerprints.update(x.rstrip() for x in self.file)
 
     @classmethod
     def from_settings(cls, settings):
-        return cls(job_dir(settings))
+        verbose_log = settings.getbool('DUPEFILTER_DEBUG')
+        return cls(job_dir(settings), verbose_log)
 
     def request_seen(self, request):
         fp = self.request_fingerprint(request)
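(Not part of the patch: a rough sketch of what the new from_settings() wiring gives you, assuming the class as patched in this hunk, the scrapy.dupefilter module path of this era (later releases renamed it to scrapy.dupefilters), and a Settings object that accepts a dict of overrides.)

    # Sketch only: DUPEFILTER_DEBUG flows into the verbose_log attribute.
    from scrapy.settings import Settings
    from scrapy.dupefilter import RFPDupeFilter  # scrapy.dupefilters in newer releases

    settings = Settings({'DUPEFILTER_DEBUG': True})     # no JOBDIR, so the filter stays in memory
    dupefilter = RFPDupeFilter.from_settings(settings)
    assert dupefilter.verbose_log                       # the setting was picked up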
@@ -56,7 +58,14 @@ class RFPDupeFilter(BaseDupeFilter):
             self.file.close()
 
     def log(self, request, spider):
-        if self.logdupes:
-            fmt = "Filtered duplicate request: %(request)s - no more duplicates will be shown (see DUPEFILTER_CLASS)"
+        if self.verbose_log:
+            fmt = "Filtered duplicate request: %(request)s"
+            log.msg(format=fmt, request=request, level=log.DEBUG, spider=spider)
+        elif self.logdupes:
+            fmt = ("Filtered duplicate request: %(request)s"
+                   " - no more duplicates will be shown"
+                   " (see DUPEFILTER_DEBUG to show all duplicates)")
             log.msg(format=fmt, request=request, level=log.DEBUG, spider=spider)
             self.logdupes = False
+
+        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
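(Not part of the patch: the new dupefilter/filtered counter shows up alongside the other crawl stats. A small sketch of reading it back, e.g. from a spider_closed handler: the handler itself is hypothetical, but spider.crawler.stats is the same object the patch increments and get_value() is the standard StatsCollector API.)

    # Sketch: report how many duplicate requests were filtered during the crawl.
    def report_filtered(spider):
        filtered = spider.crawler.stats.get_value('dupefilter/filtered', 0)
        spider.log("duplicate requests filtered: %d" % filtered)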