mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-26 11:44:11 +00:00
Merge pull request #553 from redapple/dupefilter-verbose
DupeFilter: add settings for verbose logging and filtered requests stats
This commit is contained in:
commit
70a57dc730
@ -410,7 +410,17 @@ The class used to detect and filter duplicate requests.
|
||||
The default (``RFPDupeFilter``) filters based on request fingerprint using
|
||||
the ``scrapy.utils.request.request_fingerprint`` function.
|
||||
|
||||
.. setting:: EDITOR
|
||||
.. setting:: DUPEFILTER_DEBUG
|
||||
|
||||
DUPEFILTER_DEBUG
|
||||
----------------
|
||||
|
||||
Default: ``False``
|
||||
|
||||
By default, ``RFPDupeFilter`` only logs the first duplicate request.
|
||||
Setting :setting:`DUPEFILTER_DEBUG` to ``True`` will make it log all duplicate requests.
|
||||
|
||||
.. setting:: EDITOR
|
||||
|
||||
EDITOR
|
||||
------
|
||||
|
@ -28,17 +28,19 @@ class BaseDupeFilter(object):
|
||||
class RFPDupeFilter(BaseDupeFilter):
|
||||
"""Request Fingerprint duplicates filter"""
|
||||
|
||||
def __init__(self, path=None):
|
||||
def __init__(self, path=None, verbose_log=False):
|
||||
self.file = None
|
||||
self.fingerprints = set()
|
||||
self.logdupes = True
|
||||
self.verbose_log = verbose_log
|
||||
if path:
|
||||
self.file = open(os.path.join(path, 'requests.seen'), 'a+')
|
||||
self.fingerprints.update(x.rstrip() for x in self.file)
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings):
|
||||
return cls(job_dir(settings))
|
||||
verbose_log = settings.getbool('DUPEFILTER_DEBUG')
|
||||
return cls(job_dir(settings), verbose_log)
|
||||
|
||||
def request_seen(self, request):
|
||||
fp = self.request_fingerprint(request)
|
||||
@ -56,7 +58,14 @@ class RFPDupeFilter(BaseDupeFilter):
|
||||
self.file.close()
|
||||
|
||||
def log(self, request, spider):
|
||||
if self.logdupes:
|
||||
fmt = "Filtered duplicate request: %(request)s - no more duplicates will be shown (see DUPEFILTER_CLASS)"
|
||||
if self.verbose_log:
|
||||
fmt = "Filtered duplicate request: %(request)s"
|
||||
log.msg(format=fmt, request=request, level=log.DEBUG, spider=spider)
|
||||
elif self.logdupes:
|
||||
fmt = ("Filtered duplicate request: %(request)s"
|
||||
" - no more duplicates will be shown"
|
||||
" (see DUPEFILTER_DEBUG to show all duplicates)")
|
||||
log.msg(format=fmt, request=request, level=log.DEBUG, spider=spider)
|
||||
self.logdupes = False
|
||||
|
||||
spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
|
||||
|
Loading…
x
Reference in New Issue
Block a user