diff --git a/docs/conf.py b/docs/conf.py
index fd8165db3..8196b6934 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -258,6 +258,10 @@ coverage_ignore_pyobjects = [
     # Base classes of downloader middlewares are implementation details that
     # are not meant for users.
     r"^scrapy\.downloadermiddlewares\.\w*?\.Base\w*?Middleware",
+    # The interface methods of duplicate request filtering classes are already
+    # covered in the interface documentation part of the DUPEFILTER_CLASS
+    # setting documentation.
+    r"^scrapy\.dupefilters\.[A-Z]\w*?\.(from_settings|request_seen|open|close|log)$",
     # Private exception used by the command-line interface implementation.
     r"^scrapy\.exceptions\.UsageError",
     # Methods of BaseItemExporter subclasses are only documented in
diff --git a/docs/topics/settings.rst b/docs/topics/settings.rst
index 76904a26e..06974f336 100644
--- a/docs/topics/settings.rst
+++ b/docs/topics/settings.rst
@@ -955,15 +955,79 @@
 Default: ``'scrapy.dupefilters.RFPDupeFilter'``
 
 The class used to detect and filter duplicate requests.
 
-The default (``RFPDupeFilter``) filters based on the
+The default, :class:`~scrapy.dupefilters.RFPDupeFilter`, filters based on the
 :setting:`REQUEST_FINGERPRINTER_CLASS` setting.
 
-You can disable filtering of duplicate requests by setting
-:setting:`DUPEFILTER_CLASS` to ``'scrapy.dupefilters.BaseDupeFilter'``.
-Be very careful about this however, because you can get into crawling loops.
-It's usually a better idea to set the ``dont_filter`` parameter to
-``True`` on the specific :class:`~scrapy.Request` that should not be
-filtered.
+To change how duplicates are checked, you can point :setting:`DUPEFILTER_CLASS`
+to a custom subclass of :class:`~scrapy.dupefilters.RFPDupeFilter` that
+overrides its ``__init__`` method to use a :ref:`different request
+fingerprinting class <custom-request-fingerprinter>`. For example:
+
+.. code-block:: python
+
+    from scrapy.dupefilters import RFPDupeFilter
+    from scrapy.utils.request import fingerprint
+
+
+    class CustomRequestFingerprinter:
+        def fingerprint(self, request):
+            return fingerprint(request, include_headers=["X-ID"])
+
+
+    class CustomDupeFilter(RFPDupeFilter):
+
+        def __init__(self, path=None, debug=False, *, fingerprinter=None):
+            super().__init__(
+                path=path, debug=debug, fingerprinter=CustomRequestFingerprinter()
+            )
+
+To disable duplicate request filtering, set :setting:`DUPEFILTER_CLASS` to
+``'scrapy.dupefilters.BaseDupeFilter'``. Note that not filtering out duplicate
+requests may cause crawling loops. It is usually better to pass
+``dont_filter=True`` to the ``__init__`` method of the specific
+:class:`~scrapy.Request` objects that should not be filtered out.
+
+A class assigned to :setting:`DUPEFILTER_CLASS` must implement the following
+interface::
+
+    class MyDupeFilter:
+
+        @classmethod
+        def from_settings(cls, settings):
+            """Returns an instance of this duplicate request filtering class
+            based on the current crawl settings."""
+            return cls()
+
+        def request_seen(self, request):
+            """Returns ``True`` if *request* is a duplicate of another request
+            seen in a previous call to :meth:`request_seen`, or ``False``
+            otherwise."""
+            return False
+
+        def open(self):
+            """Called before the spider opens. It may return a deferred."""
+            pass
+
+        def close(self, reason):
+            """Called before the spider closes. It may return a deferred."""
+            pass
+
+        def log(self, request, spider):
+            """Logs that a request has been filtered out.
+
+            It is called right after a call to :meth:`request_seen` that
+            returns ``True``.
+
+            If :meth:`request_seen` always returns ``False``, such as in the
+            case of :class:`~scrapy.dupefilters.BaseDupeFilter`, this method
+            may be omitted.
+            """
+            pass
+
+.. autoclass:: scrapy.dupefilters.BaseDupeFilter
+
+.. autoclass:: scrapy.dupefilters.RFPDupeFilter
+
 
 .. setting:: DUPEFILTER_DEBUG
diff --git a/scrapy/dupefilters.py b/scrapy/dupefilters.py
index caf69daf4..a3e2c5eb4 100644
--- a/scrapy/dupefilters.py
+++ b/scrapy/dupefilters.py
@@ -4,6 +4,7 @@ import logging
 import warnings
 from pathlib import Path
 from typing import TYPE_CHECKING
+from warnings import warn
 
 from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.utils.job import job_dir
@@ -26,6 +27,9 @@ if TYPE_CHECKING:
 
 
 class BaseDupeFilter:
+    """Dummy duplicate request filtering class (:setting:`DUPEFILTER_CLASS`)
+    that does not filter out any request."""
+
     @classmethod
     def from_settings(cls, settings: BaseSettings) -> Self:
         warnings.warn(
@@ -50,10 +54,19 @@
 
     def log(self, request: Request, spider: Spider) -> None:
         """Log that a request has been filtered"""
+        warn(
+            "Calling BaseDupeFilter.log() is deprecated.",
+            ScrapyDeprecationWarning,
+            stacklevel=2,
+        )
 
 
 class RFPDupeFilter(BaseDupeFilter):
-    """Request Fingerprint duplicates filter"""
+    """Duplicate request filtering class (:setting:`DUPEFILTER_CLASS`) that
+    filters out requests whose canonical (:func:`w3lib.url.canonicalize_url`)
+    :attr:`~scrapy.http.Request.url`, :attr:`~scrapy.http.Request.method` and
+    :attr:`~scrapy.http.Request.body` match those of a previously seen
+    request."""
 
     def __init__(
         self,
@@ -117,6 +130,7 @@
         return False
 
     def request_fingerprint(self, request: Request) -> str:
+        """Returns a string that uniquely identifies the specified request."""
         return self.fingerprinter.fingerprint(request).hex()
 
     def close(self, reason: str) -> None:
diff --git a/tests/test_dupefilters.py b/tests/test_dupefilters.py
index 4fd648f48..703c23529 100644
--- a/tests/test_dupefilters.py
+++ b/tests/test_dupefilters.py
@@ -4,11 +4,13 @@ import sys
 import tempfile
 import unittest
 from pathlib import Path
+from warnings import catch_warnings
 
 from testfixtures import LogCapture
 
 from scrapy.core.scheduler import Scheduler
-from scrapy.dupefilters import RFPDupeFilter
+from scrapy.dupefilters import BaseDupeFilter, RFPDupeFilter
+from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.http import Request
 from scrapy.utils.python import to_bytes
 from scrapy.utils.test import get_crawler
@@ -252,3 +254,18 @@
         )
 
         dupefilter.close("finished")
+
+
+class BaseDupeFilterTestCase(unittest.TestCase):
+    def test_log_deprecation(self):
+        dupefilter = _get_dupefilter(
+            settings={"DUPEFILTER_CLASS": BaseDupeFilter},
+        )
+        with catch_warnings(record=True) as warning_list:
+            dupefilter.log(None, None)
+        self.assertEqual(len(warning_list), 1)
+        self.assertEqual(
+            str(warning_list[0].message),
+            "Calling BaseDupeFilter.log() is deprecated.",
+        )
+        self.assertEqual(warning_list[0].category, ScrapyDeprecationWarning)
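For readers who want to try the interface documented in the settings.rst hunk above, the sketch below is one minimal way a complete implementation might look: it tracks request fingerprints in an in-memory set. It is illustrative only and not part of the patch; the class name ``SeenSetDupeFilter`` is made up, and the only API it relies on is :func:`scrapy.utils.request.fingerprint`, which the docs example above also imports.

.. code-block:: python

    from scrapy.utils.request import fingerprint


    class SeenSetDupeFilter:
        """A minimal sketch of the DUPEFILTER_CLASS interface, backed by an
        in-memory set of request fingerprints (state is lost between runs)."""

        def __init__(self):
            self.fingerprints = set()

        @classmethod
        def from_settings(cls, settings):
            # Scrapy passes the crawl settings here; this sketch ignores them.
            return cls()

        def request_seen(self, request):
            # fingerprint() returns bytes derived from the canonical URL,
            # method and body by default.
            fp = fingerprint(request)
            if fp in self.fingerprints:
                return True
            self.fingerprints.add(fp)
            return False

        def open(self):
            pass

        def close(self, reason):
            pass

        def log(self, request, spider):
            spider.logger.debug(
                "Filtered duplicate request: %(request)s", {"request": request}
            )

Such a class would be enabled by pointing :setting:`DUPEFILTER_CLASS` at its import path, e.g. ``DUPEFILTER_CLASS = "myproject.dupefilters.SeenSetDupeFilter"`` (the module path here is hypothetical).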