
Deprecate BaseDupeFilter.log() and improve dupefilter docs (#4151)

* Remove BaseDupeFilter.log()

It is never called because request_seen() always returns False (see the control-flow sketch after this list).

* Document the interface of DUPEFILTER_CLASS classes

* Remove unnecessary BaseDupeFilter comments and add a short class description

* Improve the documentation related to the DUPEFILTER_CLASS setting

* Deprecate BaseDupeFilter.log

* Update the docs

* Fix the new code example

* Remove typing to keep the example short

Otherwise, it would have required yet another import line (from __future__ or typing).
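
For context on why log() was unreachable: the scheduler only calls the
dupefilter's log() on the code path where request_seen() has returned True.
A minimal sketch of that control flow, simplified from the default
scheduler's enqueue logic (not the verbatim implementation):

    def enqueue_request(self, request):
        # A request is only logged as filtered out after request_seen()
        # reports it as a duplicate; BaseDupeFilter.request_seen() always
        # returns False, so BaseDupeFilter.log() is never reached.
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return False
        ...  # otherwise the request is queued as usual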

---------

Co-authored-by: Andrey Rakhmatullin <wrar@wrar.name>
Adrián Chaves 2025-01-14 15:36:56 +01:00 committed by GitHub
parent 402500b164
commit 98ba61256d
4 changed files with 108 additions and 9 deletions

docs/conf.py

@@ -258,6 +258,10 @@ coverage_ignore_pyobjects = [
     # Base classes of downloader middlewares are implementation details that
     # are not meant for users.
     r"^scrapy\.downloadermiddlewares\.\w*?\.Base\w*?Middleware",
+    # The interface methods of duplicate request filtering classes are already
+    # covered in the interface documentation part of the DUPEFILTER_CLASS
+    # setting documentation.
+    r"^scrapy\.dupefilters\.[A-Z]\w*?\.(from_settings|request_seen|open|close|log)$",
     # Private exception used by the command-line interface implementation.
     r"^scrapy\.exceptions\.UsageError",
     # Methods of BaseItemExporter subclasses are only documented in

docs/topics/settings.rst

@@ -955,15 +955,79 @@ Default: ``'scrapy.dupefilters.RFPDupeFilter'``
 
 The class used to detect and filter duplicate requests.
 
-The default (``RFPDupeFilter``) filters based on the
+The default, :class:`~scrapy.dupefilters.RFPDupeFilter`, filters based on the
 :setting:`REQUEST_FINGERPRINTER_CLASS` setting.
 
-You can disable filtering of duplicate requests by setting
-:setting:`DUPEFILTER_CLASS` to ``'scrapy.dupefilters.BaseDupeFilter'``.
-Be very careful about this however, because you can get into crawling loops.
-It's usually a better idea to set the ``dont_filter`` parameter to
-``True`` on the specific :class:`~scrapy.Request` that should not be
-filtered.
+To change how duplicates are checked, you can point :setting:`DUPEFILTER_CLASS`
+to a custom subclass of :class:`~scrapy.dupefilters.RFPDupeFilter` that
+overrides its ``__init__`` method to use a :ref:`different request
+fingerprinting class <custom-request-fingerprinter>`. For example:
+
+.. code-block:: python
+
+    from scrapy.dupefilters import RFPDupeFilter
+    from scrapy.utils.request import fingerprint
+
+
+    class CustomRequestFingerprinter:
+        def fingerprint(self, request):
+            return fingerprint(request, include_headers=["X-ID"])
+
+
+    class CustomDupeFilter(RFPDupeFilter):
+        def __init__(self, path=None, debug=False, *, fingerprinter=None):
+            super().__init__(
+                path=path, debug=debug, fingerprinter=CustomRequestFingerprinter()
+            )
+
+To disable duplicate request filtering, set :setting:`DUPEFILTER_CLASS` to
+``'scrapy.dupefilters.BaseDupeFilter'``. Note that not filtering out duplicate
+requests may cause crawling loops. It is usually better to set the
+``dont_filter`` parameter to ``True`` when instantiating the specific
+:class:`~scrapy.Request` object that should not be filtered out.
+
+A class assigned to :setting:`DUPEFILTER_CLASS` must implement the following
+interface::
+
+    class MyDupeFilter:
+
+        @classmethod
+        def from_settings(cls, settings):
+            """Returns an instance of this duplicate request filtering class
+            based on the current crawl settings."""
+            return cls()
+
+        def request_seen(self, request):
+            """Returns ``True`` if *request* is a duplicate of another request
+            seen in a previous call to :meth:`request_seen`, or ``False``
+            otherwise."""
+            return False
+
+        def open(self):
+            """Called before the spider opens. It may return a deferred."""
+            pass
+
+        def close(self, reason):
+            """Called before the spider closes. It may return a deferred."""
+            pass
+
+        def log(self, request, spider):
+            """Logs that a request has been filtered out.
+
+            It is called right after a call to :meth:`request_seen` that
+            returns ``True``.
+
+            If :meth:`request_seen` always returns ``False``, such as in the
+            case of :class:`~scrapy.dupefilters.BaseDupeFilter`, this method
+            may be omitted.
+            """
+            pass
+
+.. autoclass:: scrapy.dupefilters.BaseDupeFilter
+
+.. autoclass:: scrapy.dupefilters.RFPDupeFilter
 
 .. setting:: DUPEFILTER_DEBUG
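
For completeness, enabling a custom filter like the CustomDupeFilter from the
documentation example above happens through the project settings; a minimal
sketch, with myproject.dupefilters as a hypothetical module path:

    # settings.py
    # "myproject/dupefilters.py" is a hypothetical location for the
    # CustomDupeFilter class shown in the documentation example above.
    DUPEFILTER_CLASS = "myproject.dupefilters.CustomDupeFilter"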

scrapy/dupefilters.py

@@ -4,6 +4,7 @@ import logging
 import warnings
 from pathlib import Path
 from typing import TYPE_CHECKING
+from warnings import warn
 
 from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.utils.job import job_dir
@@ -26,6 +27,9 @@ if TYPE_CHECKING:
 
 
 class BaseDupeFilter:
+    """Dummy duplicate request filtering class (:setting:`DUPEFILTER_CLASS`)
+    that does not filter out any request."""
+
     @classmethod
     def from_settings(cls, settings: BaseSettings) -> Self:
         warnings.warn(
@@ -50,10 +54,19 @@ class BaseDupeFilter:
 
     def log(self, request: Request, spider: Spider) -> None:
         """Log that a request has been filtered"""
+        warn(
+            "Calling BaseDupeFilter.log() is deprecated.",
+            ScrapyDeprecationWarning,
+            stacklevel=2,
+        )
 
 
 class RFPDupeFilter(BaseDupeFilter):
-    """Request Fingerprint duplicates filter"""
+    """Duplicate request filtering class (:setting:`DUPEFILTER_CLASS`) that
+    filters out requests with the canonical
+    (:func:`w3lib.url.canonicalize_url`) :attr:`~scrapy.http.Request.url`,
+    :attr:`~scrapy.http.Request.method` and :attr:`~scrapy.http.Request.body`.
+    """
 
     def __init__(
         self,
@@ -117,6 +130,7 @@ class RFPDupeFilter(BaseDupeFilter):
         return False
 
     def request_fingerprint(self, request: Request) -> str:
+        """Returns a string that uniquely identifies the specified request."""
         return self.fingerprinter.fingerprint(request).hex()
 
     def close(self, reason: str) -> None:
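
To see the documented behavior in isolation, a minimal sketch of using
RFPDupeFilter directly, assuming a Scrapy version where the fingerprinter
argument defaults to the standard request fingerprinter when omitted:

    from scrapy.dupefilters import RFPDupeFilter
    from scrapy.http import Request

    df = RFPDupeFilter()
    print(df.request_seen(Request("https://example.com")))  # False: first sighting
    print(df.request_seen(Request("https://example.com")))  # True: duplicate
    print(df.request_fingerprint(Request("https://example.com")))  # hex string
    df.close("finished")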

tests/test_dupefilters.py

@@ -4,11 +4,13 @@ import sys
 import tempfile
 import unittest
 from pathlib import Path
+from warnings import catch_warnings
 
 from testfixtures import LogCapture
 
 from scrapy.core.scheduler import Scheduler
-from scrapy.dupefilters import RFPDupeFilter
+from scrapy.dupefilters import BaseDupeFilter, RFPDupeFilter
+from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.http import Request
 from scrapy.utils.python import to_bytes
 from scrapy.utils.test import get_crawler
@@ -252,3 +254,18 @@ class RFPDupeFilterTest(unittest.TestCase):
         )
         dupefilter.close("finished")
+
+
+class BaseDupeFilterTestCase(unittest.TestCase):
+    def test_log_deprecation(self):
+        dupefilter = _get_dupefilter(
+            settings={"DUPEFILTER_CLASS": BaseDupeFilter},
+        )
+        with catch_warnings(record=True) as warning_list:
+            dupefilter.log(None, None)
+        self.assertEqual(len(warning_list), 1)
+        self.assertEqual(
+            str(warning_list[0].message),
+            "Calling BaseDupeFilter.log() is deprecated.",
+        )
+        self.assertEqual(warning_list[0].category, ScrapyDeprecationWarning)