
Deprecate BaseDupeFilter.log() and improve dupefilter docs (#4151)

* Remove BaseDupeFilter.log()

It is never called because request_seen() always returns False (see the control-flow sketch after this list).

* Document the interface of DUPEFILTER_CLASS classes

* Remove unnecessary BaseDupeFilter comments and add a short class description

* Improve the documentation related to the DUPEFILTER_CLASS setting

* Deprecate BaseDupeFilter.log

* Update the docs

* Fix the new code example

* Remove typing to keep the example short

Otherwise, it would have required yet another import line (from __future__ or typing).
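
For context on why log() was unreachable: the scheduler only calls the
dupefilter's log() on the code path where request_seen() has returned True.
A minimal sketch of that control flow, simplified from the default
scheduler's enqueue logic (not the verbatim implementation):

    def enqueue_request(self, request):
        # A request is only logged as filtered out after request_seen()
        # reports it as a duplicate; BaseDupeFilter.request_seen() always
        # returns False, so BaseDupeFilter.log() is never reached.
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return False
        ...  # otherwise the request is queued as usual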

---------

Co-authored-by: Andrey Rakhmatullin <wrar@wrar.name>
Adrián Chaves 2025-01-14 15:36:56 +01:00 committed by GitHub
parent 402500b164
commit 98ba61256d
4 changed files with 108 additions and 9 deletions

docs/conf.py

@@ -258,6 +258,10 @@ coverage_ignore_pyobjects = [
     # Base classes of downloader middlewares are implementation details that
     # are not meant for users.
     r"^scrapy\.downloadermiddlewares\.\w*?\.Base\w*?Middleware",
+    # The interface methods of duplicate request filtering classes are already
+    # covered in the interface documentation part of the DUPEFILTER_CLASS
+    # setting documentation.
+    r"^scrapy\.dupefilters\.[A-Z]\w*?\.(from_settings|request_seen|open|close|log)$",
     # Private exception used by the command-line interface implementation.
     r"^scrapy\.exceptions\.UsageError",
     # Methods of BaseItemExporter subclasses are only documented in

docs/topics/settings.rst

@@ -955,15 +955,79 @@ Default: ``'scrapy.dupefilters.RFPDupeFilter'``
 
 The class used to detect and filter duplicate requests.
 
-The default (``RFPDupeFilter``) filters based on the
+The default, :class:`~scrapy.dupefilters.RFPDupeFilter`, filters based on the
 :setting:`REQUEST_FINGERPRINTER_CLASS` setting.
 
-You can disable filtering of duplicate requests by setting
-:setting:`DUPEFILTER_CLASS` to ``'scrapy.dupefilters.BaseDupeFilter'``.
-Be very careful about this however, because you can get into crawling loops.
-It's usually a better idea to set the ``dont_filter`` parameter to
-``True`` on the specific :class:`~scrapy.Request` that should not be
-filtered.
+To change how duplicates are checked, you can point :setting:`DUPEFILTER_CLASS`
+to a custom subclass of :class:`~scrapy.dupefilters.RFPDupeFilter` that
+overrides its ``__init__`` method to use a :ref:`different request
+fingerprinting class <custom-request-fingerprinter>`. For example:
+
+.. code-block:: python
+
+    from scrapy.dupefilters import RFPDupeFilter
+    from scrapy.utils.request import fingerprint
+
+
+    class CustomRequestFingerprinter:
+        def fingerprint(self, request):
+            return fingerprint(request, include_headers=["X-ID"])
+
+
+    class CustomDupeFilter(RFPDupeFilter):
+        def __init__(self, path=None, debug=False, *, fingerprinter=None):
+            super().__init__(
+                path=path, debug=debug, fingerprinter=CustomRequestFingerprinter()
+            )
+
+To disable duplicate request filtering, set :setting:`DUPEFILTER_CLASS` to
+``'scrapy.dupefilters.BaseDupeFilter'``. Note that not filtering out duplicate
+requests may cause crawling loops. It is usually better to set the
+``dont_filter`` parameter to ``True`` when instantiating the specific
+:class:`~scrapy.Request` object that should not be filtered out.
+
+A class assigned to :setting:`DUPEFILTER_CLASS` must implement the following
+interface::
+
+    class MyDupeFilter:
+
+        @classmethod
+        def from_settings(cls, settings):
+            """Returns an instance of this duplicate request filtering class
+            based on the current crawl settings."""
+            return cls()
+
+        def request_seen(self, request):
+            """Returns ``True`` if *request* is a duplicate of another request
+            seen in a previous call to :meth:`request_seen`, or ``False``
+            otherwise."""
+            return False
+
+        def open(self):
+            """Called before the spider opens. It may return a deferred."""
+            pass
+
+        def close(self, reason):
+            """Called before the spider closes. It may return a deferred."""
+            pass
+
+        def log(self, request, spider):
+            """Logs that a request has been filtered out.
+
+            It is called right after a call to :meth:`request_seen` that
+            returns ``True``.
+
+            If :meth:`request_seen` always returns ``False``, such as in the
+            case of :class:`~scrapy.dupefilters.BaseDupeFilter`, this method
+            may be omitted.
+            """
+            pass
+
+.. autoclass:: scrapy.dupefilters.BaseDupeFilter
+
+.. autoclass:: scrapy.dupefilters.RFPDupeFilter
 
 .. setting:: DUPEFILTER_DEBUG
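
For completeness, enabling a custom filter like the CustomDupeFilter from the
documentation example above happens through the project settings; a minimal
sketch, with myproject.dupefilters as a hypothetical module path:

    # settings.py
    # "myproject/dupefilters.py" is a hypothetical location for the
    # CustomDupeFilter class shown in the documentation example above.
    DUPEFILTER_CLASS = "myproject.dupefilters.CustomDupeFilter"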

scrapy/dupefilters.py

@@ -4,6 +4,7 @@ import logging
 import warnings
 from pathlib import Path
 from typing import TYPE_CHECKING
+from warnings import warn
 
 from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.utils.job import job_dir
@@ -26,6 +27,9 @@ if TYPE_CHECKING:
 
 
 class BaseDupeFilter:
+    """Dummy duplicate request filtering class (:setting:`DUPEFILTER_CLASS`)
+    that does not filter out any request."""
+
     @classmethod
     def from_settings(cls, settings: BaseSettings) -> Self:
         warnings.warn(
@@ -50,10 +54,19 @@ class BaseDupeFilter:
 
     def log(self, request: Request, spider: Spider) -> None:
         """Log that a request has been filtered"""
+        warn(
+            "Calling BaseDupeFilter.log() is deprecated.",
+            ScrapyDeprecationWarning,
+            stacklevel=2,
+        )
 
 
 class RFPDupeFilter(BaseDupeFilter):
-    """Request Fingerprint duplicates filter"""
+    """Duplicate request filtering class (:setting:`DUPEFILTER_CLASS`) that
+    filters out requests with the canonical
+    (:func:`w3lib.url.canonicalize_url`) :attr:`~scrapy.http.Request.url`,
+    :attr:`~scrapy.http.Request.method` and :attr:`~scrapy.http.Request.body`.
+    """
 
     def __init__(
         self,
@@ -117,6 +130,7 @@ class RFPDupeFilter(BaseDupeFilter):
         return False
 
     def request_fingerprint(self, request: Request) -> str:
+        """Returns a string that uniquely identifies the specified request."""
         return self.fingerprinter.fingerprint(request).hex()
 
     def close(self, reason: str) -> None:
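
To see the documented behavior in isolation, a minimal sketch of using
RFPDupeFilter directly, assuming a Scrapy version where the fingerprinter
argument defaults to the standard request fingerprinter when omitted:

    from scrapy.dupefilters import RFPDupeFilter
    from scrapy.http import Request

    df = RFPDupeFilter()
    print(df.request_seen(Request("https://example.com")))  # False: first sighting
    print(df.request_seen(Request("https://example.com")))  # True: duplicate
    print(df.request_fingerprint(Request("https://example.com")))  # hex string
    df.close("finished")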

tests/test_dupefilters.py

@@ -4,11 +4,13 @@ import sys
 import tempfile
 import unittest
 from pathlib import Path
+from warnings import catch_warnings
 
 from testfixtures import LogCapture
 
 from scrapy.core.scheduler import Scheduler
-from scrapy.dupefilters import RFPDupeFilter
+from scrapy.dupefilters import BaseDupeFilter, RFPDupeFilter
+from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.http import Request
 from scrapy.utils.python import to_bytes
 from scrapy.utils.test import get_crawler
@@ -252,3 +254,18 @@ class RFPDupeFilterTest(unittest.TestCase):
         )
         dupefilter.close("finished")
+
+
+class BaseDupeFilterTestCase(unittest.TestCase):
+    def test_log_deprecation(self):
+        dupefilter = _get_dupefilter(
+            settings={"DUPEFILTER_CLASS": BaseDupeFilter},
+        )
+        with catch_warnings(record=True) as warning_list:
+            dupefilter.log(None, None)
+        self.assertEqual(len(warning_list), 1)
+        self.assertEqual(
+            str(warning_list[0].message),
+            "Calling BaseDupeFilter.log() is deprecated.",
+        )
+        self.assertEqual(warning_list[0].category, ScrapyDeprecationWarning)