Deprecate BaseDupeFilter.log() and improve dupefilter docs (#4151)
* Remove BaseDupeFilter.log(): it is never called because request_seen() always returns False
* Document the interface of DUPEFILTER_CLASS classes
* Remove unnecessary BaseDupeFilter comments and add a short class description
* Improve the documentation related to the DUPEFILTER_CLASS setting
* Deprecate BaseDupeFilter.log
* Update the docs
* Fix the new code example
* Remove typing to keep the example short; otherwise, it would have required yet another import line (from __future__ or typing).

--------

Co-authored-by: Andrey Rakhmatullin <wrar@wrar.name>
parent 402500b164
commit 98ba61256d
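The rationale for the headline change: the scheduler only calls a dupefilter's log() after request_seen() returns True, and BaseDupeFilter.request_seen() always returns False, so its log() was unreachable in practice. A simplified Python sketch of that scheduler-side check (paraphrased from scrapy.core.scheduler, not the literal implementation):

    # Paraphrased sketch of Scheduler.enqueue_request(); simplified, not the
    # literal Scrapy implementation.
    def enqueue_request(dupefilter, request, spider):
        if not request.dont_filter and dupefilter.request_seen(request):
            # Only reachable when request_seen() returns True, which
            # BaseDupeFilter never does -- hence its log() was dead code.
            dupefilter.log(request, spider)
            return False
        return True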
docs/conf.py
@@ -258,6 +258,10 @@ coverage_ignore_pyobjects = [
     # Base classes of downloader middlewares are implementation details that
     # are not meant for users.
    r"^scrapy\.downloadermiddlewares\.\w*?\.Base\w*?Middleware",
+    # The interface methods of duplicate request filtering classes are already
+    # covered in the interface documentation part of the DUPEFILTER_CLASS
+    # setting documentation.
+    r"^scrapy\.dupefilters\.[A-Z]\w*?\.(from_settings|request_seen|open|close|log)$",
     # Private exception used by the command-line interface implementation.
     r"^scrapy\.exceptions\.UsageError",
     # Methods of BaseItemExporter subclasses are only documented in
docs/topics/settings.rst
@@ -955,15 +955,79 @@ Default: ``'scrapy.dupefilters.RFPDupeFilter'``
 
 The class used to detect and filter duplicate requests.
 
-The default (``RFPDupeFilter``) filters based on the
+The default, :class:`~scrapy.dupefilters.RFPDupeFilter`, filters based on the
 :setting:`REQUEST_FINGERPRINTER_CLASS` setting.
 
-You can disable filtering of duplicate requests by setting
-:setting:`DUPEFILTER_CLASS` to ``'scrapy.dupefilters.BaseDupeFilter'``.
-Be very careful about this however, because you can get into crawling loops.
-It's usually a better idea to set the ``dont_filter`` parameter to
-``True`` on the specific :class:`~scrapy.Request` that should not be
-filtered.
+To change how duplicates are checked, you can point :setting:`DUPEFILTER_CLASS`
+to a custom subclass of :class:`~scrapy.dupefilters.RFPDupeFilter` that
+overrides its ``__init__`` method to use a :ref:`different request
+fingerprinting class <custom-request-fingerprinter>`. For example:
+
+.. code-block:: python
+
+    from scrapy.dupefilters import RFPDupeFilter
+    from scrapy.utils.request import fingerprint
+
+
+    class CustomRequestFingerprinter:
+        def fingerprint(self, request):
+            return fingerprint(request, include_headers=["X-ID"])
+
+
+    class CustomDupeFilter(RFPDupeFilter):
+
+        def __init__(self, path=None, debug=False, *, fingerprinter=None):
+            super().__init__(
+                path=path, debug=debug, fingerprinter=CustomRequestFingerprinter()
+            )
+
+To disable duplicate request filtering set :setting:`DUPEFILTER_CLASS` to
+``'scrapy.dupefilters.BaseDupeFilter'``. Note that not filtering out duplicate
+requests may cause crawling loops. It is usually better to set
+the ``dont_filter`` parameter to ``True`` on the ``__init__`` method of a
+specific :class:`~scrapy.Request` object that should not be filtered out.
+
+A class assigned to :setting:`DUPEFILTER_CLASS` must implement the following
+interface::
+
+    class MyDupeFilter:
+
+        @classmethod
+        def from_settings(cls, settings):
+            """Returns an instance of this duplicate request filtering class
+            based on the current crawl settings."""
+            return cls()
+
+        def request_seen(self, request):
+            """Returns ``True`` if *request* is a duplicate of another request
+            seen in a previous call to :meth:`request_seen`, or ``False``
+            otherwise."""
+            return False
+
+        def open(self):
+            """Called before the spider opens. It may return a deferred."""
+            pass
+
+        def close(self, reason):
+            """Called before the spider closes. It may return a deferred."""
+            pass
+
+        def log(self, request, spider):
+            """Logs that a request has been filtered out.
+
+            It is called right after a call to :meth:`request_seen` that
+            returns ``True``.
+
+            If :meth:`request_seen` always returns ``False``, such as in the
+            case of :class:`~scrapy.dupefilters.BaseDupeFilter`, this method
+            may be omitted.
+            """
+            pass
+
+.. autoclass:: scrapy.dupefilters.BaseDupeFilter
+
+.. autoclass:: scrapy.dupefilters.RFPDupeFilter
 
 
 .. setting:: DUPEFILTER_DEBUG
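Using the documented example in a project means pointing the setting at the subclass; a minimal sketch, in which the ``myproject.dupefilters`` module path is a hypothetical placeholder:

    # settings.py of a hypothetical Scrapy project; the module path
    # "myproject.dupefilters" is a placeholder, not part of this commit.
    DUPEFILTER_CLASS = "myproject.dupefilters.CustomDupeFilter"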
scrapy/dupefilters.py
@@ -4,6 +4,7 @@ import logging
 import warnings
 from pathlib import Path
 from typing import TYPE_CHECKING
+from warnings import warn
 
 from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.utils.job import job_dir
@@ -26,6 +27,9 @@ if TYPE_CHECKING:
 
 
 class BaseDupeFilter:
+    """Dummy duplicate request filtering class (:setting:`DUPEFILTER_CLASS`)
+    that does not filter out any request."""
+
     @classmethod
     def from_settings(cls, settings: BaseSettings) -> Self:
         warnings.warn(
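The new docstring is easy to verify interactively: the dummy filter never reports a request as seen, so nothing is filtered out. A minimal sketch, assuming a Scrapy version containing this change:

    from scrapy.dupefilters import BaseDupeFilter
    from scrapy.http import Request

    df = BaseDupeFilter()
    request = Request("https://example.com")

    # request_seen() always returns False, so the same request passes
    # through the filter any number of times.
    assert df.request_seen(request) is False
    assert df.request_seen(request) is False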
@@ -50,10 +54,19 @@ class BaseDupeFilter:
 
     def log(self, request: Request, spider: Spider) -> None:
-        """Log that a request has been filtered"""
+        warn(
+            "Calling BaseDupeFilter.log() is deprecated.",
+            ScrapyDeprecationWarning,
+            stacklevel=2,
+        )
 
 
 class RFPDupeFilter(BaseDupeFilter):
-    """Request Fingerprint duplicates filter"""
+    """Duplicate request filtering class (:setting:`DUPEFILTER_CLASS`) that
+    filters out requests with the canonical
+    (:func:`w3lib.url.canonicalize_url`) :attr:`~scrapy.http.Request.url`,
+    :attr:`~scrapy.http.Request.method` and :attr:`~scrapy.http.Request.body`.
+    """
 
     def __init__(
         self,
@@ -117,6 +130,7 @@ class RFPDupeFilter(BaseDupeFilter):
         return False
 
     def request_fingerprint(self, request: Request) -> str:
+        """Returns a string that uniquely identifies the specified request."""
         return self.fingerprinter.fingerprint(request).hex()
 
     def close(self, reason: str) -> None:
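The newly documented ``request_fingerprint()`` returns the hex digest of the request fingerprint, and the default fingerprinter canonicalizes the URL, so query-parameter order does not matter. A minimal sketch, assuming ``RFPDupeFilter()`` is constructible with its default arguments:

    from scrapy.dupefilters import RFPDupeFilter
    from scrapy.http import Request

    df = RFPDupeFilter()
    r1 = Request("https://example.com/page?a=1&b=2")
    r2 = Request("https://example.com/page?b=2&a=1")  # same canonical URL

    # Both requests map to the same hex fingerprint, so the second one is
    # reported as a duplicate by request_seen().
    assert df.request_fingerprint(r1) == df.request_fingerprint(r2)
    assert df.request_seen(r1) is False
    assert df.request_seen(r2) is True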
tests/test_dupefilters.py
@@ -4,11 +4,13 @@ import sys
 import tempfile
 import unittest
 from pathlib import Path
+from warnings import catch_warnings
 
 from testfixtures import LogCapture
 
 from scrapy.core.scheduler import Scheduler
-from scrapy.dupefilters import RFPDupeFilter
+from scrapy.dupefilters import BaseDupeFilter, RFPDupeFilter
+from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.http import Request
 from scrapy.utils.python import to_bytes
 from scrapy.utils.test import get_crawler
@@ -252,3 +254,18 @@ class RFPDupeFilterTest(unittest.TestCase):
         )
 
         dupefilter.close("finished")
+
+
+class BaseDupeFilterTestCase(unittest.TestCase):
+    def test_log_deprecation(self):
+        dupefilter = _get_dupefilter(
+            settings={"DUPEFILTER_CLASS": BaseDupeFilter},
+        )
+        with catch_warnings(record=True) as warning_list:
+            dupefilter.log(None, None)
+        self.assertEqual(len(warning_list), 1)
+        self.assertEqual(
+            str(warning_list[0].message),
+            "Calling BaseDupeFilter.log() is deprecated.",
+        )
+        self.assertEqual(warning_list[0].category, ScrapyDeprecationWarning)