Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-06 16:31:38 +00:00)

commit 393ff96e45
parent b4c2531021

Deprecate AjaxCrawlMiddleware.
docs/topics/broad-crawls.rst
@@ -182,30 +182,6 @@ To disable redirects use:
 
     REDIRECT_ENABLED = False
 
-Enable crawling of "Ajax Crawlable Pages"
-=========================================
-
-Some pages (up to 1%, based on empirical data from year 2013) declare
-themselves as ajax crawlable. This means they provide plain HTML
-version of content that is usually available only via AJAX.
-Pages can indicate it in two ways:
-
-1) by using ``#!`` in URL - this is the default way;
-2) by using a special meta tag - this way is used on
-   "main", "index" website pages.
-
-Scrapy handles (1) automatically; to handle (2) enable
-:ref:`AjaxCrawlMiddleware <ajaxcrawl-middleware>`:
-
-.. code-block:: python
-
-    AJAXCRAWL_ENABLED = True
-
-When doing broad crawls it's common to crawl a lot of "index" web pages;
-AjaxCrawlMiddleware helps to crawl them correctly.
-It is turned OFF by default because it has some performance overhead,
-and enabling it for focused crawls doesn't make much sense.
-
 .. _broad-crawls-bfo:
 
 Crawl in BFO order
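Note: the removed section documents Google's AJAX crawling scheme (retired by Google in 2015), in which a ``#!`` URL fragment is rewritten into an ``_escaped_fragment_`` query parameter so a crawler can fetch a plain-HTML snapshot. A minimal sketch of that rewrite, using only the standard library (the helper name is illustrative, not a Scrapy API):

.. code-block:: python

    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    def escape_ajax_url(url: str) -> str:
        """Rewrite a "#!" URL into its "_escaped_fragment_" form, as
        described by the removed docs (illustrative, not a Scrapy API)."""
        scheme, netloc, path, query, fragment = urlsplit(url)
        if not fragment.startswith("!"):
            return url  # no "#!" marker: page is not declared AJAX-crawlable
        params = parse_qsl(query, keep_blank_values=True)
        params.append(("_escaped_fragment_", fragment[1:]))
        return urlunsplit((scheme, netloc, path, urlencode(params), ""))

    print(escape_ajax_url("http://example.com/page#!state=1"))
    # -> http://example.com/page?_escaped_fragment_=state%3D1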
docs/topics/settings.rst
@@ -1249,8 +1249,7 @@ AJAXCRAWL_ENABLED
 
 Default: ``False``
 
-Whether the AjaxCrawlMiddleware will be enabled. You may want to
-enable it for :ref:`broad crawls <topics-broad-crawls>`.
+Whether the AjaxCrawlMiddleware will be enabled.
 
 HttpProxyMiddleware settings
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
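Until the middleware is actually removed, the setting keeps its old behavior; opting in from a project's ``settings.py`` still looks like this, and will now also emit the deprecation warning added below:

.. code-block:: python

    # settings.py: still works until removal, but new projects
    # should not rely on AjaxCrawlMiddleware.
    AJAXCRAWL_ENABLED = True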
scrapy/downloadermiddlewares/ajaxcrawl.py
@@ -3,10 +3,11 @@ from __future__ import annotations
 import logging
 import re
 from typing import TYPE_CHECKING
+from warnings import warn
 
 from w3lib import html
 
-from scrapy.exceptions import NotConfigured
+from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
 from scrapy.http import HtmlResponse, Response
 
 if TYPE_CHECKING:
@@ -30,6 +31,13 @@ class AjaxCrawlMiddleware:
         if not settings.getbool("AJAXCRAWL_ENABLED"):
             raise NotConfigured
 
+        warn(
+            "scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware is deprecated"
+            " and will be removed in a future Scrapy version.",
+            ScrapyDeprecationWarning,
+            stacklevel=2,
+        )
+
         # XXX: Google parses at least first 100k bytes; scrapy's redirect
         # middleware parses first 4k. 4k turns out to be insufficient
         # for this middleware, and parsing 100k could be slow.
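Projects that still need the middleware can filter the new warning until they migrate off it. A minimal sketch using the standard ``warnings`` machinery (the message regex is an assumption matched to the text added above):

.. code-block:: python

    import warnings

    from scrapy.exceptions import ScrapyDeprecationWarning

    # Suppress only this deprecation message (e.g. at the top of
    # settings.py) while keeping other ScrapyDeprecationWarnings visible.
    warnings.filterwarnings(
        "ignore",
        message=r".*AjaxCrawlMiddleware is deprecated.*",
        category=ScrapyDeprecationWarning,
    )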
@@ -75,7 +83,6 @@ class AjaxCrawlMiddleware:
         return _has_ajaxcrawlable_meta(body)
 
 
-# XXX: move it to w3lib?
 _ajax_crawlable_re: re.Pattern[str] = re.compile(
     r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>'
 )
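For reference, ``_ajax_crawlable_re`` matches the ``<meta name="fragment" content="!">`` tag defined by the AJAX crawling scheme. A quick self-contained check, with the regex copied verbatim from the diff:

.. code-block:: python

    import re

    _ajax_crawlable_re = re.compile(
        r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>'
    )

    # Matches both quoting styles and an optional self-closing slash.
    assert _ajax_crawlable_re.search('<meta name="fragment" content="!">')
    assert _ajax_crawlable_re.search("<meta name='fragment' content='!'/>")
    assert not _ajax_crawlable_re.search(
        '<meta name="viewport" content="width=device-width">'
    )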
tests/test_downloadermiddleware_ajaxcrawl.py
@@ -1,5 +1,7 @@
 import unittest
 
+import pytest
+
 from scrapy.downloadermiddlewares.ajaxcrawl import AjaxCrawlMiddleware
 from scrapy.http import HtmlResponse, Request, Response
 from scrapy.spiders import Spider
@@ -8,6 +10,7 @@ from scrapy.utils.test import get_crawler
 __doctests__ = ["scrapy.downloadermiddlewares.ajaxcrawl"]
 
 
+@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
 class AjaxCrawlMiddlewareTest(unittest.TestCase):
     def setUp(self):
         crawler = get_crawler(Spider, {"AJAXCRAWL_ENABLED": True})
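The ``filterwarnings`` mark keeps the test output clean now that constructing the middleware emits ``ScrapyDeprecationWarning``. An equivalent file-wide form, if preferred over decorating each class, is the module-level ``pytestmark`` hook:

.. code-block:: python

    import pytest

    # Applies the same filter to every test in this file.
    pytestmark = pytest.mark.filterwarnings(
        "ignore::scrapy.exceptions.ScrapyDeprecationWarning"
    )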