Deprecate AjaxCrawlMiddleware.

2025-02-06 11:00:46 +00:00 · 2024-11-07 21:21:17 +05:00 · 2024-11-07 21:21:17 +05:00 · 393ff96e45
commit 393ff96e45
parent b4c2531021
4 changed files with 13 additions and 28 deletions
--- a/docs/topics/broad-crawls.rst
+++ b/docs/topics/broad-crawls.rst
@@ -182,30 +182,6 @@ To disable redirects use:

    REDIRECT_ENABLED = False

-Enable crawling of "Ajax Crawlable Pages"
-=========================================
-
-Some pages (up to 1%, based on empirical data from year 2013) declare
-themselves as ajax crawlable. This means they provide plain HTML
-version of content that is usually available only via AJAX.
-Pages can indicate it in two ways:
-
-1) by using ``#!`` in URL - this is the default way;
-2) by using a special meta tag - this way is used on
-   "main", "index" website pages.
-
-Scrapy handles (1) automatically; to handle (2) enable
-:ref:`AjaxCrawlMiddleware <ajaxcrawl-middleware>`:
-
-.. code-block:: python
-
-    AJAXCRAWL_ENABLED = True
-
-When doing broad crawls it's common to crawl a lot of "index" web pages;
-AjaxCrawlMiddleware helps to crawl them correctly.
-It is turned OFF by default because it has some performance overhead,
-and enabling it for focused crawls doesn't make much sense.
-
 .. _broad-crawls-bfo:

 Crawl in BFO order
--- a/docs/topics/downloader-middleware.rst
+++ b/docs/topics/downloader-middleware.rst
@@ -1249,8 +1249,7 @@ AJAXCRAWL_ENABLED

 Default: ``False``

-Whether the AjaxCrawlMiddleware will be enabled. You may want to
-enable it for :ref:`broad crawls <topics-broad-crawls>`.
+Whether the AjaxCrawlMiddleware will be enabled.

 HttpProxyMiddleware settings
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/scrapy/downloadermiddlewares/ajaxcrawl.py
+++ b/scrapy/downloadermiddlewares/ajaxcrawl.py
@@ -3,10 +3,11 @@ from __future__ import annotations
 import logging
 import re
 from typing import TYPE_CHECKING
+from warnings import warn

 from w3lib import html

-from scrapy.exceptions import NotConfigured
+from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
 from scrapy.http import HtmlResponse, Response

 if TYPE_CHECKING:
@@ -30,6 +31,13 @@ class AjaxCrawlMiddleware:
        if not settings.getbool("AJAXCRAWL_ENABLED"):
            raise NotConfigured

+        warn(
+            "scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware is deprecated"
+            " and will be removed in a future Scrapy version.",
+            ScrapyDeprecationWarning,
+            stacklevel=2,
+        )
+
        # XXX: Google parses at least first 100k bytes; scrapy's redirect
        # middleware parses first 4k. 4k turns out to be insufficient
        # for this middleware, and parsing 100k could be slow.
@@ -75,7 +83,6 @@ class AjaxCrawlMiddleware:
        return _has_ajaxcrawlable_meta(body)


-# XXX: move it to w3lib?
 _ajax_crawlable_re: re.Pattern[str] = re.compile(
    r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>'
 )
--- a/tests/test_downloadermiddleware_ajaxcrawlable.py
+++ b/tests/test_downloadermiddleware_ajaxcrawlable.py
@@ -1,5 +1,7 @@
 import unittest

+import pytest
+
 from scrapy.downloadermiddlewares.ajaxcrawl import AjaxCrawlMiddleware
 from scrapy.http import HtmlResponse, Request, Response
 from scrapy.spiders import Spider
@@ -8,6 +10,7 @@ from scrapy.utils.test import get_crawler
 __doctests__ = ["scrapy.downloadermiddlewares.ajaxcrawl"]


+@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
 class AjaxCrawlMiddlewareTest(unittest.TestCase):
    def setUp(self):
        crawler = get_crawler(Spider, {"AJAXCRAWL_ENABLED": True})