1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-06 11:00:46 +00:00

Merge pull request #6651 from wRAR/deprecate-ajaxcrawl

Deprecate AjaxCrawlMiddleware and stop calling escape_ajax() by default
This commit is contained in:
Andrey Rakhmatullin 2025-02-03 13:57:46 +04:00 committed by GitHub
commit 16e39661e9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 26 additions and 45 deletions

View File

@@ -182,30 +182,6 @@ To disable redirects use:
REDIRECT_ENABLED = False
Enable crawling of "Ajax Crawlable Pages"
=========================================
Some pages (up to 1%, based on empirical data from year 2013) declare
themselves as ajax crawlable. This means they provide plain HTML
version of content that is usually available only via AJAX.
Pages can indicate it in two ways:
1) by using ``#!`` in URL - this is the default way;
2) by using a special meta tag - this way is used on
"main", "index" website pages.
Scrapy handles (1) automatically; to handle (2) enable
:ref:`AjaxCrawlMiddleware <ajaxcrawl-middleware>`:
.. code-block:: python
AJAXCRAWL_ENABLED = True
When doing broad crawls it's common to crawl a lot of "index" web pages;
AjaxCrawlMiddleware helps to crawl them correctly.
It is turned OFF by default because it has some performance overhead,
and enabling it for focused crawls doesn't make much sense.
.. _broad-crawls-bfo:
Crawl in BFO order

View File

@@ -1249,8 +1249,11 @@ AJAXCRAWL_ENABLED
Default: ``False``
Whether the AjaxCrawlMiddleware will be enabled. You may want to
enable it for :ref:`broad crawls <topics-broad-crawls>`.
Whether the AjaxCrawlMiddleware will be enabled.
.. note::
This middleware is deprecated and will be removed in a future Scrapy release.
HttpProxyMiddleware settings
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -3,11 +3,13 @@ from __future__ import annotations
import logging
import re
from typing import TYPE_CHECKING
from warnings import warn
from w3lib import html
from scrapy.exceptions import NotConfigured
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
from scrapy.http import HtmlResponse, Response
from scrapy.utils.url import escape_ajax
if TYPE_CHECKING:
# typing.Self requires Python 3.11
@@ -30,6 +32,13 @@ class AjaxCrawlMiddleware:
if not settings.getbool("AJAXCRAWL_ENABLED"):
raise NotConfigured
warn(
"scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware is deprecated"
" and will be removed in a future Scrapy version.",
ScrapyDeprecationWarning,
stacklevel=2,
)
# XXX: Google parses at least first 100k bytes; scrapy's redirect
# middleware parses first 4k. 4k turns out to be insufficient
# for this middleware, and parsing 100k could be slow.
@@ -56,8 +65,7 @@ class AjaxCrawlMiddleware:
if not self._has_ajax_crawlable_variant(response):
return response
# scrapy already handles #! links properly
ajax_crawl_request = request.replace(url=request.url + "#!")
ajax_crawl_request = request.replace(url=escape_ajax(request.url + "#!"))
logger.debug(
"Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
{"ajax_crawl_request": ajax_crawl_request, "request": request},
@@ -75,7 +83,6 @@ class AjaxCrawlMiddleware:
return _has_ajaxcrawlable_meta(body)
# XXX: move it to w3lib?
_ajax_crawlable_re: re.Pattern[str] = re.compile(
r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>'
)

View File

@@ -27,7 +27,6 @@ from scrapy.http.headers import Headers
from scrapy.utils.curl import curl_to_request_kwargs
from scrapy.utils.python import to_bytes
from scrapy.utils.trackref import object_ref
from scrapy.utils.url import escape_ajax
if TYPE_CHECKING:
from collections.abc import Callable, Iterable, Mapping
@@ -170,8 +169,7 @@ class Request(object_ref):
if not isinstance(url, str):
raise TypeError(f"Request url must be str, got {type(url).__name__}")
s = safe_url_string(url, self.encoding)
self._url = escape_ajax(s)
self._url = safe_url_string(url, self.encoding)
if (
"://" not in self._url

View File

@@ -10,6 +10,7 @@ import warnings
from importlib import import_module
from typing import TYPE_CHECKING, Union
from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse
from warnings import warn
from w3lib.url import __all__ as _public_w3lib_objects
from w3lib.url import add_or_replace_parameter as _add_or_replace_parameter
@@ -83,6 +84,11 @@ def escape_ajax(url: str) -> str:
>>> escape_ajax("www.example.com/ajax.html")
'www.example.com/ajax.html'
"""
warn(
"escape_ajax() is deprecated and will be removed in a future Scrapy version.",
ScrapyDeprecationWarning,
stacklevel=2,
)
defrag, frag = urldefrag(url)
if not frag.startswith("!"):
return url

View File

@@ -1,5 +1,7 @@
import unittest
import pytest
from scrapy.downloadermiddlewares.ajaxcrawl import AjaxCrawlMiddleware
from scrapy.http import HtmlResponse, Request, Response
from scrapy.spiders import Spider
@@ -8,6 +10,7 @@ from scrapy.utils.test import get_crawler
__doctests__ = ["scrapy.downloadermiddlewares.ajaxcrawl"]
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
class AjaxCrawlMiddlewareTest(unittest.TestCase):
def setUp(self):
crawler = get_crawler(Spider, {"AJAXCRAWL_ENABLED": True})

View File

@@ -187,18 +187,6 @@ class RequestTest(unittest.TestCase):
assert isinstance(r4.body, bytes)
self.assertEqual(r4.body, b"Price: \xa3100")
def test_ajax_url(self):
# ascii url
r = self.request_class(url="http://www.example.com/ajax.html#!key=value")
self.assertEqual(
r.url, "http://www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue"
)
# unicode url
r = self.request_class(url="http://www.example.com/ajax.html#!key=value")
self.assertEqual(
r.url, "http://www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue"
)
def test_copy(self):
"""Test Request copy"""