1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-06 11:00:46 +00:00

Merge pull request #6651 from wRAR/deprecate-ajaxcrawl

Deprecate AjaxCrawlMiddleware and stop calling escape_ajax() by default
This commit is contained in:
Andrey Rakhmatullin 2025-02-03 13:57:46 +04:00 committed by GitHub
commit 16e39661e9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 26 additions and 45 deletions

View File

@@ -182,30 +182,6 @@ To disable redirects use:
REDIRECT_ENABLED = False
Enable crawling of "Ajax Crawlable Pages"
=========================================
Some pages (up to 1%, based on empirical data from year 2013) declare
themselves as ajax crawlable. This means they provide plain HTML
version of content that is usually available only via AJAX.
Pages can indicate it in two ways:
1) by using ``#!`` in URL - this is the default way;
2) by using a special meta tag - this way is used on
"main", "index" website pages.
Scrapy handles (1) automatically; to handle (2) enable
:ref:`AjaxCrawlMiddleware <ajaxcrawl-middleware>`:
.. code-block:: python
AJAXCRAWL_ENABLED = True
When doing broad crawls it's common to crawl a lot of "index" web pages;
AjaxCrawlMiddleware helps to crawl them correctly.
It is turned OFF by default because it has some performance overhead,
and enabling it for focused crawls doesn't make much sense.
.. _broad-crawls-bfo:
Crawl in BFO order

View File

@@ -1249,8 +1249,11 @@ AJAXCRAWL_ENABLED
Default: ``False``
Whether the AjaxCrawlMiddleware will be enabled. You may want to
enable it for :ref:`broad crawls <topics-broad-crawls>`.
Whether the AjaxCrawlMiddleware will be enabled.
.. note::
This middleware is deprecated and will be removed in a future Scrapy release.
HttpProxyMiddleware settings
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -3,11 +3,13 @@ from __future__ import annotations
import logging
import re
from typing import TYPE_CHECKING
from warnings import warn
from w3lib import html
from scrapy.exceptions import NotConfigured
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
from scrapy.http import HtmlResponse, Response
from scrapy.utils.url import escape_ajax
if TYPE_CHECKING:
# typing.Self requires Python 3.11
@@ -30,6 +32,13 @@ class AjaxCrawlMiddleware:
if not settings.getbool("AJAXCRAWL_ENABLED"):
raise NotConfigured
warn(
"scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware is deprecated"
" and will be removed in a future Scrapy version.",
ScrapyDeprecationWarning,
stacklevel=2,
)
# XXX: Google parses at least first 100k bytes; scrapy's redirect
# middleware parses first 4k. 4k turns out to be insufficient
# for this middleware, and parsing 100k could be slow.
@@ -56,8 +65,7 @@ class AjaxCrawlMiddleware:
if not self._has_ajax_crawlable_variant(response):
return response
# scrapy already handles #! links properly
ajax_crawl_request = request.replace(url=request.url + "#!")
ajax_crawl_request = request.replace(url=escape_ajax(request.url + "#!"))
logger.debug(
"Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
{"ajax_crawl_request": ajax_crawl_request, "request": request},
@@ -75,7 +83,6 @@ class AjaxCrawlMiddleware:
return _has_ajaxcrawlable_meta(body)
# XXX: move it to w3lib?
_ajax_crawlable_re: re.Pattern[str] = re.compile(
r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>'
)

View File

@@ -27,7 +27,6 @@ from scrapy.http.headers import Headers
from scrapy.utils.curl import curl_to_request_kwargs
from scrapy.utils.python import to_bytes
from scrapy.utils.trackref import object_ref
from scrapy.utils.url import escape_ajax
if TYPE_CHECKING:
from collections.abc import Callable, Iterable, Mapping
@@ -170,8 +169,7 @@ class Request(object_ref):
if not isinstance(url, str):
raise TypeError(f"Request url must be str, got {type(url).__name__}")
s = safe_url_string(url, self.encoding)
self._url = escape_ajax(s)
self._url = safe_url_string(url, self.encoding)
if (
"://" not in self._url

View File

@@ -10,6 +10,7 @@ import warnings
from importlib import import_module
from typing import TYPE_CHECKING, Union
from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse
from warnings import warn
from w3lib.url import __all__ as _public_w3lib_objects
from w3lib.url import add_or_replace_parameter as _add_or_replace_parameter
@@ -83,6 +84,11 @@ def escape_ajax(url: str) -> str:
>>> escape_ajax("www.example.com/ajax.html")
'www.example.com/ajax.html'
"""
warn(
"escape_ajax() is deprecated and will be removed in a future Scrapy version.",
ScrapyDeprecationWarning,
stacklevel=2,
)
defrag, frag = urldefrag(url)
if not frag.startswith("!"):
return url

View File

@@ -1,5 +1,7 @@
import unittest
import pytest
from scrapy.downloadermiddlewares.ajaxcrawl import AjaxCrawlMiddleware
from scrapy.http import HtmlResponse, Request, Response
from scrapy.spiders import Spider
@@ -8,6 +10,7 @@ from scrapy.utils.test import get_crawler
__doctests__ = ["scrapy.downloadermiddlewares.ajaxcrawl"]
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
class AjaxCrawlMiddlewareTest(unittest.TestCase):
def setUp(self):
crawler = get_crawler(Spider, {"AJAXCRAWL_ENABLED": True})

View File

@@ -187,18 +187,6 @@ class RequestTest(unittest.TestCase):
assert isinstance(r4.body, bytes)
self.assertEqual(r4.body, b"Price: \xa3100")
def test_ajax_url(self):
# ascii url
r = self.request_class(url="http://www.example.com/ajax.html#!key=value")
self.assertEqual(
r.url, "http://www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue"
)
# unicode url
r = self.request_class(url="http://www.example.com/ajax.html#!key=value")
self.assertEqual(
r.url, "http://www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue"
)
def test_copy(self):
"""Test Request copy"""