mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-06 11:00:46 +00:00
Merge pull request #6651 from wRAR/deprecate-ajaxcrawl
Deprecate AjaxCrawlMiddleware and stop calling escape_ajax() by default
This commit is contained in:
commit
16e39661e9
@@ -182,30 +182,6 @@ To disable redirects use:
|
||||
|
||||
REDIRECT_ENABLED = False
|
||||
|
||||
Enable crawling of "Ajax Crawlable Pages"
|
||||
=========================================
|
||||
|
||||
Some pages (up to 1%, based on empirical data from year 2013) declare
|
||||
themselves as ajax crawlable. This means they provide a plain HTML
|
||||
version of content that is usually available only via AJAX.
|
||||
Pages can indicate it in two ways:
|
||||
|
||||
1) by using ``#!`` in the URL - this is the default way;
|
||||
2) by using a special meta tag - this way is used on
|
||||
"main", "index" website pages.
|
||||
|
||||
Scrapy handles (1) automatically; to handle (2) enable
|
||||
:ref:`AjaxCrawlMiddleware <ajaxcrawl-middleware>`:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
AJAXCRAWL_ENABLED = True
|
||||
|
||||
When doing broad crawls it's common to crawl a lot of "index" web pages;
|
||||
AjaxCrawlMiddleware helps to crawl them correctly.
|
||||
It is turned OFF by default because it has some performance overhead,
|
||||
and enabling it for focused crawls doesn't make much sense.
|
||||
|
||||
.. _broad-crawls-bfo:
|
||||
|
||||
Crawl in BFO order
|
||||
|
@@ -1249,8 +1249,11 @@ AJAXCRAWL_ENABLED
|
||||
|
||||
Default: ``False``
|
||||
|
||||
Whether the AjaxCrawlMiddleware will be enabled. You may want to
|
||||
enable it for :ref:`broad crawls <topics-broad-crawls>`.
|
||||
Whether the AjaxCrawlMiddleware will be enabled.
|
||||
|
||||
.. note::
|
||||
|
||||
This middleware is deprecated and will be removed in a future Scrapy release.
|
||||
|
||||
HttpProxyMiddleware settings
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@@ -3,11 +3,13 @@ from __future__ import annotations
|
||||
import logging
|
||||
import re
|
||||
from typing import TYPE_CHECKING
|
||||
from warnings import warn
|
||||
|
||||
from w3lib import html
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
|
||||
from scrapy.http import HtmlResponse, Response
|
||||
from scrapy.utils.url import escape_ajax
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# typing.Self requires Python 3.11
|
||||
@@ -30,6 +32,13 @@ class AjaxCrawlMiddleware:
|
||||
if not settings.getbool("AJAXCRAWL_ENABLED"):
|
||||
raise NotConfigured
|
||||
|
||||
warn(
|
||||
"scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware is deprecated"
|
||||
" and will be removed in a future Scrapy version.",
|
||||
ScrapyDeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
# XXX: Google parses at least first 100k bytes; scrapy's redirect
|
||||
# middleware parses first 4k. 4k turns out to be insufficient
|
||||
# for this middleware, and parsing 100k could be slow.
|
||||
@@ -56,8 +65,7 @@ class AjaxCrawlMiddleware:
|
||||
if not self._has_ajax_crawlable_variant(response):
|
||||
return response
|
||||
|
||||
# scrapy already handles #! links properly
|
||||
ajax_crawl_request = request.replace(url=request.url + "#!")
|
||||
ajax_crawl_request = request.replace(url=escape_ajax(request.url + "#!"))
|
||||
logger.debug(
|
||||
"Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
|
||||
{"ajax_crawl_request": ajax_crawl_request, "request": request},
|
||||
@@ -75,7 +83,6 @@ class AjaxCrawlMiddleware:
|
||||
return _has_ajaxcrawlable_meta(body)
|
||||
|
||||
|
||||
# XXX: move it to w3lib?
|
||||
_ajax_crawlable_re: re.Pattern[str] = re.compile(
|
||||
r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>'
|
||||
)
|
||||
|
@@ -27,7 +27,6 @@ from scrapy.http.headers import Headers
|
||||
from scrapy.utils.curl import curl_to_request_kwargs
|
||||
from scrapy.utils.python import to_bytes
|
||||
from scrapy.utils.trackref import object_ref
|
||||
from scrapy.utils.url import escape_ajax
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable, Iterable, Mapping
|
||||
@@ -170,8 +169,7 @@ class Request(object_ref):
|
||||
if not isinstance(url, str):
|
||||
raise TypeError(f"Request url must be str, got {type(url).__name__}")
|
||||
|
||||
s = safe_url_string(url, self.encoding)
|
||||
self._url = escape_ajax(s)
|
||||
self._url = safe_url_string(url, self.encoding)
|
||||
|
||||
if (
|
||||
"://" not in self._url
|
||||
|
@@ -10,6 +10,7 @@ import warnings
|
||||
from importlib import import_module
|
||||
from typing import TYPE_CHECKING, Union
|
||||
from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse
|
||||
from warnings import warn
|
||||
|
||||
from w3lib.url import __all__ as _public_w3lib_objects
|
||||
from w3lib.url import add_or_replace_parameter as _add_or_replace_parameter
|
||||
@@ -83,6 +84,11 @@ def escape_ajax(url: str) -> str:
|
||||
>>> escape_ajax("www.example.com/ajax.html")
|
||||
'www.example.com/ajax.html'
|
||||
"""
|
||||
warn(
|
||||
"escape_ajax() is deprecated and will be removed in a future Scrapy version.",
|
||||
ScrapyDeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
defrag, frag = urldefrag(url)
|
||||
if not frag.startswith("!"):
|
||||
return url
|
||||
|
@@ -1,5 +1,7 @@
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from scrapy.downloadermiddlewares.ajaxcrawl import AjaxCrawlMiddleware
|
||||
from scrapy.http import HtmlResponse, Request, Response
|
||||
from scrapy.spiders import Spider
|
||||
@@ -8,6 +10,7 @@ from scrapy.utils.test import get_crawler
|
||||
__doctests__ = ["scrapy.downloadermiddlewares.ajaxcrawl"]
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
|
||||
class AjaxCrawlMiddlewareTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
crawler = get_crawler(Spider, {"AJAXCRAWL_ENABLED": True})
|
||||
|
@@ -187,18 +187,6 @@ class RequestTest(unittest.TestCase):
|
||||
assert isinstance(r4.body, bytes)
|
||||
self.assertEqual(r4.body, b"Price: \xa3100")
|
||||
|
||||
def test_ajax_url(self):
|
||||
# ascii url
|
||||
r = self.request_class(url="http://www.example.com/ajax.html#!key=value")
|
||||
self.assertEqual(
|
||||
r.url, "http://www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue"
|
||||
)
|
||||
# unicode url
|
||||
r = self.request_class(url="http://www.example.com/ajax.html#!key=value")
|
||||
self.assertEqual(
|
||||
r.url, "http://www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue"
|
||||
)
|
||||
|
||||
def test_copy(self):
|
||||
"""Test Request copy"""
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user