1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-06 11:00:46 +00:00

Deprecate AjaxCrawlMiddleware.

This commit is contained in:
Andrey Rakhmatullin 2024-11-07 21:21:17 +05:00
parent b4c2531021
commit 393ff96e45
4 changed files with 13 additions and 28 deletions

View File

@ -182,30 +182,6 @@ To disable redirects use:
REDIRECT_ENABLED = False
Enable crawling of "Ajax Crawlable Pages"
=========================================
Some pages (up to 1%, based on empirical data from 2013) declare
themselves as AJAX crawlable. This means they provide a plain HTML
version of content that is usually available only via AJAX.
Pages can indicate this in two ways:
1) by using ``#!`` in the URL - this is the default way;
2) by using a special meta tag - this way is used on
"main", "index" website pages.
Scrapy handles (1) automatically; to handle (2) enable
:ref:`AjaxCrawlMiddleware <ajaxcrawl-middleware>`:
.. code-block:: python
AJAXCRAWL_ENABLED = True
When doing broad crawls it's common to crawl a lot of "index" web pages;
AjaxCrawlMiddleware helps to crawl them correctly.
It is turned OFF by default because it has some performance overhead,
and enabling it for focused crawls doesn't make much sense.
.. _broad-crawls-bfo:
Crawl in BFO order

View File

@ -1249,8 +1249,7 @@ AJAXCRAWL_ENABLED
Default: ``False``
Whether the AjaxCrawlMiddleware will be enabled. You may want to
enable it for :ref:`broad crawls <topics-broad-crawls>`.
Whether the AjaxCrawlMiddleware will be enabled.
HttpProxyMiddleware settings
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -3,10 +3,11 @@ from __future__ import annotations
import logging
import re
from typing import TYPE_CHECKING
from warnings import warn
from w3lib import html
from scrapy.exceptions import NotConfigured
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
from scrapy.http import HtmlResponse, Response
if TYPE_CHECKING:
@ -30,6 +31,13 @@ class AjaxCrawlMiddleware:
if not settings.getbool("AJAXCRAWL_ENABLED"):
raise NotConfigured
warn(
"scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware is deprecated"
" and will be removed in a future Scrapy version.",
ScrapyDeprecationWarning,
stacklevel=2,
)
# XXX: Google parses at least first 100k bytes; scrapy's redirect
# middleware parses first 4k. 4k turns out to be insufficient
# for this middleware, and parsing 100k could be slow.
@ -75,7 +83,6 @@ class AjaxCrawlMiddleware:
return _has_ajaxcrawlable_meta(body)
# XXX: move it to w3lib?
_ajax_crawlable_re: re.Pattern[str] = re.compile(
r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>'
)

View File

@ -1,5 +1,7 @@
import unittest
import pytest
from scrapy.downloadermiddlewares.ajaxcrawl import AjaxCrawlMiddleware
from scrapy.http import HtmlResponse, Request, Response
from scrapy.spiders import Spider
@ -8,6 +10,7 @@ from scrapy.utils.test import get_crawler
__doctests__ = ["scrapy.downloadermiddlewares.ajaxcrawl"]
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
class AjaxCrawlMiddlewareTest(unittest.TestCase):
def setUp(self):
crawler = get_crawler(Spider, {"AJAXCRAWL_ENABLED": True})