1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-06 13:30:05 +00:00

Deprecate AjaxCrawlMiddleware.

This commit is contained in:
Andrey Rakhmatullin 2024-11-07 21:21:17 +05:00
parent b4c2531021
commit 393ff96e45
4 changed files with 13 additions and 28 deletions

View File

@@ -182,30 +182,6 @@ To disable redirects use:
REDIRECT_ENABLED = False REDIRECT_ENABLED = False
Enable crawling of "Ajax Crawlable Pages"
=========================================
Some pages (up to 1%, based on empirical data from year 2013) declare
themselves as ajax crawlable. This means they provide plain HTML
version of content that is usually available only via AJAX.
Pages can indicate it in two ways:
1) by using ``#!`` in URL - this is the default way;
2) by using a special meta tag - this way is used on
"main", "index" website pages.
Scrapy handles (1) automatically; to handle (2) enable
:ref:`AjaxCrawlMiddleware <ajaxcrawl-middleware>`:
.. code-block:: python
AJAXCRAWL_ENABLED = True
When doing broad crawls it's common to crawl a lot of "index" web pages;
AjaxCrawlMiddleware helps to crawl them correctly.
It is turned OFF by default because it has some performance overhead,
and enabling it for focused crawls doesn't make much sense.
.. _broad-crawls-bfo: .. _broad-crawls-bfo:
Crawl in BFO order Crawl in BFO order

View File

@@ -1249,8 +1249,7 @@ AJAXCRAWL_ENABLED
Default: ``False`` Default: ``False``
Whether the AjaxCrawlMiddleware will be enabled. You may want to Whether the AjaxCrawlMiddleware will be enabled.
enable it for :ref:`broad crawls <topics-broad-crawls>`.
HttpProxyMiddleware settings HttpProxyMiddleware settings
~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -3,10 +3,11 @@ from __future__ import annotations
import logging import logging
import re import re
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from warnings import warn
from w3lib import html from w3lib import html
from scrapy.exceptions import NotConfigured from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
from scrapy.http import HtmlResponse, Response from scrapy.http import HtmlResponse, Response
if TYPE_CHECKING: if TYPE_CHECKING:
@@ -30,6 +31,13 @@ class AjaxCrawlMiddleware:
if not settings.getbool("AJAXCRAWL_ENABLED"): if not settings.getbool("AJAXCRAWL_ENABLED"):
raise NotConfigured raise NotConfigured
warn(
"scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware is deprecated"
" and will be removed in a future Scrapy version.",
ScrapyDeprecationWarning,
stacklevel=2,
)
# XXX: Google parses at least first 100k bytes; scrapy's redirect # XXX: Google parses at least first 100k bytes; scrapy's redirect
# middleware parses first 4k. 4k turns out to be insufficient # middleware parses first 4k. 4k turns out to be insufficient
# for this middleware, and parsing 100k could be slow. # for this middleware, and parsing 100k could be slow.
@@ -75,7 +83,6 @@ class AjaxCrawlMiddleware:
return _has_ajaxcrawlable_meta(body) return _has_ajaxcrawlable_meta(body)
# XXX: move it to w3lib?
_ajax_crawlable_re: re.Pattern[str] = re.compile( _ajax_crawlable_re: re.Pattern[str] = re.compile(
r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>' r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>'
) )

View File

@@ -1,5 +1,7 @@
import unittest import unittest
import pytest
from scrapy.downloadermiddlewares.ajaxcrawl import AjaxCrawlMiddleware from scrapy.downloadermiddlewares.ajaxcrawl import AjaxCrawlMiddleware
from scrapy.http import HtmlResponse, Request, Response from scrapy.http import HtmlResponse, Request, Response
from scrapy.spiders import Spider from scrapy.spiders import Spider
@@ -8,6 +10,7 @@ from scrapy.utils.test import get_crawler
__doctests__ = ["scrapy.downloadermiddlewares.ajaxcrawl"] __doctests__ = ["scrapy.downloadermiddlewares.ajaxcrawl"]
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
class AjaxCrawlMiddlewareTest(unittest.TestCase): class AjaxCrawlMiddlewareTest(unittest.TestCase):
def setUp(self): def setUp(self):
crawler = get_crawler(Spider, {"AJAXCRAWL_ENABLED": True}) crawler = get_crawler(Spider, {"AJAXCRAWL_ENABLED": True})