Mirror of https://github.com/scrapy/scrapy.git

Deprecate AjaxCrawlMiddleware.
commit 393ff96e45
parent b4c2531021

@@ -182,30 +182,6 @@ To disable redirects use:

     REDIRECT_ENABLED = False

-Enable crawling of "Ajax Crawlable Pages"
-=========================================
-
-Some pages (up to 1%, based on empirical data from 2013) declare
-themselves as ajax crawlable. This means they provide a plain HTML
-version of content that is usually available only via AJAX.
-Pages can indicate this in two ways:
-
-1) by using ``#!`` in the URL - this is the default way;
-2) by using a special meta tag - this way is used on
-   "main", "index" website pages.
-
-Scrapy handles (1) automatically; to handle (2), enable
-:ref:`AjaxCrawlMiddleware <ajaxcrawl-middleware>`:
-
-.. code-block:: python
-
-    AJAXCRAWL_ENABLED = True
-
-When doing broad crawls it's common to crawl a lot of "index" web pages;
-AjaxCrawlMiddleware helps to crawl them correctly.
-It is turned OFF by default because it has some performance overhead,
-and enabling it for focused crawls doesn't make much sense.
-
 .. _broad-crawls-bfo:

 Crawl in BFO order
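
For context on what the removed docs section described: the middleware implements
Google's AJAX crawling scheme (retired by Google itself in 2015), in which a
``#!`` fragment is refetched as an ``_escaped_fragment_`` query parameter.
A minimal sketch of that URL rewrite; the helper name is hypothetical and not
part of Scrapy's API:

    from urllib.parse import quote, urlsplit, urlunsplit


    def escaped_fragment_url(url: str) -> str:
        # Rewrite "http://example.com/page#!state" into the crawler-friendly
        # "http://example.com/page?_escaped_fragment_=state" form.
        scheme, netloc, path, query, fragment = urlsplit(url)
        if not fragment.startswith("!"):
            return url  # page does not declare itself ajax-crawlable via #!
        escaped = quote(fragment[1:], safe="")
        sep = "&" if query else ""
        return urlunsplit(
            (scheme, netloc, path, f"{query}{sep}_escaped_fragment_={escaped}", "")
        )


    print(escaped_fragment_url("http://example.com/page#!key=value"))
    # http://example.com/page?_escaped_fragment_=key%3Dvalue
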
@@ -1249,8 +1249,7 @@ AJAXCRAWL_ENABLED

 Default: ``False``

-Whether the AjaxCrawlMiddleware will be enabled. You may want to
-enable it for :ref:`broad crawls <topics-broad-crawls>`.
+Whether the AjaxCrawlMiddleware will be enabled.

 HttpProxyMiddleware settings
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
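
Projects still using the middleware can keep setting ``AJAXCRAWL_ENABLED = True``
for now, but instantiation will emit the warning added below. A sketch of
silencing only this deprecation during a migration period, using the standard
``warnings`` machinery (nothing Scrapy-specific):

    import warnings

    from scrapy.exceptions import ScrapyDeprecationWarning

    # filterwarnings matches the regex against the start of the message,
    # hence the leading ".*".
    warnings.filterwarnings(
        "ignore",
        message=r".*AjaxCrawlMiddleware is deprecated",
        category=ScrapyDeprecationWarning,
    )
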
@@ -3,10 +3,11 @@ from __future__ import annotations
 import logging
 import re
 from typing import TYPE_CHECKING
+from warnings import warn

 from w3lib import html

-from scrapy.exceptions import NotConfigured
+from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
 from scrapy.http import HtmlResponse, Response

 if TYPE_CHECKING:

@@ -30,6 +31,13 @@ class AjaxCrawlMiddleware:
         if not settings.getbool("AJAXCRAWL_ENABLED"):
             raise NotConfigured

+        warn(
+            "scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware is deprecated"
+            " and will be removed in a future Scrapy version.",
+            ScrapyDeprecationWarning,
+            stacklevel=2,
+        )
+
         # XXX: Google parses at least first 100k bytes; scrapy's redirect
         # middleware parses first 4k. 4k turns out to be insufficient
         # for this middleware, and parsing 100k could be slow.
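
A quick way to see the new warning fire (a sketch; it assumes this commit is
applied and that the middleware's ``__init__`` accepts a settings object, as
the test suite below suggests):

    import warnings

    from scrapy.downloadermiddlewares.ajaxcrawl import AjaxCrawlMiddleware
    from scrapy.spiders import Spider
    from scrapy.utils.test import get_crawler

    crawler = get_crawler(Spider, {"AJAXCRAWL_ENABLED": True})
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        AjaxCrawlMiddleware(crawler.settings)

    # stacklevel=2 makes the report point at the caller of the constructor,
    # not at the warn() call inside ajaxcrawl.py.
    print(caught[0].category.__name__, "->", caught[0].message)
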
@@ -75,7 +83,6 @@ class AjaxCrawlMiddleware:
         return _has_ajaxcrawlable_meta(body)

-
 # XXX: move it to w3lib?
 _ajax_crawlable_re: re.Pattern[str] = re.compile(
     r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>'
 )
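
The private pattern above can be exercised on its own; here it is copied into
a small self-check (the regex is verbatim from the file, the sample strings
are made up):

    import re

    _ajax_crawlable_re = re.compile(
        r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>'
    )

    assert _ajax_crawlable_re.search('<meta name="fragment" content="!">')
    assert _ajax_crawlable_re.search("<meta name='fragment' content='!'/>")
    assert not _ajax_crawlable_re.search('<meta name="robots" content="noindex">')
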
@@ -1,5 +1,7 @@
 import unittest

+import pytest
+
 from scrapy.downloadermiddlewares.ajaxcrawl import AjaxCrawlMiddleware
 from scrapy.http import HtmlResponse, Request, Response
 from scrapy.spiders import Spider

@@ -8,6 +10,7 @@ from scrapy.utils.test import get_crawler
 __doctests__ = ["scrapy.downloadermiddlewares.ajaxcrawl"]


+@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
 class AjaxCrawlMiddlewareTest(unittest.TestCase):
     def setUp(self):
         crawler = get_crawler(Spider, {"AJAXCRAWL_ENABLED": True})
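
The class-level ``filterwarnings`` mark silences the new deprecation warning
for the existing tests. A hypothetical companion test asserting that the
warning is actually emitted (the test name is made up; the imports follow the
file above):

    import pytest

    from scrapy.downloadermiddlewares.ajaxcrawl import AjaxCrawlMiddleware
    from scrapy.exceptions import ScrapyDeprecationWarning
    from scrapy.spiders import Spider
    from scrapy.utils.test import get_crawler


    def test_deprecation_warning_emitted():
        crawler = get_crawler(Spider, {"AJAXCRAWL_ENABLED": True})
        with pytest.warns(
            ScrapyDeprecationWarning, match="AjaxCrawlMiddleware is deprecated"
        ):
            AjaxCrawlMiddleware(crawler.settings)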