Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-06 16:31:38 +00:00)

commit 393ff96e45
parent b4c2531021

Deprecate AjaxCrawlMiddleware.
docs/topics/broad-crawls.rst
@@ -182,30 +182,6 @@ To disable redirects use:
 
     REDIRECT_ENABLED = False
 
-Enable crawling of "Ajax Crawlable Pages"
-=========================================
-
-Some pages (up to 1%, based on empirical data from year 2013) declare
-themselves as ajax crawlable. This means they provide plain HTML
-version of content that is usually available only via AJAX.
-Pages can indicate it in two ways:
-
-1) by using ``#!`` in URL - this is the default way;
-2) by using a special meta tag - this way is used on
-   "main", "index" website pages.
-
-Scrapy handles (1) automatically; to handle (2) enable
-:ref:`AjaxCrawlMiddleware <ajaxcrawl-middleware>`:
-
-.. code-block:: python
-
-    AJAXCRAWL_ENABLED = True
-
-When doing broad crawls it's common to crawl a lot of "index" web pages;
-AjaxCrawlMiddleware helps to crawl them correctly.
-It is turned OFF by default because it has some performance overhead,
-and enabling it for focused crawls doesn't make much sense.
-
 .. _broad-crawls-bfo:
 
 Crawl in BFO order
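Note: the removed section documents Google's AJAX crawling scheme (retired by Google in 2015), in which a ``#!`` URL fragment is rewritten into an ``_escaped_fragment_`` query parameter so a crawler can fetch a plain-HTML snapshot. A minimal sketch of that rewrite, using only the standard library (the helper name is illustrative, not a Scrapy API):

.. code-block:: python

    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    def escape_ajax_url(url: str) -> str:
        """Rewrite a "#!" URL into its "_escaped_fragment_" form, as
        described by the removed docs (illustrative, not a Scrapy API)."""
        scheme, netloc, path, query, fragment = urlsplit(url)
        if not fragment.startswith("!"):
            return url  # no "#!" marker: page is not declared AJAX-crawlable
        params = parse_qsl(query, keep_blank_values=True)
        params.append(("_escaped_fragment_", fragment[1:]))
        return urlunsplit((scheme, netloc, path, urlencode(params), ""))

    print(escape_ajax_url("http://example.com/page#!state=1"))
    # -> http://example.com/page?_escaped_fragment_=state%3D1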
docs/topics/settings.rst
@@ -1249,8 +1249,7 @@ AJAXCRAWL_ENABLED
 
 Default: ``False``
 
-Whether the AjaxCrawlMiddleware will be enabled. You may want to
-enable it for :ref:`broad crawls <topics-broad-crawls>`.
+Whether the AjaxCrawlMiddleware will be enabled.
 
 HttpProxyMiddleware settings
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
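Until the middleware is actually removed, the setting keeps its old behavior; opting in from a project's ``settings.py`` still looks like this, and will now also emit the deprecation warning added below:

.. code-block:: python

    # settings.py: still works until removal, but new projects
    # should not rely on AjaxCrawlMiddleware.
    AJAXCRAWL_ENABLED = True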
scrapy/downloadermiddlewares/ajaxcrawl.py
@@ -3,10 +3,11 @@ from __future__ import annotations
 import logging
 import re
 from typing import TYPE_CHECKING
+from warnings import warn
 
 from w3lib import html
 
-from scrapy.exceptions import NotConfigured
+from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
 from scrapy.http import HtmlResponse, Response
 
 if TYPE_CHECKING:
@@ -30,6 +31,13 @@ class AjaxCrawlMiddleware:
         if not settings.getbool("AJAXCRAWL_ENABLED"):
             raise NotConfigured
 
+        warn(
+            "scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware is deprecated"
+            " and will be removed in a future Scrapy version.",
+            ScrapyDeprecationWarning,
+            stacklevel=2,
+        )
+
         # XXX: Google parses at least first 100k bytes; scrapy's redirect
         # middleware parses first 4k. 4k turns out to be insufficient
         # for this middleware, and parsing 100k could be slow.
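Projects that still need the middleware can filter the new warning until they migrate off it. A minimal sketch using the standard ``warnings`` machinery (the message regex is an assumption matched to the text added above):

.. code-block:: python

    import warnings

    from scrapy.exceptions import ScrapyDeprecationWarning

    # Suppress only this deprecation message (e.g. at the top of
    # settings.py) while keeping other ScrapyDeprecationWarnings visible.
    warnings.filterwarnings(
        "ignore",
        message=r".*AjaxCrawlMiddleware is deprecated.*",
        category=ScrapyDeprecationWarning,
    )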
@@ -75,7 +83,6 @@ class AjaxCrawlMiddleware:
         return _has_ajaxcrawlable_meta(body)
 
 
-# XXX: move it to w3lib?
 _ajax_crawlable_re: re.Pattern[str] = re.compile(
     r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>'
 )
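For reference, ``_ajax_crawlable_re`` matches the ``<meta name="fragment" content="!">`` tag defined by the AJAX crawling scheme. A quick self-contained check, with the regex copied verbatim from the diff:

.. code-block:: python

    import re

    _ajax_crawlable_re = re.compile(
        r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>'
    )

    # Matches both quoting styles and an optional self-closing slash.
    assert _ajax_crawlable_re.search('<meta name="fragment" content="!">')
    assert _ajax_crawlable_re.search("<meta name='fragment' content='!'/>")
    assert not _ajax_crawlable_re.search(
        '<meta name="viewport" content="width=device-width">'
    )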
tests/test_downloadermiddleware_ajaxcrawl.py
@@ -1,5 +1,7 @@
 import unittest
 
+import pytest
+
 from scrapy.downloadermiddlewares.ajaxcrawl import AjaxCrawlMiddleware
 from scrapy.http import HtmlResponse, Request, Response
 from scrapy.spiders import Spider
@@ -8,6 +10,7 @@ from scrapy.utils.test import get_crawler
 __doctests__ = ["scrapy.downloadermiddlewares.ajaxcrawl"]
 
 
+@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
 class AjaxCrawlMiddlewareTest(unittest.TestCase):
     def setUp(self):
         crawler = get_crawler(Spider, {"AJAXCRAWL_ENABLED": True})
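The ``filterwarnings`` mark keeps the test output clean now that constructing the middleware emits ``ScrapyDeprecationWarning``. An equivalent file-wide form, if preferred over decorating each class, is the module-level ``pytestmark`` hook:

.. code-block:: python

    import pytest

    # Applies the same filter to every test in this file.
    pytestmark = pytest.mark.filterwarnings(
        "ignore::scrapy.exceptions.ScrapyDeprecationWarning"
    )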