mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-23 08:44:13 +00:00
Rename AjaxCrawlableMiddleware to AjaxCrawlMiddleware
This commit is contained in:
parent
ed6fd4933f
commit
b03fe04999
@ -132,12 +132,12 @@ Pages can indicate it in two ways:
|
||||
"main", "index" website pages.
|
||||
|
||||
Scrapy handles (1) automatically; to handle (2) enable
|
||||
:ref:`AjaxCrawlableMiddleware <ajaxcrawlable-middleware>`::
|
||||
:ref:`AjaxCrawlMiddleware <ajaxcrawl-middleware>`::
|
||||
|
||||
AJAXCRAWLABLE_ENABLED = True
|
||||
AJAXCRAWL_ENABLED = True
|
||||
|
||||
When doing broad crawls it's common to crawl a lot of "index" web pages;
|
||||
AjaxCrawlableMiddleware helps to crawl them correctly.
|
||||
AjaxCrawlMiddleware helps to crawl them correctly.
|
||||
It is turned OFF by default because it has some performance overhead,
|
||||
and enabling it for focused crawls doesn't make much sense.
|
||||
|
||||
|
@ -797,14 +797,14 @@ UserAgentMiddleware
|
||||
In order for a spider to override the default user agent, its `user_agent`
|
||||
attribute must be set.
|
||||
|
||||
.. _ajaxcrawlable-middleware:
|
||||
.. _ajaxcrawl-middleware:
|
||||
|
||||
AjaxCrawlableMiddleware
|
||||
-----------------------
|
||||
AjaxCrawlMiddleware
|
||||
-------------------
|
||||
|
||||
.. module:: scrapy.contrib.downloadermiddleware.ajaxcrawlable
|
||||
.. module:: scrapy.contrib.downloadermiddleware.ajaxcrawl
|
||||
|
||||
.. class:: AjaxCrawlableMiddleware
|
||||
.. class:: AjaxCrawlMiddleware
|
||||
|
||||
Middleware that finds 'AJAX crawlable' page variants based
|
||||
on meta-fragment html tag. See
|
||||
@ -815,22 +815,22 @@ AjaxCrawlableMiddleware
|
||||
|
||||
Scrapy finds 'AJAX crawlable' pages for URLs like
|
||||
``'http://example.com/#!foo=bar'`` even without this middleware.
|
||||
AjaxCrawlableMiddleware is necessary when URL doesn't contain ``'!#'``.
|
||||
AjaxCrawlMiddleware is necessary when the URL doesn't contain ``'#!'``.
|
||||
This is often the case for 'index' or 'main' website pages.
|
||||
|
||||
AjaxCrawlableMiddleware Settings
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
AjaxCrawlMiddleware Settings
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. setting:: AJAXCRAWLABLE_ENABLED
|
||||
.. setting:: AJAXCRAWL_ENABLED
|
||||
|
||||
AJAXCRAWLABLE_ENABLED
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
AJAXCRAWL_ENABLED
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. versionadded:: 0.21
|
||||
|
||||
Default: ``False``
|
||||
|
||||
Whether the AjaxCrawlableMiddleware will be enabled. You may want to
|
||||
Whether the AjaxCrawlMiddleware will be enabled. You may want to
|
||||
enable it for :ref:`broad crawls <topics-broad-crawls>`.
|
||||
|
||||
|
||||
|
@ -7,21 +7,21 @@ from scrapy.http import HtmlResponse
|
||||
from scrapy.utils.response import _noscript_re, _script_re
|
||||
from w3lib import html
|
||||
|
||||
class AjaxCrawlableMiddleware(object):
|
||||
class AjaxCrawlMiddleware(object):
|
||||
"""
|
||||
Handle 'AJAX crawlable' pages marked as crawlable via meta tag.
|
||||
For more info see https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
|
||||
"""
|
||||
|
||||
def __init__(self, settings):
|
||||
if not settings.getbool('AJAXCRAWLABLE_ENABLED'):
|
||||
if not settings.getbool('AJAXCRAWL_ENABLED'):
|
||||
raise NotConfigured
|
||||
|
||||
# XXX: Google parses at least first 100k bytes; scrapy's redirect
|
||||
# middleware parses first 4k. 4k turns out to be insufficient
|
||||
# for this middleware, and parsing 100k could be slow.
|
||||
# We use something in between (32K) by default.
|
||||
self.lookup_bytes = settings.getint('AJAXCRAWLABLE_MAXSIZE', 32768)
|
||||
self.lookup_bytes = settings.getint('AJAXCRAWL_MAXSIZE', 32768)
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
@ -43,13 +43,13 @@ class AjaxCrawlableMiddleware(object):
|
||||
return response
|
||||
|
||||
# scrapy already handles #! links properly
|
||||
ajax_crawlable = request.replace(url=request.url+'#!')
|
||||
log.msg(format="Downloading AJAX crawlable %(ajax_crawlable)s instead of %(request)s",
|
||||
level=log.DEBUG, spider=spider, ajax_crawlable=ajax_crawlable,
|
||||
request=request)
|
||||
ajax_crawl_request = request.replace(url=request.url+'#!')
|
||||
log.msg(format="Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
|
||||
level=log.DEBUG, spider=spider,
|
||||
ajax_crawl_request=ajax_crawl_request, request=request)
|
||||
|
||||
ajax_crawlable.meta['ajax_crawlable'] = True
|
||||
return ajax_crawlable
|
||||
ajax_crawl_request.meta['ajax_crawlable'] = True
|
||||
return ajax_crawl_request
|
||||
|
||||
def _has_ajax_crawlable_variant(self, response):
|
||||
"""
|
@ -18,7 +18,7 @@ import sys
|
||||
from importlib import import_module
|
||||
from os.path import join, abspath, dirname
|
||||
|
||||
AJAXCRAWLABLE_ENABLED = False
|
||||
AJAXCRAWL_ENABLED = False
|
||||
|
||||
BOT_NAME = 'scrapybot'
|
||||
|
||||
@ -81,7 +81,7 @@ DOWNLOADER_MIDDLEWARES_BASE = {
|
||||
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
|
||||
'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
|
||||
'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
|
||||
'scrapy.contrib.downloadermiddleware.ajaxcrawlable.AjaxCrawlableMiddleware': 560,
|
||||
'scrapy.contrib.downloadermiddleware.ajaxcrawl.AjaxCrawlMiddleware': 560,
|
||||
'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': 580,
|
||||
'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 590,
|
||||
'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
|
||||
|
@ -1,17 +1,17 @@
|
||||
import unittest
|
||||
|
||||
from scrapy.contrib.downloadermiddleware.ajaxcrawlable import AjaxCrawlableMiddleware
|
||||
from scrapy.contrib.downloadermiddleware.ajaxcrawl import AjaxCrawlMiddleware
|
||||
from scrapy.spider import BaseSpider
|
||||
from scrapy.http import Request, HtmlResponse, Response
|
||||
from scrapy.utils.test import get_crawler
|
||||
|
||||
__doctests__ = ['scrapy.contrib.downloadermiddleware.ajaxcrawlable']
|
||||
__doctests__ = ['scrapy.contrib.downloadermiddleware.ajaxcrawl']
|
||||
|
||||
class AjaxCrawlableMiddlewareTest(unittest.TestCase):
|
||||
class AjaxCrawlMiddlewareTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.spider = BaseSpider('foo')
|
||||
crawler = get_crawler({'AJAXCRAWLABLE_ENABLED': True})
|
||||
self.mw = AjaxCrawlableMiddleware.from_crawler(crawler)
|
||||
crawler = get_crawler({'AJAXCRAWL_ENABLED': True})
|
||||
self.mw = AjaxCrawlMiddleware.from_crawler(crawler)
|
||||
|
||||
def _ajaxcrawlable_body(self):
|
||||
return '<html><head><meta name="fragment" content="!"/></head><body></body></html>'
|
||||
@ -32,7 +32,7 @@ class AjaxCrawlableMiddlewareTest(unittest.TestCase):
|
||||
resp2 = self.mw.process_response(req, resp, self.spider)
|
||||
self.assertIs(resp, resp2)
|
||||
|
||||
def test_ajax_crawlable(self):
|
||||
def test_ajaxcrawl(self):
|
||||
req, resp = self._req_resp(
|
||||
'http://example.com/',
|
||||
{'meta': {'foo': 'bar'}},
|
||||
@ -42,7 +42,7 @@ class AjaxCrawlableMiddlewareTest(unittest.TestCase):
|
||||
self.assertEqual(req2.url, 'http://example.com/?_escaped_fragment_=')
|
||||
self.assertEqual(req2.meta['foo'], 'bar')
|
||||
|
||||
def test_ajax_crawlable_loop(self):
|
||||
def test_ajaxcrawl_loop(self):
|
||||
req, resp = self._req_resp('http://example.com/', {}, {'body': self._ajaxcrawlable_body()})
|
||||
req2 = self.mw.process_response(req, resp, self.spider)
|
||||
resp2 = HtmlResponse(req2.url, body=resp.body, request=req2)
|
||||
|
Loading…
x
Reference in New Issue
Block a user