1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-23 08:44:13 +00:00

Rename AjaxCrawlableMiddleware to AjaxCrawlMiddleware

This commit is contained in:
Mikhail Korobov 2014-01-16 23:09:37 +06:00
parent ed6fd4933f
commit b03fe04999
5 changed files with 33 additions and 33 deletions

View File

@@ -132,12 +132,12 @@ Pages can indicate it in two ways:
"main", "index" website pages.
Scrapy handles (1) automatically; to handle (2) enable
:ref:`AjaxCrawlableMiddleware <ajaxcrawlable-middleware>`::
:ref:`AjaxCrawlMiddleware <ajaxcrawl-middleware>`::
AJAXCRAWLABLE_ENABLED = True
AJAXCRAWL_ENABLED = True
When doing broad crawls it's common to crawl a lot of "index" web pages;
AjaxCrawlableMiddleware helps to crawl them correctly.
AjaxCrawlMiddleware helps to crawl them correctly.
It is turned OFF by default because it has some performance overhead,
and enabling it for focused crawls doesn't make much sense.

View File

@@ -797,14 +797,14 @@ UserAgentMiddleware
In order for a spider to override the default user agent, its `user_agent`
attribute must be set.
.. _ajaxcrawlable-middleware:
.. _ajaxcrawl-middleware:
AjaxCrawlableMiddleware
-----------------------
AjaxCrawlMiddleware
-------------------
.. module:: scrapy.contrib.downloadermiddleware.ajaxcrawlable
.. module:: scrapy.contrib.downloadermiddleware.ajaxcrawl
.. class:: AjaxCrawlableMiddleware
.. class:: AjaxCrawlMiddleware
Middleware that finds 'AJAX crawlable' page variants based
on meta-fragment html tag. See
@@ -815,22 +815,22 @@ AjaxCrawlableMiddleware
Scrapy finds 'AJAX crawlable' pages for URLs like
``'http://example.com/!#foo=bar'`` even without this middleware.
AjaxCrawlableMiddleware is necessary when URL doesn't contain ``'!#'``.
AjaxCrawlMiddleware is necessary when URL doesn't contain ``'!#'``.
This is often a case for 'index' or 'main' website pages.
AjaxCrawlableMiddleware Settings
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AjaxCrawlMiddleware Settings
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. setting:: AJAXCRAWLABLE_ENABLED
.. setting:: AJAXCRAWL_ENABLED
AJAXCRAWLABLE_ENABLED
^^^^^^^^^^^^^^^^^^^^^
AJAXCRAWL_ENABLED
^^^^^^^^^^^^^^^^^
.. versionadded:: 0.21
Default: ``False``
Whether the AjaxCrawlableMiddleware will be enabled. You may want to
Whether the AjaxCrawlMiddleware will be enabled. You may want to
enable it for :ref:`broad crawls <topics-broad-crawls>`.

View File

@@ -7,21 +7,21 @@ from scrapy.http import HtmlResponse
from scrapy.utils.response import _noscript_re, _script_re
from w3lib import html
class AjaxCrawlableMiddleware(object):
class AjaxCrawlMiddleware(object):
"""
Handle 'AJAX crawlable' pages marked as crawlable via meta tag.
For more info see https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
"""
def __init__(self, settings):
if not settings.getbool('AJAXCRAWLABLE_ENABLED'):
if not settings.getbool('AJAXCRAWL_ENABLED'):
raise NotConfigured
# XXX: Google parses at least first 100k bytes; scrapy's redirect
# middleware parses first 4k. 4k turns out to be insufficient
# for this middleware, and parsing 100k could be slow.
# We use something in between (32K) by default.
self.lookup_bytes = settings.getint('AJAXCRAWLABLE_MAXSIZE', 32768)
self.lookup_bytes = settings.getint('AJAXCRAWL_MAXSIZE', 32768)
@classmethod
def from_crawler(cls, crawler):
@@ -43,13 +43,13 @@ class AjaxCrawlableMiddleware(object):
return response
# scrapy already handles #! links properly
ajax_crawlable = request.replace(url=request.url+'#!')
log.msg(format="Downloading AJAX crawlable %(ajax_crawlable)s instead of %(request)s",
level=log.DEBUG, spider=spider, ajax_crawlable=ajax_crawlable,
request=request)
ajax_crawl_request = request.replace(url=request.url+'#!')
log.msg(format="Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
level=log.DEBUG, spider=spider,
ajax_crawl_request=ajax_crawl_request, request=request)
ajax_crawlable.meta['ajax_crawlable'] = True
return ajax_crawlable
ajax_crawl_request.meta['ajax_crawlable'] = True
return ajax_crawl_request
def _has_ajax_crawlable_variant(self, response):
"""

View File

@@ -18,7 +18,7 @@ import sys
from importlib import import_module
from os.path import join, abspath, dirname
AJAXCRAWLABLE_ENABLED = False
AJAXCRAWL_ENABLED = False
BOT_NAME = 'scrapybot'
@@ -81,7 +81,7 @@ DOWNLOADER_MIDDLEWARES_BASE = {
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
'scrapy.contrib.downloadermiddleware.ajaxcrawlable.AjaxCrawlableMiddleware': 560,
'scrapy.contrib.downloadermiddleware.ajaxcrawl.AjaxCrawlMiddleware': 560,
'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': 580,
'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 590,
'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,

View File

@@ -1,17 +1,17 @@
import unittest
from scrapy.contrib.downloadermiddleware.ajaxcrawlable import AjaxCrawlableMiddleware
from scrapy.contrib.downloadermiddleware.ajaxcrawl import AjaxCrawlMiddleware
from scrapy.spider import BaseSpider
from scrapy.http import Request, HtmlResponse, Response
from scrapy.utils.test import get_crawler
__doctests__ = ['scrapy.contrib.downloadermiddleware.ajaxcrawlable']
__doctests__ = ['scrapy.contrib.downloadermiddleware.ajaxcrawl']
class AjaxCrawlableMiddlewareTest(unittest.TestCase):
class AjaxCrawlMiddlewareTest(unittest.TestCase):
def setUp(self):
self.spider = BaseSpider('foo')
crawler = get_crawler({'AJAXCRAWLABLE_ENABLED': True})
self.mw = AjaxCrawlableMiddleware.from_crawler(crawler)
crawler = get_crawler({'AJAXCRAWL_ENABLED': True})
self.mw = AjaxCrawlMiddleware.from_crawler(crawler)
def _ajaxcrawlable_body(self):
return '<html><head><meta name="fragment" content="!"/></head><body></body></html>'
@@ -32,7 +32,7 @@ class AjaxCrawlableMiddlewareTest(unittest.TestCase):
resp2 = self.mw.process_response(req, resp, self.spider)
self.assertIs(resp, resp2)
def test_ajax_crawlable(self):
def test_ajaxcrawl(self):
req, resp = self._req_resp(
'http://example.com/',
{'meta': {'foo': 'bar'}},
@@ -42,7 +42,7 @@ class AjaxCrawlableMiddlewareTest(unittest.TestCase):
self.assertEqual(req2.url, 'http://example.com/?_escaped_fragment_=')
self.assertEqual(req2.meta['foo'], 'bar')
def test_ajax_crawlable_loop(self):
def test_ajaxcrawl_loop(self):
req, resp = self._req_resp('http://example.com/', {}, {'body': self._ajaxcrawlable_body()})
req2 = self.mw.process_response(req, resp, self.spider)
resp2 = HtmlResponse(req2.url, body=resp.body, request=req2)