import unittest

from scrapy.downloadermiddlewares.ajaxcrawl import AjaxCrawlMiddleware
from scrapy.spiders import Spider
from scrapy.http import Request, HtmlResponse, Response
from scrapy.utils.test import get_crawler

__doctests__ = ['scrapy.downloadermiddlewares.ajaxcrawl']
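
# AjaxCrawlMiddleware implements the (since deprecated) Google AJAX crawling
# scheme: an HTML page that declares <meta name="fragment" content="!"> is
# re-requested with '?_escaped_fragment_=' appended to its URL. The tests
# below cover the cases where that rewrite should and should not happen.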


class AjaxCrawlMiddlewareTest(unittest.TestCase):

    def setUp(self):
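        # The middleware is opt-in: AJAXCRAWL_ENABLED must be set in the
        # crawler settings for from_crawler() to build it.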
        crawler = get_crawler(Spider, {'AJAXCRAWL_ENABLED': True})
        self.spider = crawler._create_spider('foo')
        self.mw = AjaxCrawlMiddleware.from_crawler(crawler)

    def _ajaxcrawlable_body(self):
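        # Minimal HTML body carrying the AJAX-crawlable marker the middleware
        # looks for: <meta name="fragment" content="!">.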
        return b'<html><head><meta name="fragment" content="!"/></head><body></body></html>'

    def _req_resp(self, url, req_kwargs=None, resp_kwargs=None):
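        # Build a matching Request/HtmlResponse pair for the given URL,
        # passing any extra keyword arguments through to the constructors.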
        req = Request(url, **(req_kwargs or {}))
        resp = HtmlResponse(url, request=req, **(resp_kwargs or {}))
        return req, resp

    def test_non_get(self):
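        # Only GET requests are candidates for the AJAX-crawl rewrite; a HEAD
        # request must pass through untouched.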
        req, resp = self._req_resp('http://example.com/', {'method': 'HEAD'})
        resp2 = self.mw.process_response(req, resp, self.spider)
        self.assertEqual(resp, resp2)

    def test_binary_response(self):
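        # A plain (non-HTML) Response is never inspected for the meta fragment
        # marker and is returned unchanged.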
        req = Request('http://example.com/')
        resp = Response('http://example.com/', body=b'foobar\x00\x01\x02', request=req)
        resp2 = self.mw.process_response(req, resp, self.spider)
        self.assertIs(resp, resp2)

    def test_ajaxcrawl(self):
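        # An AJAX-crawlable page is retried as a new request with
        # '_escaped_fragment_' appended; the request meta must be preserved.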
        req, resp = self._req_resp(
            'http://example.com/',
            {'meta': {'foo': 'bar'}},
            {'body': self._ajaxcrawlable_body()}
        )
        req2 = self.mw.process_response(req, resp, self.spider)
        self.assertEqual(req2.url, 'http://example.com/?_escaped_fragment_=')
        self.assertEqual(req2.meta['foo'], 'bar')

    def test_ajaxcrawl_loop(self):
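        # The rewritten '?_escaped_fragment_=' request must not be rewritten
        # again when its own response also carries the crawlable marker,
        # otherwise the middleware would loop forever.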
        req, resp = self._req_resp('http://example.com/', {}, {'body': self._ajaxcrawlable_body()})
        req2 = self.mw.process_response(req, resp, self.spider)
        resp2 = HtmlResponse(req2.url, body=resp.body, request=req2)
        resp3 = self.mw.process_response(req2, resp2, self.spider)

        assert isinstance(resp3, HtmlResponse), (resp3.__class__, resp3)
        self.assertEqual(resp3.request.url, 'http://example.com/?_escaped_fragment_=')
        assert resp3 is resp2

    def test_noncrawlable_body(self):
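        # HTML without the meta fragment marker is left alone.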
        req, resp = self._req_resp('http://example.com/', {}, {'body': b'<html></html>'})
        resp2 = self.mw.process_response(req, resp, self.spider)
        self.assertIs(resp, resp2)