mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-22 00:33:02 +00:00
180 lines
6.5 KiB
Python
180 lines
6.5 KiB
Python
from unittest import mock
|
|
|
|
from twisted.trial.unittest import TestCase
|
|
from twisted.python.failure import Failure
|
|
|
|
from scrapy.http import Request, Response
|
|
from scrapy.spiders import Spider
|
|
from scrapy.exceptions import _InvalidOutput
|
|
from scrapy.core.downloader.middleware import DownloaderMiddlewareManager
|
|
from scrapy.utils.test import get_crawler
|
|
from scrapy.utils.python import to_bytes
|
|
|
|
|
|
class ManagerTestCase(TestCase):
    """Base fixture exposing a downloader middleware manager bound to a spider.

    ``setUp`` builds a crawler from ``settings_dict``, creates a spider named
    ``'foo'`` and a ``DownloaderMiddlewareManager`` from that crawler.
    Subclasses may override ``settings_dict``.
    """

    # Passed as the second argument to get_crawler() in setUp.
    settings_dict = None

    def setUp(self):
        """Create the crawler, spider and manager, then open the spider."""
        self.crawler = get_crawler(Spider, self.settings_dict)
        self.spider = self.crawler._create_spider('foo')
        self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
        # some mw depends on stats collector
        self.crawler.stats.open_spider(self.spider)
        return self.mwman.open_spider(self.spider)

    def tearDown(self):
        """Close the stats collector and the manager for the test spider."""
        self.crawler.stats.close_spider(self.spider, '')
        return self.mwman.close_spider(self.spider)

    def _download(self, request, response=None):
        """Executes downloader mw manager's download method and returns
        the result (Request or Response) or raise exception in case of
        failure.

        The download function handed to the manager ignores its keyword
        arguments and returns ``response``. When ``response`` is omitted, a
        bare ``Response`` for ``request.url`` is used instead.
        """
        # Test identity against the default rather than truthiness, so only a
        # genuinely omitted argument triggers the fallback response.
        if response is None:
            response = Response(request.url)

        def download_func(**kwargs):
            return response

        dfd = self.mwman.download(download_func, request, self.spider)
        # catch deferred result and return the value
        results = []
        dfd.addBoth(results.append)
        self._wait(dfd)
        ret = results[0]
        if isinstance(ret, Failure):
            ret.raiseException()
        return ret
|
|
|
|
|
|
class DefaultsTest(ManagerTestCase):
    """Tests default behavior with default settings"""

    def test_request_response(self):
        """A plain 200 response must come back from the manager as a Response."""
        req = Request('http://example.com/index.html')
        resp = Response(req.url, status=200)
        ret = self._download(req, resp)
        # assertIsInstance reports the offending object on failure and matches
        # the assertion style used by the other tests in this module.
        self.assertIsInstance(ret, Response, "Non-response returned")

    def test_3xx_and_invalid_gzipped_body_must_redirect(self):
        """Regression test for a failure when redirecting a compressed
        request.

        This happens when httpcompression middleware is executed before redirect
        middleware and attempts to decompress a non-compressed body.
        In particular when some website returns a 30x response with header
        'Content-Encoding: gzip' giving as result the error below:

            exceptions.IOError: Not a gzipped file

        """
        req = Request('http://example.com')
        # Plain HTML body that is deliberately NOT gzip data, despite the
        # Content-Encoding header below.
        body = b'<p>You are being redirected</p>'
        resp = Response(req.url, status=302, body=body, headers={
            'Content-Length': str(len(body)),
            'Content-Type': 'text/html',
            'Content-Encoding': 'gzip',
            'Location': 'http://example.com/login',
        })
        ret = self._download(request=req, response=resp)
        self.assertIsInstance(ret, Request,
                              "Not redirected: {0!r}".format(ret))
        # The header value is compared as bytes, hence to_bytes() on the URL.
        self.assertEqual(to_bytes(ret.url), resp.headers['Location'],
                         "Not redirected to location header")

    def test_200_and_invalid_gzipped_body_must_fail(self):
        """A 200 response whose body is not gzip data but which is labelled
        'Content-Encoding: gzip' must make the download raise IOError.
        """
        req = Request('http://example.com')
        body = b'<p>You are being redirected</p>'
        resp = Response(req.url, status=200, body=body, headers={
            'Content-Length': str(len(body)),
            'Content-Type': 'text/html',
            'Content-Encoding': 'gzip',
            'Location': 'http://example.com/login',
        })
        self.assertRaises(IOError, self._download, request=req, response=resp)
|
|
|
|
|
|
class ResponseFromProcessRequestTest(ManagerTestCase):
    """Tests middleware returning a response from process_request."""

    def test_download_func_not_called(self):
        """The response returned by process_request is the download result,
        and the download function itself is never invoked.
        """
        canned_response = Response('http://example.com/index.html')

        class ResponseMiddleware:
            # Always answers with the canned response built above.
            def process_request(self, request, spider):
                return canned_response

        self.mwman._add_middleware(ResponseMiddleware())

        download_func = mock.MagicMock()
        request = Request('http://example.com/index.html')
        deferred = self.mwman.download(download_func, request, self.spider)

        # Capture whatever the deferred fires with, then block until it does.
        outcomes = []
        deferred.addBoth(outcomes.append)
        self._wait(deferred)

        self.assertIs(outcomes[0], canned_response)
        self.assertFalse(download_func.called)
|
|
|
|
|
|
class ProcessRequestInvalidOutput(ManagerTestCase):
    """Invalid return value for process_request method should raise an exception"""

    def test_invalid_process_request(self):
        """A process_request method returning ``1`` must make the download
        deferred fire with a Failure wrapping ``_InvalidOutput``.
        """
        req = Request('http://example.com/index.html')

        class InvalidProcessRequestMiddleware:
            # Returns a value that is neither a Request nor a Response.
            def process_request(self, request, spider):
                return 1

        self.mwman._add_middleware(InvalidProcessRequestMiddleware())
        download_func = mock.MagicMock()
        dfd = self.mwman.download(download_func, req, self.spider)
        results = []
        dfd.addBoth(results.append)
        # Wait for the deferred before indexing ``results`` so the test does
        # not depend on the deferred having fired synchronously.
        self._wait(dfd)
        self.assertIsInstance(results[0], Failure)
        self.assertIsInstance(results[0].value, _InvalidOutput)
|
|
|
|
|
|
class ProcessResponseInvalidOutput(ManagerTestCase):
    """Invalid return value for process_response method should raise an exception"""

    def test_invalid_process_response(self):
        """A process_response method returning ``1`` must make the download
        deferred fire with a Failure wrapping ``_InvalidOutput``.
        """
        req = Request('http://example.com/index.html')

        class InvalidProcessResponseMiddleware:
            # Returns a value that is neither a Request nor a Response.
            def process_response(self, request, response, spider):
                return 1

        self.mwman._add_middleware(InvalidProcessResponseMiddleware())
        download_func = mock.MagicMock()
        dfd = self.mwman.download(download_func, req, self.spider)
        results = []
        dfd.addBoth(results.append)
        # Wait for the deferred before indexing ``results`` so the test does
        # not depend on the deferred having fired synchronously.
        self._wait(dfd)
        self.assertIsInstance(results[0], Failure)
        self.assertIsInstance(results[0].value, _InvalidOutput)
|
|
|
|
|
|
class ProcessExceptionInvalidOutput(ManagerTestCase):
    """Invalid return value for process_exception method should raise an exception"""

    def test_invalid_process_exception(self):
        """A process_exception method returning ``1`` must make the download
        deferred fire with a Failure wrapping ``_InvalidOutput``.
        """
        req = Request('http://example.com/index.html')

        class InvalidProcessExceptionMiddleware:
            # Raising here is what causes process_exception to be exercised.
            def process_request(self, request, spider):
                raise Exception()

            # Returns a value that is neither a Request nor a Response.
            def process_exception(self, request, exception, spider):
                return 1

        self.mwman._add_middleware(InvalidProcessExceptionMiddleware())
        download_func = mock.MagicMock()
        dfd = self.mwman.download(download_func, req, self.spider)
        results = []
        dfd.addBoth(results.append)
        # Wait for the deferred before indexing ``results`` so the test does
        # not depend on the deferred having fired synchronously.
        self._wait(dfd)
        self.assertIsInstance(results[0], Failure)
        self.assertIsInstance(results[0].value, _InvalidOutput)
|