# 1
# 0
# mirror of https://github.com/scrapy/scrapy.git synced 2025-02-06 10:24:24 +00:00
# scrapy/tests/test_spidermiddleware.py

# 545 lines
# 19 KiB
# Python

from __future__ import annotations
from collections.abc import AsyncIterator, Iterable
from unittest import mock
from testfixtures import LogCapture
from twisted.internet import defer
from twisted.python.failure import Failure
from twisted.trial.unittest import TestCase
from scrapy.core.spidermw import SpiderMiddlewareManager
from scrapy.exceptions import _InvalidOutput
from scrapy.http import Request, Response
from scrapy.spiders import Spider
from scrapy.utils.asyncgen import collect_asyncgen
from scrapy.utils.defer import deferred_from_coro, maybe_deferred_to_future
from scrapy.utils.test import get_crawler
class SpiderMiddlewareTestCase(TestCase):
    """Common fixture for the spider middleware manager tests.

    ``setUp`` prepares a request/response pair, a crawler whose
    ``SPIDER_MIDDLEWARES_BASE`` setting is an empty dict, a spider named
    ``"foo"`` and a ``SpiderMiddlewareManager`` built from that crawler.
    """

    def setUp(self):
        """Create the request, response, crawler, spider and manager."""
        settings = {"SPIDER_MIDDLEWARES_BASE": {}}
        self.request = Request("http://example.com/index.html")
        self.response = Response(self.request.url, request=self.request)
        self.crawler = get_crawler(Spider, settings)
        self.spider = self.crawler._create_spider("foo")
        self.mwman = SpiderMiddlewareManager.from_crawler(self.crawler)

    def _scrape_response(self):
        """Run the manager's ``scrape_response`` and return what its deferred fired with.

        The scrape function is a ``MagicMock``. Because the outcome is
        captured with ``addBoth``, the returned object is either the success
        value or the ``Failure`` describing the error; nothing is raised here.
        """
        captured = []
        dfd = self.mwman.scrape_response(
            mock.MagicMock(), self.response, self.request, self.spider
        )
        # addBoth records success values and failures alike.
        dfd.addBoth(captured.append)
        self._wait(dfd)
        return captured[0]
class ProcessSpiderInputInvalidOutput(SpiderMiddlewareTestCase):
    """Invalid return value for process_spider_input method"""

    def test_invalid_process_spider_input(self):
        """A process_spider_input that returns ``1`` must end in an _InvalidOutput failure."""

        class InvalidProcessSpiderInputMiddleware:
            def process_spider_input(self, response, spider):
                return 1

        middleware = InvalidProcessSpiderInputMiddleware()
        self.mwman._add_middleware(middleware)

        outcome = self._scrape_response()

        self.assertIsInstance(outcome, Failure)
        self.assertIsInstance(outcome.value, _InvalidOutput)
class ProcessSpiderOutputInvalidOutput(SpiderMiddlewareTestCase):
    """Invalid return value for process_spider_output method"""

    def test_invalid_process_spider_output(self):
        """A process_spider_output that returns ``1`` must end in an _InvalidOutput failure."""

        class InvalidProcessSpiderOutputMiddleware:
            def process_spider_output(self, response, result, spider):
                return 1

        middleware = InvalidProcessSpiderOutputMiddleware()
        self.mwman._add_middleware(middleware)

        outcome = self._scrape_response()

        self.assertIsInstance(outcome, Failure)
        self.assertIsInstance(outcome.value, _InvalidOutput)
class ProcessSpiderExceptionInvalidOutput(SpiderMiddlewareTestCase):
    """Invalid return value for process_spider_exception method"""

    def test_invalid_process_spider_exception(self):
        """An exception handler that returns ``1`` must end in an _InvalidOutput failure."""

        class InvalidProcessSpiderOutputExceptionMiddleware:
            def process_spider_exception(self, response, exception, spider):
                return 1

        class RaiseExceptionProcessSpiderOutputMiddleware:
            def process_spider_output(self, response, result, spider):
                raise RuntimeError

        # Registration order is kept as in the original: handler first, raiser second.
        for middleware in (
            InvalidProcessSpiderOutputExceptionMiddleware(),
            RaiseExceptionProcessSpiderOutputMiddleware(),
        ):
            self.mwman._add_middleware(middleware)

        outcome = self._scrape_response()

        self.assertIsInstance(outcome, Failure)
        self.assertIsInstance(outcome.value, _InvalidOutput)
class ProcessSpiderExceptionReRaise(SpiderMiddlewareTestCase):
    """Re raise the exception by returning None"""

    def test_process_spider_exception_return_none(self):
        """When the handler returns ``None`` the original ZeroDivisionError must surface."""

        class ProcessSpiderExceptionReturnNoneMiddleware:
            def process_spider_exception(self, response, exception, spider):
                return None

        class RaiseExceptionProcessSpiderOutputMiddleware:
            def process_spider_output(self, response, result, spider):
                # Deliberately triggers ZeroDivisionError.
                1 / 0

        # Registration order is kept as in the original: handler first, raiser second.
        for middleware in (
            ProcessSpiderExceptionReturnNoneMiddleware(),
            RaiseExceptionProcessSpiderOutputMiddleware(),
        ):
            self.mwman._add_middleware(middleware)

        outcome = self._scrape_response()

        self.assertIsInstance(outcome, Failure)
        self.assertIsInstance(outcome.value, ZeroDivisionError)
class BaseAsyncSpiderMiddlewareTestCase(SpiderMiddlewareTestCase):
    """Helpers for testing sync, async and mixed middlewares.

    Should work for process_spider_output and, when it's supported,
    process_start_requests.
    """

    # Type (or tuple of types) the first returned object is checked against.
    ITEM_TYPE: type | tuple
    # To simplify checks, let everything return 3 objects.
    RESULT_COUNT = 3

    @staticmethod
    def _construct_mw_setting(*mw_classes, start_index: int | None = None):
        """Map each middleware class to a consecutive integer.

        Numbering starts at ``start_index`` (10 when it is ``None``) and
        follows the order of ``mw_classes``.
        """
        first = 10 if start_index is None else start_index
        return {
            mw_cls: position
            for position, mw_cls in enumerate(mw_classes, start=first)
        }

    def _scrape_func(self, *args, **kwargs):
        """Sync generator yielding ``{"foo": 1}``, ``{"foo": 2}``, ``{"foo": 3}``."""
        for number in (1, 2, 3):
            yield {"foo": number}

    @defer.inlineCallbacks
    def _get_middleware_result(self, *mw_classes, start_index: int | None = None):
        """Rebuild crawler, spider and manager with ``mw_classes`` configured, then scrape.

        The deferred fires with whatever ``scrape_response`` produces for
        ``self._scrape_func``.
        """
        settings = {
            "SPIDER_MIDDLEWARES_BASE": {},
            "SPIDER_MIDDLEWARES": self._construct_mw_setting(
                *mw_classes, start_index=start_index
            ),
        }
        self.crawler = get_crawler(Spider, settings)
        self.spider = self.crawler._create_spider("foo")
        self.mwman = SpiderMiddlewareManager.from_crawler(self.crawler)
        output = yield self.mwman.scrape_response(
            self._scrape_func, self.response, self.request, self.spider
        )
        return output

    @defer.inlineCallbacks
    def _test_simple_base(
        self, *mw_classes, downgrade: bool = False, start_index: int | None = None
    ):
        """Expect a sync iterable of ``RESULT_COUNT`` objects from the middleware chain.

        ``downgrade`` states whether the captured log is expected to contain
        the "downgraded to a non-async" message.
        """
        with LogCapture() as captured:
            output = yield self._get_middleware_result(
                *mw_classes, start_index=start_index
            )
        self.assertIsInstance(output, Iterable)
        items = list(output)
        self.assertEqual(len(items), self.RESULT_COUNT)
        self.assertIsInstance(items[0], self.ITEM_TYPE)
        saw_downgrade = "downgraded to a non-async" in str(captured)
        self.assertEqual(saw_downgrade, downgrade)

    @defer.inlineCallbacks
    def _test_asyncgen_base(
        self, *mw_classes, downgrade: bool = False, start_index: int | None = None
    ):
        """Expect an async iterator of ``RESULT_COUNT`` objects from the middleware chain.

        ``downgrade`` states whether the captured log is expected to contain
        the "downgraded to a non-async" message.
        """
        with LogCapture() as captured:
            output = yield self._get_middleware_result(
                *mw_classes, start_index=start_index
            )
        self.assertIsInstance(output, AsyncIterator)
        # Drain the async iterator into a list before inspecting it.
        items = yield deferred_from_coro(collect_asyncgen(output))
        self.assertEqual(len(items), self.RESULT_COUNT)
        self.assertIsInstance(items[0], self.ITEM_TYPE)
        saw_downgrade = "downgraded to a non-async" in str(captured)
        self.assertEqual(saw_downgrade, downgrade)
class ProcessSpiderOutputSimpleMiddleware:
    """Middleware stub whose process_spider_output is a plain (sync) generator."""

    def process_spider_output(self, response, result, spider):
        # Pass every element of ``result`` through unchanged.
        yield from result
class ProcessSpiderOutputAsyncGenMiddleware:
    """Middleware stub whose process_spider_output is an async generator."""

    async def process_spider_output(self, response, result, spider):
        # Pass every element of the async iterable ``result`` through unchanged.
        async for r in result:
            yield r
class ProcessSpiderOutputUniversalMiddleware:
    """Middleware stub offering both a sync and an async output method.

    ``process_spider_output`` is a sync generator and
    ``process_spider_output_async`` is an async generator; both pass
    ``result`` through unchanged.
    """

    def process_spider_output(self, response, result, spider):
        yield from result

    async def process_spider_output_async(self, response, result, spider):
        async for r in result:
            yield r
class ProcessSpiderExceptionSimpleIterableMiddleware:
    """Middleware stub whose exception handler is a sync generator of three dicts."""

    def process_spider_exception(self, response, exception, spider):
        """Yield ``{"foo": 1}``, ``{"foo": 2}`` and ``{"foo": 3}``, ignoring the arguments."""
        for number in (1, 2, 3):
            yield {"foo": number}
class ProcessSpiderExceptionAsyncIterableMiddleware:
    """Middleware stub whose exception handler is an async generator of three dicts."""

    async def process_spider_exception(self, response, exception, spider):
        """Yield ``{"foo": 1}``, await a deferred fired via the reactor, then yield 2 and 3."""
        yield {"foo": 1}

        # Kept as a function-level import like the original
        # (NOTE(review): presumably to avoid importing the reactor at module
        # import time; confirm).
        from twisted.internet import reactor

        pause = defer.Deferred()
        # Fire the deferred from a reactor call scheduled with zero delay, so
        # the generator really awaits between items.
        reactor.callLater(0, pause.callback, None)
        await maybe_deferred_to_future(pause)

        for number in (2, 3):
            yield {"foo": number}
class ProcessSpiderOutputSimple(BaseAsyncSpiderMiddlewareTestCase):
    """process_spider_output tests for simple callbacks"""

    ITEM_TYPE = dict
    MW_SIMPLE = ProcessSpiderOutputSimpleMiddleware
    MW_ASYNCGEN = ProcessSpiderOutputAsyncGenMiddleware
    MW_UNIVERSAL = ProcessSpiderOutputUniversalMiddleware

    def test_simple(self):
        """Simple mw"""
        return self._test_simple_base(self.MW_SIMPLE)

    def test_asyncgen(self):
        """Asyncgen mw; upgrade"""
        return self._test_asyncgen_base(self.MW_ASYNCGEN)

    def test_simple_asyncgen(self):
        """Simple mw -> asyncgen mw; upgrade"""
        middlewares = (self.MW_ASYNCGEN, self.MW_SIMPLE)
        return self._test_asyncgen_base(*middlewares)

    def test_asyncgen_simple(self):
        """Asyncgen mw -> simple mw; upgrade then downgrade"""
        middlewares = (self.MW_SIMPLE, self.MW_ASYNCGEN)
        return self._test_simple_base(*middlewares, downgrade=True)

    def test_universal(self):
        """Universal mw"""
        return self._test_simple_base(self.MW_UNIVERSAL)

    def test_universal_simple(self):
        """Universal mw -> simple mw"""
        middlewares = (self.MW_SIMPLE, self.MW_UNIVERSAL)
        return self._test_simple_base(*middlewares)

    def test_simple_universal(self):
        """Simple mw -> universal mw"""
        middlewares = (self.MW_UNIVERSAL, self.MW_SIMPLE)
        return self._test_simple_base(*middlewares)

    def test_universal_asyncgen(self):
        """Universal mw -> asyncgen mw; upgrade"""
        middlewares = (self.MW_ASYNCGEN, self.MW_UNIVERSAL)
        return self._test_asyncgen_base(*middlewares)

    def test_asyncgen_universal(self):
        """Asyncgen mw -> universal mw; upgrade"""
        middlewares = (self.MW_UNIVERSAL, self.MW_ASYNCGEN)
        return self._test_asyncgen_base(*middlewares)
class ProcessSpiderOutputAsyncGen(ProcessSpiderOutputSimple):
    """process_spider_output tests for async generator callbacks"""

    async def _scrape_func(self, *args, **kwargs):
        """Async-generator version of the parent's callback, yielding the same items."""
        sync_items = super()._scrape_func()
        for produced in sync_items:
            yield produced

    def test_simple(self):
        """Simple mw; downgrade"""
        return self._test_simple_base(self.MW_SIMPLE, downgrade=True)

    def test_simple_asyncgen(self):
        """Simple mw -> asyncgen mw; downgrade then upgrade"""
        middlewares = (self.MW_ASYNCGEN, self.MW_SIMPLE)
        return self._test_asyncgen_base(*middlewares, downgrade=True)

    def test_universal(self):
        """Universal mw"""
        return self._test_asyncgen_base(self.MW_UNIVERSAL)

    def test_universal_simple(self):
        """Universal mw -> simple mw; downgrade"""
        middlewares = (self.MW_SIMPLE, self.MW_UNIVERSAL)
        return self._test_simple_base(*middlewares, downgrade=True)

    def test_simple_universal(self):
        """Simple mw -> universal mw; downgrade"""
        middlewares = (self.MW_UNIVERSAL, self.MW_SIMPLE)
        return self._test_simple_base(*middlewares, downgrade=True)
class ProcessSpiderOutputNonIterableMiddleware:
    """Middleware stub whose process_spider_output returns ``None`` instead of an iterable."""

    def process_spider_output(self, response, result, spider):
        # Bare return: the method produces None, not an iterable.
        return
class ProcessSpiderOutputCoroutineMiddleware:
    """Middleware stub whose process_spider_output is a coroutine function.

    It uses ``return`` rather than ``yield``, so it is not an async generator.
    """

    async def process_spider_output(self, response, result, spider):
        return result
class ProcessSpiderOutputInvalidResult(BaseAsyncSpiderMiddlewareTestCase):
    """process_spider_output implementations with an unacceptable return kind."""

    @defer.inlineCallbacks
    def test_non_iterable(self):
        """Returning ``None`` must raise _InvalidOutput mentioning ``NoneType``."""
        expected_message = (
            r"\.process_spider_output must return an iterable, got <class "
            r"'NoneType'>"
        )
        with self.assertRaisesRegex(_InvalidOutput, expected_message):
            yield self._get_middleware_result(
                ProcessSpiderOutputNonIterableMiddleware
            )

    @defer.inlineCallbacks
    def test_coroutine(self):
        """A coroutine-function output method must raise _InvalidOutput."""
        expected_message = (
            r"\.process_spider_output must be an asynchronous generator"
        )
        with self.assertRaisesRegex(_InvalidOutput, expected_message):
            yield self._get_middleware_result(
                ProcessSpiderOutputCoroutineMiddleware
            )
class ProcessStartRequestsSimpleMiddleware:
    """Middleware stub whose process_start_requests is a plain (sync) generator."""

    def process_start_requests(self, start_requests, spider):
        # Pass every element of ``start_requests`` through unchanged.
        yield from start_requests
class ProcessStartRequestsSimple(BaseAsyncSpiderMiddlewareTestCase):
    """process_start_requests tests for simple start_requests"""

    ITEM_TYPE = (Request, dict)
    MW_SIMPLE = ProcessStartRequestsSimpleMiddleware

    def _start_requests(self):
        """Yield two requests followed by one dict item (three objects in total)."""
        for index in (0, 1):
            yield Request(f"https://example.com/{index}", dont_filter=True)
        yield {"name": "test item"}

    @defer.inlineCallbacks
    def _get_middleware_result(self, *mw_classes, start_index: int | None = None):
        """Rebuild crawler, spider and manager, then run ``process_start_requests``.

        The deferred fires with whatever the manager's
        ``process_start_requests`` produces for ``self._start_requests()``.
        """
        settings = {
            "SPIDER_MIDDLEWARES_BASE": {},
            "SPIDER_MIDDLEWARES": self._construct_mw_setting(
                *mw_classes, start_index=start_index
            ),
        }
        self.crawler = get_crawler(Spider, settings)
        self.spider = self.crawler._create_spider("foo")
        self.mwman = SpiderMiddlewareManager.from_crawler(self.crawler)
        seed = iter(self._start_requests())
        processed = yield self.mwman.process_start_requests(seed, self.spider)
        return processed

    def test_simple(self):
        """Simple mw"""
        return self._test_simple_base(self.MW_SIMPLE)
class UniversalMiddlewareNoSync:
    """Stub defining only process_spider_output_async, with no sync process_spider_output."""

    async def process_spider_output_async(self, response, result, spider):
        # Bare yield: makes this an async generator; the yielded value is None.
        yield
class UniversalMiddlewareBothSync:
    """Stub where both output methods, including the ``_async`` one, are sync generators."""

    def process_spider_output(self, response, result, spider):
        yield

    def process_spider_output_async(self, response, result, spider):
        # Declared with plain ``def``: a sync generator despite the name.
        yield
class UniversalMiddlewareBothAsync:
    """Stub where both output methods, including the non-``_async`` one, are async generators."""

    async def process_spider_output(self, response, result, spider):
        # Declared with ``async def``: an async generator despite the name.
        yield

    async def process_spider_output_async(self, response, result, spider):
        yield
class UniversalMiddlewareManagerTest(TestCase):
    """Checks what ``_add_middleware`` stores under ``methods["process_spider_output"]``.

    Each test adds one middleware class to a fresh manager and inspects the
    first entry of ``methods["process_spider_output"]``; the misconfigured
    variants are also expected to log a specific message.
    """

    def setUp(self):
        """Create a bare manager for each test."""
        self.mwman = SpiderMiddlewareManager()

    def test_simple_mw(self):
        """A sync-only middleware registers its process_spider_output."""
        mw = ProcessSpiderOutputSimpleMiddleware
        self.mwman._add_middleware(mw)
        self.assertEqual(
            self.mwman.methods["process_spider_output"][0], mw.process_spider_output
        )

    def test_async_mw(self):
        """An async-generator-only middleware registers its process_spider_output."""
        mw = ProcessSpiderOutputAsyncGenMiddleware
        self.mwman._add_middleware(mw)
        self.assertEqual(
            self.mwman.methods["process_spider_output"][0], mw.process_spider_output
        )

    def test_universal_mw(self):
        """A universal middleware registers a (sync, async) pair of methods."""
        mw = ProcessSpiderOutputUniversalMiddleware
        self.mwman._add_middleware(mw)
        self.assertEqual(
            self.mwman.methods["process_spider_output"][0],
            (mw.process_spider_output, mw.process_spider_output_async),
        )

    def test_universal_mw_no_sync(self):
        """Only the ``_async`` method present: a message is logged and None is stored."""
        with LogCapture() as log:
            self.mwman._add_middleware(UniversalMiddlewareNoSync)
        self.assertIn(
            "UniversalMiddlewareNoSync has process_spider_output_async"
            " without process_spider_output",
            str(log),
        )
        # Identity check: the stored entry must be None itself, not merely
        # something that compares equal to it.
        self.assertIsNone(self.mwman.methods["process_spider_output"][0])

    def test_universal_mw_both_sync(self):
        """A sync ``_async`` method: a message is logged and only the sync method is stored."""
        mw = UniversalMiddlewareBothSync
        with LogCapture() as log:
            self.mwman._add_middleware(mw)
        self.assertIn(
            "UniversalMiddlewareBothSync.process_spider_output_async "
            "is not an async generator function",
            str(log),
        )
        self.assertEqual(
            self.mwman.methods["process_spider_output"][0], mw.process_spider_output
        )

    def test_universal_mw_both_async(self):
        """Both methods async: a message is logged and None is stored."""
        with LogCapture() as log:
            self.mwman._add_middleware(UniversalMiddlewareBothAsync)
        self.assertIn(
            "UniversalMiddlewareBothAsync.process_spider_output "
            "is an async generator function while process_spider_output_async exists",
            str(log),
        )
        # Identity check: the stored entry must be None itself.
        self.assertIsNone(self.mwman.methods["process_spider_output"][0])
class BuiltinMiddlewareSimpleTest(BaseAsyncSpiderMiddlewareTestCase):
    """Custom middlewares alongside the default base middlewares, sync callback.

    Unlike the parent helper, the crawler here is created without overriding
    ``SPIDER_MIDDLEWARES_BASE``, so that setting keeps its default value.
    Tests pass ``start_index=1000`` or leave the default numbering to vary
    the custom middleware's number.
    """

    ITEM_TYPE = dict
    MW_SIMPLE = ProcessSpiderOutputSimpleMiddleware
    MW_ASYNCGEN = ProcessSpiderOutputAsyncGenMiddleware
    MW_UNIVERSAL = ProcessSpiderOutputUniversalMiddleware

    @defer.inlineCallbacks
    def _get_middleware_result(self, *mw_classes, start_index: int | None = None):
        """Rebuild crawler, spider and manager with only SPIDER_MIDDLEWARES set, then scrape."""
        settings = {
            "SPIDER_MIDDLEWARES": self._construct_mw_setting(
                *mw_classes, start_index=start_index
            ),
        }
        self.crawler = get_crawler(Spider, settings)
        self.spider = self.crawler._create_spider("foo")
        self.mwman = SpiderMiddlewareManager.from_crawler(self.crawler)
        output = yield self.mwman.scrape_response(
            self._scrape_func, self.response, self.request, self.spider
        )
        return output

    def test_just_builtin(self):
        """No custom middleware at all."""
        return self._test_simple_base()

    def test_builtin_simple(self):
        """Simple mw numbered from 1000."""
        custom = (self.MW_SIMPLE,)
        return self._test_simple_base(*custom, start_index=1000)

    def test_builtin_async(self):
        """Upgrade"""
        custom = (self.MW_ASYNCGEN,)
        return self._test_asyncgen_base(*custom, start_index=1000)

    def test_builtin_universal(self):
        """Universal mw numbered from 1000."""
        custom = (self.MW_UNIVERSAL,)
        return self._test_simple_base(*custom, start_index=1000)

    def test_simple_builtin(self):
        """Simple mw with the default numbering."""
        return self._test_simple_base(self.MW_SIMPLE)

    def test_async_builtin(self):
        """Upgrade"""
        return self._test_asyncgen_base(self.MW_ASYNCGEN)

    def test_universal_builtin(self):
        """Universal mw with the default numbering."""
        return self._test_simple_base(self.MW_UNIVERSAL)
class BuiltinMiddlewareAsyncGenTest(BuiltinMiddlewareSimpleTest):
    """Same scenarios as the parent class, but with an async generator callback."""

    async def _scrape_func(self, *args, **kwargs):
        """Async-generator version of the inherited callback, yielding the same items."""
        sync_items = super()._scrape_func()
        for produced in sync_items:
            yield produced

    def test_just_builtin(self):
        """No custom middleware at all; output stays async."""
        return self._test_asyncgen_base()

    def test_builtin_simple(self):
        """Downgrade"""
        custom = (self.MW_SIMPLE,)
        return self._test_simple_base(*custom, downgrade=True, start_index=1000)

    def test_builtin_async(self):
        """Asyncgen mw numbered from 1000."""
        custom = (self.MW_ASYNCGEN,)
        return self._test_asyncgen_base(*custom, start_index=1000)

    def test_builtin_universal(self):
        """Universal mw numbered from 1000."""
        custom = (self.MW_UNIVERSAL,)
        return self._test_asyncgen_base(*custom, start_index=1000)

    def test_simple_builtin(self):
        """Downgrade"""
        return self._test_simple_base(self.MW_SIMPLE, downgrade=True)

    def test_async_builtin(self):
        """Asyncgen mw with the default numbering."""
        return self._test_asyncgen_base(self.MW_ASYNCGEN)

    def test_universal_builtin(self):
        """Universal mw with the default numbering."""
        return self._test_asyncgen_base(self.MW_UNIVERSAL)
class ProcessSpiderExceptionTest(BaseAsyncSpiderMiddlewareTestCase):
    """process_spider_exception handlers that return sync or async iterables.

    The callback always fails with ZeroDivisionError, so the returned objects
    come from the exception-handling middleware.
    """

    ITEM_TYPE = dict
    MW_SIMPLE = ProcessSpiderOutputSimpleMiddleware
    MW_ASYNCGEN = ProcessSpiderOutputAsyncGenMiddleware
    MW_UNIVERSAL = ProcessSpiderOutputUniversalMiddleware
    MW_EXC_SIMPLE = ProcessSpiderExceptionSimpleIterableMiddleware
    MW_EXC_ASYNCGEN = ProcessSpiderExceptionAsyncIterableMiddleware

    def _scrape_func(self, *args, **kwargs):
        """Callback that fails immediately with ZeroDivisionError."""
        1 / 0

    @defer.inlineCallbacks
    def _test_asyncgen_nodowngrade(self, *mw_classes):
        """Expect _InvalidOutput stating that an async iterable cannot be downgraded."""
        expected_message = "Async iterable returned from .+ cannot be downgraded"
        with self.assertRaisesRegex(_InvalidOutput, expected_message):
            yield self._get_middleware_result(*mw_classes)

    def test_exc_simple(self):
        """Simple exc mw"""
        return self._test_simple_base(self.MW_EXC_SIMPLE)

    def test_exc_async(self):
        """Async exc mw"""
        return self._test_asyncgen_base(self.MW_EXC_ASYNCGEN)

    def test_exc_simple_simple(self):
        """Simple exc mw -> simple output mw"""
        middlewares = (self.MW_SIMPLE, self.MW_EXC_SIMPLE)
        return self._test_simple_base(*middlewares)

    def test_exc_async_async(self):
        """Async exc mw -> async output mw"""
        middlewares = (self.MW_ASYNCGEN, self.MW_EXC_ASYNCGEN)
        return self._test_asyncgen_base(*middlewares)

    def test_exc_simple_async(self):
        """Simple exc mw -> async output mw; upgrade"""
        middlewares = (self.MW_ASYNCGEN, self.MW_EXC_SIMPLE)
        return self._test_asyncgen_base(*middlewares)

    def test_exc_async_simple(self):
        """Async exc mw -> simple output mw; cannot work as downgrading is not supported"""
        middlewares = (self.MW_SIMPLE, self.MW_EXC_ASYNCGEN)
        return self._test_asyncgen_nodowngrade(*middlewares)