# scrapy/tests/test_spidermiddleware_urllength.py
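#
# Tests for UrlLengthMiddleware, which filters out of the spider output any
# request whose URL is longer than the URLLENGTH_LIMIT setting, counting and
# logging each ignored request.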


from unittest import TestCase

from testfixtures import LogCapture

from scrapy.http import Request, Response
from scrapy.settings import Settings
from scrapy.spidermiddlewares.urllength import UrlLengthMiddleware
from scrapy.spiders import Spider
from scrapy.utils.test import get_crawler


class TestUrlLengthMiddleware(TestCase):
    def setUp(self):
        self.maxlength = 25
        settings = Settings({"URLLENGTH_LIMIT": self.maxlength})
        crawler = get_crawler(Spider)
        self.spider = crawler._create_spider("foo")
        self.stats = crawler.stats
        self.mw = UrlLengthMiddleware.from_settings(settings)

        # One request below the 25-character limit and one above it.
        self.response = Response("http://scrapytest.org")
        self.short_url_req = Request("http://scrapytest.org/")
        self.long_url_req = Request("http://scrapytest.org/this_is_a_long_url")
        self.reqs = [self.short_url_req, self.long_url_req]

    def process_spider_output(self):
        # Helper: run the fixture requests through the middleware and collect
        # whatever it lets through.
        return list(
            self.mw.process_spider_output(self.response, self.reqs, self.spider)
        )

    def test_middleware_works(self):
        # Only the short-URL request should make it through the middleware.
        self.assertEqual(self.process_spider_output(), [self.short_url_req])

    def test_logging(self):
        with LogCapture() as log:
            self.process_spider_output()

        # The dropped request is counted in the crawler stats ...
        ric = self.stats.get_value(
            "urllength/request_ignored_count", spider=self.spider
        )
        self.assertEqual(ric, 1)

        # ... and reported in the log.
        self.assertIn(f"Ignoring link (url length > {self.maxlength})", str(log))
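
# For context, a minimal sketch of the behaviour these tests pin down. The
# real implementation lives in scrapy/spidermiddlewares/urllength.py and may
# differ in detail; this assumes only what the assertions above establish:
#
#     def process_spider_output(self, response, result, spider):
#         for request in result:
#             if isinstance(request, Request) and len(request.url) > self.maxlength:
#                 logger.info("Ignoring link (url length > %d): %s",
#                             self.maxlength, request.url)
#                 spider.crawler.stats.inc_value(
#                     "urllength/request_ignored_count", spider=spider)
#             else:
#                 yield request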