1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-06 11:00:46 +00:00
scrapy/tests/test_downloaderslotssettings.py
2025-02-02 14:10:09 +05:00

103 lines
3.1 KiB
Python

import time
from twisted.internet import defer
from twisted.trial.unittest import TestCase
from scrapy import Request
from scrapy.core.downloader import Downloader, Slot
from scrapy.crawler import CrawlerRunner
from scrapy.utils.test import get_crawler
from tests.mockserver import MockServer
from tests.spiders import MetaSpider
class DownloaderSlotsSettingsTestSpider(MetaSpider):
    """Spider whose per-domain downloader slots are tuned via DOWNLOAD_SLOTS.

    It issues two requests per slot (one with no explicit slot) and records
    the response arrival times in ``self.times`` so a test can compare the
    observed inter-request gap against the configured per-slot delay.
    """

    name = "downloader_slots"
    custom_settings = {
        "DOWNLOAD_DELAY": 1,
        "RANDOMIZE_DOWNLOAD_DELAY": False,
        "DOWNLOAD_SLOTS": {
            "quotes.toscrape.com": {
                "concurrency": 1,
                "delay": 2,
                "randomize_delay": False,
                "throttle": False,
            },
            "books.toscrape.com": {"delay": 3, "randomize_delay": False},
        },
    }

    def start_requests(self):
        # One request per configured slot, plus a final one with no
        # explicit slot (meta download_slot=None).
        slot_names = [*self.custom_settings.get("DOWNLOAD_SLOTS", {}), None]
        self.times = {name: [] for name in slot_names}
        for name in slot_names:
            target = self.mockserver.url(f"/?downloader_slot={name}")
            yield Request(target, callback=self.parse, meta={"download_slot": name})

    def parse(self, response):
        # Record the first arrival time for this slot, then issue a
        # follow-up request on the same slot.
        name = response.meta.get("download_slot", None)
        self.times[name].append(time.time())
        follow_up = self.mockserver.url(f"/?downloader_slot={name}&req=2")
        yield Request(follow_up, callback=self.not_parse, meta={"download_slot": name})

    def not_parse(self, response):
        # Terminal callback: record the second arrival time only.
        name = response.meta.get("download_slot", None)
        self.times[name].append(time.time())
class CrawlTestCase(TestCase):
    """Integration test: per-slot delays from DOWNLOAD_SLOTS are honored."""

    @classmethod
    def setUpClass(cls):
        # One mock HTTP server shared by every test in the class; entered
        # here and exited in tearDownClass (context-manager protocol used
        # manually because the lifetime spans multiple tests).
        cls.mockserver = MockServer()
        cls.mockserver.__enter__()

    @classmethod
    def tearDownClass(cls):
        cls.mockserver.__exit__(None, None, None)

    def setUp(self):
        self.runner = CrawlerRunner()

    @defer.inlineCallbacks
    def test_delay(self):
        # Run the spider to completion against the mock server.
        crawler = CrawlerRunner().create_crawler(DownloaderSlotsSettingsTestSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        slots = crawler.engine.downloader.slots
        times = crawler.spider.times
        tolerance = 0.3
        # Observed delay per slot: gap between the two timestamps the
        # spider recorded for that slot.
        delays_real = {k: v[1] - v[0] for k, v in times.items()}
        # Relative error between observed and configured delay, per slot
        # (0 means a perfect match).
        # NOTE(review): this assumes every key in the downloader's `slots`
        # dict also appears in `delays_real` — in particular that the
        # request sent without an explicit slot maps consistently between
        # the two dicts; confirm against Downloader._get_slot behavior.
        error_delta = {
            k: 1 - min(delays_real[k], v.delay) / max(delays_real[k], v.delay)
            for k, v in slots.items()
        }
        # Every slot's observed delay must be within 30% of its setting.
        self.assertTrue(max(list(error_delta.values())) < tolerance)
def test_params():
    """Every attribute configured via DOWNLOAD_SLOTS reaches the created Slot."""
    params = {
        "concurrency": 1,
        "delay": 2,
        "randomize_delay": False,
    }
    settings = {"DOWNLOAD_SLOTS": {"example.com": params}}
    crawler = get_crawler(settings_dict=settings)
    downloader = Downloader(crawler)
    downloader._slot_gc_loop.stop()  # Prevent an unclean reactor.
    request = Request("https://example.com")
    _, actual = downloader._get_slot(request, spider=None)
    # A Slot built directly from the same params serves as the reference.
    expected = Slot(**params)
    for param in params:
        exp_val = getattr(expected, param)
        act_val = getattr(actual, param)
        assert exp_val == act_val, f"Slot.{param}: {exp_val!r} != {act_val!r}"