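"""Tests for scrapy.crawler: the Crawler, CrawlerRunner and CrawlerProcess APIs."""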
import logging
import warnings

from twisted.internet import defer
from twisted.trial import unittest
from pytest import raises

import scrapy
from scrapy.crawler import Crawler, CrawlerRunner, CrawlerProcess
from scrapy.settings import Settings, default_settings
from scrapy.spiderloader import SpiderLoader
from scrapy.utils.log import configure_logging, get_scrapy_root_handler
from scrapy.utils.spider import DefaultSpider
from scrapy.utils.misc import load_object
from scrapy.extensions.throttle import AutoThrottle
from scrapy.extensions import telnet


class BaseCrawlerTest(unittest.TestCase):
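    """Shared base class: asserts that a given setting still has its default value."""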

    def assertOptionIsDefault(self, settings, key):
        self.assertIsInstance(settings, Settings)
        self.assertEqual(settings[key], getattr(default_settings, key))


class CrawlerTestCase(BaseCrawlerTest):
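    """Crawler construction: settings population, freezing, and deprecated attributes."""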

    def setUp(self):
        self.crawler = Crawler(DefaultSpider, Settings())

    def test_deprecated_attribute_spiders(self):
        with warnings.catch_warnings(record=True) as w:
            spiders = self.crawler.spiders
            self.assertEqual(len(w), 1)
            self.assertIn("Crawler.spiders", str(w[0].message))
            sl_cls = load_object(self.crawler.settings['SPIDER_LOADER_CLASS'])
            self.assertIsInstance(spiders, sl_cls)

            self.crawler.spiders
            is_one_warning = len(w) == 1
            if not is_one_warning:
                for warning in w:
                    print(warning)
            self.assertTrue(is_one_warning, "Warn deprecated access only once")

    def test_populate_spidercls_settings(self):
        spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
        project_settings = {'TEST1': 'project', 'TEST3': 'project'}

        class CustomSettingsSpider(DefaultSpider):
            custom_settings = spider_settings

        settings = Settings()
        settings.setdict(project_settings, priority='project')
        crawler = Crawler(CustomSettingsSpider, settings)

        self.assertEqual(crawler.settings.get('TEST1'), 'spider')
        self.assertEqual(crawler.settings.get('TEST2'), 'spider')
        self.assertEqual(crawler.settings.get('TEST3'), 'project')

        self.assertFalse(settings.frozen)
        self.assertTrue(crawler.settings.frozen)

    def test_crawler_accepts_dict(self):
        crawler = Crawler(DefaultSpider, {'foo': 'bar'})
        self.assertEqual(crawler.settings['foo'], 'bar')
        self.assertOptionIsDefault(crawler.settings, 'RETRY_ENABLED')

    def test_crawler_accepts_None(self):
        crawler = Crawler(DefaultSpider)
        self.assertOptionIsDefault(crawler.settings, 'RETRY_ENABLED')

    def test_crawler_rejects_spider_objects(self):
        with raises(ValueError):
            Crawler(DefaultSpider())


class SpiderSettingsTestCase(unittest.TestCase):
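    """A spider's custom_settings should take effect when its crawler is built."""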
    def test_spider_custom_settings(self):
        class MySpider(scrapy.Spider):
            name = 'spider'
            custom_settings = {
                'AUTOTHROTTLE_ENABLED': True
            }

        crawler = Crawler(MySpider, {})
        enabled_exts = [e.__class__ for e in crawler.extensions.middlewares]
        self.assertIn(AutoThrottle, enabled_exts)


class CrawlerLoggingTestCase(unittest.TestCase):
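    """Logging integration: root handler installation and per-spider LOG_* settings."""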
    def test_no_root_handler_installed(self):
        handler = get_scrapy_root_handler()
        if handler is not None:
            logging.root.removeHandler(handler)

        class MySpider(scrapy.Spider):
            name = 'spider'

        crawler = Crawler(MySpider, {})
        assert get_scrapy_root_handler() is None

    def test_spider_custom_settings_log_level(self):
        log_file = self.mktemp()

        class MySpider(scrapy.Spider):
            name = 'spider'
            custom_settings = {
                'LOG_LEVEL': 'INFO',
                'LOG_FILE': log_file,
                # disable telnet if not available to avoid an extra warning
                'TELNETCONSOLE_ENABLED': telnet.TWISTED_CONCH_AVAILABLE,
            }

        configure_logging()
        self.assertEqual(get_scrapy_root_handler().level, logging.DEBUG)
        crawler = Crawler(MySpider, {})
        self.assertEqual(get_scrapy_root_handler().level, logging.INFO)
        info_count = crawler.stats.get_value('log_count/INFO')
        logging.debug('debug message')
        logging.info('info message')
        logging.warning('warning message')
        logging.error('error message')

        with open(log_file, 'rb') as fo:
            logged = fo.read().decode('utf8')

        self.assertNotIn('debug message', logged)
        self.assertIn('info message', logged)
        self.assertIn('warning message', logged)
        self.assertIn('error message', logged)
        self.assertEqual(crawler.stats.get_value('log_count/ERROR'), 1)
        self.assertEqual(crawler.stats.get_value('log_count/WARNING'), 1)
        self.assertEqual(
            crawler.stats.get_value('log_count/INFO') - info_count, 1)
        self.assertEqual(crawler.stats.get_value('log_count/DEBUG', 0), 0)


class SpiderLoaderWithWrongInterface(object):
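    """Deliberately does not implement ISpiderLoader; used to trigger the interface check."""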

    def unneeded_method(self):
        pass


class CustomSpiderLoader(SpiderLoader):
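    """No-op SpiderLoader subclass (presumably referenced via SPIDER_LOADER_CLASS elsewhere in the suite)."""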
    pass


class CrawlerRunnerTestCase(BaseCrawlerTest):
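    """CrawlerRunner: loader interface validation, settings inputs, deprecated attributes."""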

    def test_spider_manager_verify_interface(self):
        settings = Settings({
            'SPIDER_LOADER_CLASS': 'tests.test_crawler.SpiderLoaderWithWrongInterface'
        })
        with warnings.catch_warnings(record=True) as w:
            self.assertRaises(AttributeError, CrawlerRunner, settings)
            self.assertEqual(len(w), 1)
            self.assertIn("SPIDER_LOADER_CLASS", str(w[0].message))
            self.assertIn("scrapy.interfaces.ISpiderLoader", str(w[0].message))

    def test_crawler_runner_accepts_dict(self):
        runner = CrawlerRunner({'foo': 'bar'})
        self.assertEqual(runner.settings['foo'], 'bar')
        self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')

    def test_crawler_runner_accepts_None(self):
        runner = CrawlerRunner()
        self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')

    def test_deprecated_attribute_spiders(self):
        with warnings.catch_warnings(record=True) as w:
            runner = CrawlerRunner(Settings())
            spiders = runner.spiders
            self.assertEqual(len(w), 1)
            self.assertIn("CrawlerRunner.spiders", str(w[0].message))
            self.assertIn("CrawlerRunner.spider_loader", str(w[0].message))
            sl_cls = load_object(runner.settings['SPIDER_LOADER_CLASS'])
            self.assertIsInstance(spiders, sl_cls)


class CrawlerProcessTest(BaseCrawlerTest):
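    """CrawlerProcess should accept the same settings inputs as CrawlerRunner."""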
    def test_crawler_process_accepts_dict(self):
        runner = CrawlerProcess({'foo': 'bar'})
        self.assertEqual(runner.settings['foo'], 'bar')
        self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')

    def test_crawler_process_accepts_None(self):
        runner = CrawlerProcess()
        self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')


class ExceptionSpider(scrapy.Spider):
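    """Spider whose from_crawler hook always raises, to simulate a bootstrap failure."""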
    name = 'exception'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        raise ValueError('Exception in from_crawler method')


class NoRequestsSpider(scrapy.Spider):
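    """Spider that schedules no requests, so its crawl finishes immediately."""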
    name = 'no_request'

    def start_requests(self):
        return []


class CrawlerRunnerHasSpider(unittest.TestCase):
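    """bootstrap_failed should reflect whether any crawl failed to start."""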

    @defer.inlineCallbacks
    def test_crawler_runner_bootstrap_successful(self):
        runner = CrawlerRunner()
        yield runner.crawl(NoRequestsSpider)
        self.assertEqual(runner.bootstrap_failed, False)

    @defer.inlineCallbacks
    def test_crawler_runner_bootstrap_successful_for_several(self):
        runner = CrawlerRunner()
        yield runner.crawl(NoRequestsSpider)
        yield runner.crawl(NoRequestsSpider)
        self.assertEqual(runner.bootstrap_failed, False)

    @defer.inlineCallbacks
    def test_crawler_runner_bootstrap_failed(self):
        runner = CrawlerRunner()

        try:
            yield runner.crawl(ExceptionSpider)
        except ValueError:
            pass
        else:
            self.fail('Exception should be raised from spider')

        self.assertEqual(runner.bootstrap_failed, True)

    @defer.inlineCallbacks
    def test_crawler_runner_bootstrap_failed_for_several(self):
        runner = CrawlerRunner()

        try:
            yield runner.crawl(ExceptionSpider)
        except ValueError:
            pass
        else:
            self.fail('Exception should be raised from spider')

        yield runner.crawl(NoRequestsSpider)

        self.assertEqual(runner.bootstrap_failed, True)