2010-05-26 11:58:31 -03:00
|
|
|
import sys
|
|
|
|
import os
|
|
|
|
import shutil
|
2010-04-05 11:27:19 -03:00
|
|
|
|
2010-09-03 14:29:27 -03:00
|
|
|
from zope.interface.verify import verifyObject
|
2010-05-26 11:58:31 -03:00
|
|
|
from twisted.trial import unittest
|
|
|
|
|
2010-09-03 14:29:27 -03:00
|
|
|
|
2015-05-09 04:20:09 -03:00
|
|
|
# ugly hack to avoid cyclic imports of scrapy.spiders when running this test
|
2010-04-05 11:27:19 -03:00
|
|
|
# alone
|
2015-10-06 17:23:47 +05:00
|
|
|
import scrapy
|
2016-12-01 12:52:52 -03:00
|
|
|
import tempfile
|
2015-04-16 20:07:53 +05:00
|
|
|
from scrapy.interfaces import ISpiderLoader
|
|
|
|
from scrapy.spiderloader import SpiderLoader
|
2014-07-17 10:25:07 -03:00
|
|
|
from scrapy.settings import Settings
|
2010-04-05 11:27:19 -03:00
|
|
|
from scrapy.http import Request
|
2015-10-06 17:23:47 +05:00
|
|
|
from scrapy.crawler import CrawlerRunner
|
2010-04-05 11:27:19 -03:00
|
|
|
|
2010-05-26 11:58:31 -03:00
|
|
|
# Absolute path of the directory containing this test module; used by
# setUp() below to locate the bundled 'test_spiders' fixture package.
module_dir = os.path.dirname(os.path.abspath(__file__))
|
|
|
|
|
2015-04-16 20:07:53 +05:00
|
|
|
|
|
|
|
class SpiderLoaderTest(unittest.TestCase):
    """Tests for SpiderLoader: spider discovery via SPIDER_MODULES,
    loading by name, and lookup of spiders that can handle a Request.
    """

    def setUp(self):
        # Copy the fixture spiders into a fresh temp dir under a unique
        # module name ('test_spiders_xxx') so importing them cannot clash
        # with the real 'tests.test_spiderloader.test_spiders' package.
        orig_spiders_dir = os.path.join(module_dir, 'test_spiders')
        self.tmpdir = tempfile.mkdtemp()
        self.spiders_dir = os.path.join(self.tmpdir, 'test_spiders_xxx')
        shutil.copytree(orig_spiders_dir, self.spiders_dir)
        sys.path.append(self.tmpdir)
        settings = Settings({'SPIDER_MODULES': ['test_spiders_xxx']})
        self.spider_loader = SpiderLoader.from_settings(settings)

    def tearDown(self):
        # Undo everything setUp did: drop the loader, evict the imported
        # module so the next test re-imports a fresh copy, restore
        # sys.path, and remove the temp dir (the original version leaked
        # one directory tree per test run).
        del self.spider_loader
        del sys.modules['test_spiders_xxx']
        sys.path.remove(self.tmpdir)
        shutil.rmtree(self.tmpdir, ignore_errors=True)

    def test_interface(self):
        # The loader built by setUp must satisfy the ISpiderLoader contract.
        verifyObject(ISpiderLoader, self.spider_loader)

    def test_list(self):
        # list() returns the names of all spiders found in SPIDER_MODULES.
        self.assertEqual(set(self.spider_loader.list()),
                         {'spider1', 'spider2', 'spider3', 'spider4'})

    def test_load(self):
        # load() resolves a spider name to its class object.
        spider1 = self.spider_loader.load("spider1")
        self.assertEqual(spider1.__name__, 'Spider1')

    def test_find_by_request(self):
        # find_by_request() returns the names of every spider whose
        # allowed domains match the request URL.
        self.assertEqual(self.spider_loader.find_by_request(Request('http://scrapy1.org/test')),
                         ['spider1'])
        self.assertEqual(self.spider_loader.find_by_request(Request('http://scrapy2.org/test')),
                         ['spider2'])
        self.assertEqual(set(self.spider_loader.find_by_request(Request('http://scrapy3.org/test'))),
                         {'spider1', 'spider2'})
        self.assertEqual(self.spider_loader.find_by_request(Request('http://scrapy999.org/test')),
                         [])
        self.assertEqual(self.spider_loader.find_by_request(Request('http://spider3.com')),
                         [])
        self.assertEqual(self.spider_loader.find_by_request(Request('http://spider3.com/onlythis')),
                         ['spider3'])

    def test_load_spider_module(self):
        # A single spider module (not a package) can be given directly.
        module = 'tests.test_spiderloader.test_spiders.spider1'
        settings = Settings({'SPIDER_MODULES': [module]})
        self.spider_loader = SpiderLoader.from_settings(settings)
        assert len(self.spider_loader._spiders) == 1

    def test_load_spider_module_multiple(self):
        # SPIDER_MODULES also accepts a comma-separated string of modules.
        prefix = 'tests.test_spiderloader.test_spiders.'
        module = ','.join(prefix + s for s in ('spider1', 'spider2'))
        settings = Settings({'SPIDER_MODULES': module})
        self.spider_loader = SpiderLoader.from_settings(settings)
        assert len(self.spider_loader._spiders) == 2

    def test_load_base_spider(self):
        # A module exporting only the base Spider class yields no spiders.
        module = 'tests.test_spiderloader.test_spiders.spider0'
        settings = Settings({'SPIDER_MODULES': [module]})
        self.spider_loader = SpiderLoader.from_settings(settings)
        assert len(self.spider_loader._spiders) == 0

    def test_crawler_runner_loading(self):
        # CrawlerRunner uses the spider loader under the hood: unknown
        # names raise KeyError, known names yield a crawler whose
        # spidercls is the loaded spider class.
        module = 'tests.test_spiderloader.test_spiders.spider1'
        runner = CrawlerRunner({'SPIDER_MODULES': [module]})

        # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias.
        self.assertRaisesRegex(KeyError, 'Spider not found',
                               runner.create_crawler, 'spider2')

        crawler = runner.create_crawler('spider1')
        self.assertTrue(issubclass(crawler.spidercls, scrapy.Spider))
        self.assertEqual(crawler.spidercls.name, 'spider1')