2009-08-28 14:42:07 -03:00
|
|
|
import os
|
2012-08-09 16:55:05 -03:00
|
|
|
import sys
|
2009-08-28 14:42:07 -03:00
|
|
|
import subprocess
|
2014-07-14 18:47:22 +08:00
|
|
|
import tempfile
|
2012-08-09 16:55:05 -03:00
|
|
|
from time import sleep
|
2011-07-14 01:40:31 -03:00
|
|
|
from os.path import exists, join, abspath
|
2015-10-31 16:19:11 +01:00
|
|
|
from shutil import rmtree, copytree
|
2009-08-28 14:42:07 -03:00
|
|
|
from tempfile import mkdtemp
|
2016-02-17 23:03:12 +05:00
|
|
|
from contextlib import contextmanager
|
2010-08-19 02:30:15 -03:00
|
|
|
|
|
|
|
from twisted.trial import unittest
|
2015-01-19 10:28:25 -03:00
|
|
|
from twisted.internet import defer
|
2009-08-28 14:42:07 -03:00
|
|
|
|
2015-10-31 16:19:11 +01:00
|
|
|
import scrapy
|
2015-10-12 17:59:42 +05:00
|
|
|
from scrapy.utils.python import to_native_str
|
2012-08-29 11:44:00 -03:00
|
|
|
from scrapy.utils.python import retry_on_eintr
|
2013-05-06 17:35:09 -03:00
|
|
|
from scrapy.utils.test import get_testenv
|
2015-01-19 10:28:25 -03:00
|
|
|
from scrapy.utils.testsite import SiteTest
|
|
|
|
from scrapy.utils.testproc import ProcessTest
|
|
|
|
|
2009-08-28 14:42:07 -03:00
|
|
|
|
2009-08-28 17:47:45 -03:00
|
|
|
class ProjectTest(unittest.TestCase):
    """Base fixture for tests that run ``scrapy`` as a subprocess.

    ``setUp`` creates a fresh temporary directory that is used as the working
    directory for the commands; ``tearDown`` removes it again.  Commands are
    launched as ``<python> -m scrapy.cmdline <args>`` with the environment
    returned by ``get_testenv()``.
    """

    # Name passed to ``startproject`` and used to derive the project paths.
    project_name = 'testproject'

    def setUp(self):
        """Create the temporary workspace and derive the paths tests rely on."""
        self.temp_path = mkdtemp()
        # Subclasses may repoint ``cwd`` (e.g. into the generated project).
        self.cwd = self.temp_path
        self.proj_path = join(self.temp_path, self.project_name)
        self.proj_mod_path = join(self.proj_path, self.project_name)
        self.env = get_testenv()

    def tearDown(self):
        """Delete the temporary workspace created in ``setUp``."""
        rmtree(self.temp_path)

    def call(self, *new_args, **kwargs):
        """Run a scrapy command to completion and return its exit status.

        stdout and stderr are redirected to a throw-away temporary file, so
        the command output is discarded.  Extra keyword arguments are passed
        through to ``subprocess.call``.
        """
        with tempfile.TemporaryFile() as out:
            args = (sys.executable, '-m', 'scrapy.cmdline') + new_args
            return subprocess.call(args, stdout=out, stderr=out, cwd=self.cwd,
                                   env=self.env, **kwargs)

    def proc(self, *new_args, **popen_kwargs):
        """Run a scrapy command and return the finished ``Popen`` object.

        stdout and stderr are captured through pipes and left unread so that
        callers can read ``p.stdout`` / ``p.stderr`` afterwards.  The process
        is polled every 0.2 seconds; if it is still running after roughly
        15 seconds it is killed and the test fails.

        NOTE: the pipes are not drained while polling, so a command that
        writes more than the OS pipe buffer holds will block until the
        timeout triggers.
        """
        args = (sys.executable, '-m', 'scrapy.cmdline') + new_args
        p = subprocess.Popen(args, cwd=self.cwd, env=self.env,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                             **popen_kwargs)

        waited = 0
        interval = 0.2
        while p.poll() is None:
            sleep(interval)
            waited += interval
            if waited > 15:
                p.kill()
                # Reap the killed child so it does not linger as a zombie.
                p.wait()
                # Use fail() rather than ``assert False``: assert statements
                # are removed under ``python -O`` and the timeout would then
                # go unnoticed.
                self.fail('Command took too much time to complete')

        return p
|
2010-08-19 02:30:15 -03:00
|
|
|
|
2009-08-28 14:42:07 -03:00
|
|
|
|
2009-08-28 17:58:50 -03:00
|
|
|
class StartprojectTest(ProjectTest):
    """Checks for ``scrapy startproject`` exit codes and generated files."""

    def test_startproject(self):
        """Create a project in the working directory and verify its layout."""
        self.assertEqual(0, self.call('startproject', self.project_name))

        self.assertTrue(exists(join(self.proj_path, 'scrapy.cfg')))
        self.assertTrue(exists(join(self.proj_path, 'testproject')))
        for fname in ('__init__.py', 'items.py', 'pipelines.py', 'settings.py'):
            self.assertTrue(exists(join(self.proj_mod_path, fname)))
        self.assertTrue(exists(join(self.proj_mod_path, 'spiders', '__init__.py')))

        # Re-using the name, an invalid name and a name that clashes with an
        # importable module are all expected to exit with status 1.
        self.assertEqual(1, self.call('startproject', self.project_name))
        self.assertEqual(1, self.call('startproject', 'wrong---project---name'))
        self.assertEqual(1, self.call('startproject', 'sys'))

    def test_startproject_with_project_dir(self):
        """Create a project inside an explicitly given target directory."""
        project_dir = mkdtemp()
        # Both directories live outside ``self.temp_path``, so tearDown never
        # removes them; register explicit cleanups to avoid leaking them.
        self.addCleanup(rmtree, project_dir, ignore_errors=True)
        self.addCleanup(rmtree, project_dir + '2', ignore_errors=True)

        self.assertEqual(0, self.call('startproject', self.project_name, project_dir))

        root = abspath(project_dir)
        mod_path = join(root, self.project_name)
        self.assertTrue(exists(join(root, 'scrapy.cfg')))
        self.assertTrue(exists(join(root, 'testproject')))
        for fname in ('__init__.py', 'items.py', 'pipelines.py', 'settings.py'):
            self.assertTrue(exists(join(mod_path, fname)))
        self.assertTrue(exists(join(mod_path, 'spiders', '__init__.py')))

        # Same project name into a different (not yet existing) directory.
        self.assertEqual(0, self.call('startproject', self.project_name, project_dir + '2'))

        # Target directory already holds a project: expected exit status 1.
        self.assertEqual(1, self.call('startproject', self.project_name, project_dir))
        self.assertEqual(1, self.call('startproject', self.project_name + '2', project_dir))
        self.assertEqual(1, self.call('startproject', 'wrong---project---name'))
        self.assertEqual(1, self.call('startproject', 'sys'))
        # Wrong number of arguments: expected exit status 2.
        self.assertEqual(2, self.call('startproject'))
        self.assertEqual(2, self.call('startproject', self.project_name, project_dir, 'another_params'))
|
2016-05-23 23:15:53 -03:00
|
|
|
|
2015-10-31 16:19:11 +01:00
|
|
|
|
|
|
|
class StartprojectTemplatesTest(ProjectTest):
    """Checks ``startproject`` against a custom ``TEMPLATES_DIR``."""

    def setUp(self):
        """Extend the base fixture with paths for a private templates tree."""
        super(StartprojectTemplatesTest, self).setUp()
        templates_root = join(self.temp_path, 'templates')
        self.tmpl = templates_root
        self.tmpl_proj = join(templates_root, 'project')

    def test_startproject_template_override(self):
        """A file added to the copied project template must reach the new project."""
        bundled_templates = join(scrapy.__path__[0], 'templates')
        copytree(bundled_templates, self.tmpl)

        # Drop an empty marker file into the copied project template.
        marker = join(self.tmpl_proj, 'root_template')
        with open(marker, 'w'):
            pass
        assert exists(marker)

        override = ['--set', 'TEMPLATES_DIR=%s' % self.tmpl]
        p = self.proc('startproject', self.project_name, *override)
        raw_out = retry_on_eintr(p.stdout.read)
        out = to_native_str(raw_out)

        expected = "New Scrapy project %r, using template directory" % self.project_name
        self.assertIn(expected, out)
        self.assertIn(self.tmpl_proj, out)
        assert exists(join(self.proj_path, 'root_template'))
|
2009-08-28 14:42:07 -03:00
|
|
|
|
|
|
|
|
|
|
|
class CommandTest(ProjectTest):
    """Base fixture for commands that are run from inside a generated project."""

    def setUp(self):
        """Generate the test project, then run later commands from within it."""
        super(CommandTest, self).setUp()
        name = self.project_name
        self.call('startproject', name)
        # Later commands use the project directory as their working directory.
        self.cwd = join(self.temp_path, name)
        settings_module = '%s.settings' % name
        self.env['SCRAPY_SETTINGS_MODULE'] = settings_module
|
|
|
|
|
|
|
|
|
2009-08-31 08:58:29 -03:00
|
|
|
class GenspiderCommandTest(CommandTest):
    """Checks for ``scrapy genspider`` run inside a project."""

    def _genspider_output(self, *cmd_args):
        # Run ``genspider`` with the given arguments and return its decoded stdout.
        p = self.proc('genspider', *cmd_args)
        return to_native_str(retry_on_eintr(p.stdout.read))

    def test_arguments(self):
        """Spider file is only created when both name and domain are given."""
        spider_file = join(self.proj_mod_path, 'spiders', 'test_name.py')

        # only pass one argument. spider script shouldn't be created
        status = self.call('genspider', 'test_name')
        self.assertEqual(2, status)
        assert not exists(spider_file)

        # pass two arguments <name> <domain>. spider script should be created
        status = self.call('genspider', 'test_name', 'test.com')
        self.assertEqual(0, status)
        assert exists(spider_file)

    def test_template(self, tplname='crawl'):
        """Generate a spider from ``tplname``; a second attempt reports it exists."""
        if tplname:
            args = ['--template=%s' % tplname]
        else:
            args = []
        spname = 'test_spider'

        first_out = self._genspider_output(spname, 'test.com', *args)
        created_msg = "Created spider %r using template %r in module" % (spname, tplname)
        self.assertIn(created_msg, first_out)
        self.assertTrue(exists(join(self.proj_mod_path, 'spiders', 'test_spider.py')))

        # Same command again: the spider is already there.
        second_out = self._genspider_output(spname, 'test.com', *args)
        self.assertIn("Spider %r already exists in module" % spname, second_out)

    def test_template_basic(self):
        """Run the template check with the ``basic`` template."""
        self.test_template('basic')

    def test_template_csvfeed(self):
        """Run the template check with the ``csvfeed`` template."""
        self.test_template('csvfeed')

    def test_template_xmlfeed(self):
        """Run the template check with the ``xmlfeed`` template."""
        self.test_template('xmlfeed')

    def test_list(self):
        """``genspider --list`` exits with status 0."""
        status = self.call('genspider', '--list')
        self.assertEqual(0, status)

    def test_dump(self):
        """Both spellings of the dump option exit with status 0."""
        long_form = self.call('genspider', '--dump=basic')
        self.assertEqual(0, long_form)
        short_form = self.call('genspider', '-d', 'basic')
        self.assertEqual(0, short_form)

    def test_same_name_as_project(self):
        """Calling genspider with only the project name exits 2 and writes no file."""
        status = self.call('genspider', self.project_name)
        self.assertEqual(2, status)
        unwanted = join(self.proj_mod_path, 'spiders', '%s.py' % self.project_name)
        assert not exists(unwanted)
|
|
|
|
|
2009-08-28 14:42:07 -03:00
|
|
|
|
2016-06-11 20:44:08 -03:00
|
|
|
class GenspiderStandaloneCommandTest(ProjectTest):
    """Checks ``scrapy genspider`` when it is run outside of any project."""

    def test_generate_standalone_spider(self):
        """The spider module is written straight into the working directory."""
        self.call('genspider', 'example', 'example.com')
        spider_file = join(self.temp_path, 'example.py')
        assert exists(spider_file)
|
|
|
|
|
|
|
|
|
2009-08-31 08:58:29 -03:00
|
|
|
class MiscCommandsTest(CommandTest):
    """Smoke tests for commands that have no dedicated test class."""

    def test_list(self):
        """``scrapy list`` exits with status 0 inside a project."""
        exit_status = self.call('list')
        self.assertEqual(0, exit_status)
|
2009-08-28 14:42:07 -03:00
|
|
|
|
2015-03-18 07:26:56 +05:00
|
|
|
|
2010-08-19 02:30:15 -03:00
|
|
|
class RunSpiderCommandTest(CommandTest):
    """Checks for ``scrapy runspider`` using spider files written on the fly."""

    # Source of a spider that logs one DEBUG message and yields no requests.
    debug_log_spider = """
import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'

    def start_requests(self):
        self.logger.debug("It Works!")
        return []
"""

    @contextmanager
    def _create_file(self, content, name):
        """Write ``content`` to a file called ``name`` in a new directory.

        Yields the absolute path of the file and removes the whole directory
        when the ``with`` block is left.
        """
        tmpdir = self.mktemp()
        os.mkdir(tmpdir)
        # Writing happens inside the try block so that the directory is
        # removed even when creating the file fails.
        try:
            fname = abspath(join(tmpdir, name))
            with open(fname, 'w') as f:
                f.write(content)
            yield fname
        finally:
            rmtree(tmpdir)

    def runspider(self, code, name='myspider.py', args=()):
        """Save ``code`` as ``name`` and run ``scrapy runspider`` on it.

        ``args`` are appended to the command line.  Returns the finished
        process object produced by ``self.proc``.
        """
        with self._create_file(code, name) as fname:
            return self.proc('runspider', fname, *args)

    def get_log(self, code, name='myspider.py', args=()):
        """Run the spider source and return what it wrote to stderr as text."""
        p = self.runspider(code, name=name, args=args)
        return to_native_str(p.stderr.read())

    def test_runspider(self):
        """A working spider logs its debug line and the open/close messages."""
        log = self.get_log(self.debug_log_spider)
        self.assertIn("DEBUG: It Works!", log)
        self.assertIn("INFO: Spider opened", log)
        self.assertIn("INFO: Closing spider (finished)", log)
        self.assertIn("INFO: Spider closed (finished)", log)

    def test_runspider_log_level(self):
        """With ``LOG_LEVEL=INFO`` the DEBUG line is absent but INFO lines remain."""
        log = self.get_log(self.debug_log_spider,
                           args=('-s', 'LOG_LEVEL=INFO'))
        self.assertNotIn("DEBUG: It Works!", log)
        self.assertIn("INFO: Spider opened", log)

    def test_runspider_no_spider_found(self):
        """A module that defines no spider class is reported as such."""
        log = self.get_log("from scrapy.spiders import Spider\n")
        self.assertIn("No spider found in file", log)

    def test_runspider_file_not_found(self):
        """A path that does not exist is reported in the log."""
        p = self.proc('runspider', 'some_non_existent_file')
        log = to_native_str(p.stderr.read())
        self.assertIn("File not found: some_non_existent_file", log)

    def test_runspider_unable_to_load(self):
        """A file with a ``.txt`` name is reported as unable to load."""
        log = self.get_log('', name='myspider.txt')
        self.assertIn('Unable to load', log)

    def test_start_requests_errors(self):
        """An exception raised in ``start_requests`` shows up in the log."""
        log = self.get_log("""
import scrapy

class BadSpider(scrapy.Spider):
    name = "bad"
    def start_requests(self):
        raise Exception("oops!")
""", name="badspider.py")
        # The log is expected to name both the failing method and the file.
        self.assertIn("start_requests", log)
        self.assertIn("badspider.py", log)
|
|
|
|
|
2016-12-09 02:14:12 +05:00
|
|
|
|
2013-05-28 12:42:50 -03:00
|
|
|
class BenchCommandTest(CommandTest):
    """Smoke test for the ``scrapy bench`` command."""

    def test_run(self):
        """A short benchmark run logs crawl stats and no unhandled errors."""
        # Very small intervals keep the benchmark run short.
        overrides = (
            '-s', 'LOGSTATS_INTERVAL=0.001',
            '-s', 'CLOSESPIDER_TIMEOUT=0.01',
        )
        p = self.proc('bench', *overrides)
        raw_log = p.stderr.read()
        log = to_native_str(raw_log)
        self.assertIn('INFO: Crawled', log)
        self.assertNotIn('Unhandled Error', log)
|