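"""Tests for the ``scrapy parse`` command."""
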
import os
from os.path import join, abspath

from twisted.internet import defer

from scrapy.utils.testsite import SiteTest
from scrapy.utils.testproc import ProcessTest
from scrapy.utils.python import to_unicode
from tests.test_commands import CommandTest


def _textmode(bstr):
    """Normalize input the same as writing to a file
    and reading from it in text mode"""
    return to_unicode(bstr).replace(os.linesep, '\n')


class ParseCommandTest(ProcessTest, SiteTest, CommandTest):
    command = 'parse'

    def setUp(self):
        super(ParseCommandTest, self).setUp()
        self.spider_name = 'parse_spider'
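        # Write a test spider module into the generated project's spiders package.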
        fname = abspath(join(self.proj_mod_path, 'spiders', 'myspider.py'))
        with open(fname, 'w') as f:
            f.write("""
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class MySpider(scrapy.Spider):
    name = '{0}'

    def parse(self, response):
        if getattr(self, 'test_arg', None):
            self.logger.debug('It Works!')
        return [scrapy.Item(), dict(foo='bar')]

    def parse_request_with_meta(self, response):
        foo = response.meta.get('foo', 'bar')

        if foo == 'bar':
            self.logger.debug('It Does Not Work :(')
        else:
            self.logger.debug('It Works!')

    def parse_request_with_cb_kwargs(self, response, foo=None, key=None):
        if foo == 'bar' and key == 'value':
            self.logger.debug('It Works!')
        else:
            self.logger.debug('It Does Not Work :(')

    def parse_request_without_meta(self, response):
        foo = response.meta.get('foo', 'bar')

        if foo == 'bar':
            self.logger.debug('It Works!')
        else:
            self.logger.debug('It Does Not Work :(')


class MyGoodCrawlSpider(CrawlSpider):
    name = 'goodcrawl{0}'

    rules = (
        Rule(LinkExtractor(allow=r'/html'), callback='parse_item', follow=True),
        Rule(LinkExtractor(allow=r'/text'), follow=True),
    )

    def parse_item(self, response):
        return [scrapy.Item(), dict(foo='bar')]

    def parse(self, response):
        return [scrapy.Item(), dict(nomatch='default')]


class MyBadCrawlSpider(CrawlSpider):
    '''Spider which doesn't define a parse_item callback while using it in a rule.'''
    name = 'badcrawl{0}'

    rules = (
        Rule(LinkExtractor(allow=r'/html'), callback='parse_item', follow=True),
    )

    def parse(self, response):
        return [scrapy.Item(), dict(foo='bar')]
""".format(self.spider_name))

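        # Add a minimal item pipeline so the --pipelines option has something to run.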
        fname = abspath(join(self.proj_mod_path, 'pipelines.py'))
        with open(fname, 'w') as f:
            f.write("""
import logging

class MyPipeline(object):
    component_name = 'my_pipeline'

    def process_item(self, item, spider):
        logging.info('It Works!')
        return item
""")

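        # Register the pipeline in the project settings.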
        fname = abspath(join(self.proj_mod_path, 'settings.py'))
        with open(fname, 'a') as f:
            f.write("""
ITEM_PIPELINES = {'%s.pipelines.MyPipeline': 1}
""" % self.project_name)

    @defer.inlineCallbacks
    def test_spider_arguments(self):
        _, _, stderr = yield self.execute(['--spider', self.spider_name,
                                           '-a', 'test_arg=1',
                                           '-c', 'parse',
                                           '--verbose',
                                           self.url('/html')])
        self.assertIn("DEBUG: It Works!", _textmode(stderr))

    @defer.inlineCallbacks
    def test_request_with_meta(self):
        raw_json_string = '{"foo" : "baz"}'
        _, _, stderr = yield self.execute(['--spider', self.spider_name,
                                           '--meta', raw_json_string,
                                           '-c', 'parse_request_with_meta',
                                           '--verbose',
                                           self.url('/html')])
        self.assertIn("DEBUG: It Works!", _textmode(stderr))

        _, _, stderr = yield self.execute(['--spider', self.spider_name,
                                           '-m', raw_json_string,
                                           '-c', 'parse_request_with_meta',
                                           '--verbose',
                                           self.url('/html')])
        self.assertIn("DEBUG: It Works!", _textmode(stderr))

    @defer.inlineCallbacks
    def test_request_with_cb_kwargs(self):
        raw_json_string = '{"foo" : "bar", "key": "value"}'
        _, _, stderr = yield self.execute(['--spider', self.spider_name,
                                           '--cbkwargs', raw_json_string,
                                           '-c', 'parse_request_with_cb_kwargs',
                                           '--verbose',
                                           self.url('/html')])
        self.assertIn("DEBUG: It Works!", _textmode(stderr))

    @defer.inlineCallbacks
    def test_request_without_meta(self):
        _, _, stderr = yield self.execute(['--spider', self.spider_name,
                                           '-c', 'parse_request_without_meta',
                                           '--nolinks',
                                           self.url('/html')])
        self.assertIn("DEBUG: It Works!", _textmode(stderr))

    @defer.inlineCallbacks
    def test_pipelines(self):
        _, _, stderr = yield self.execute(['--spider', self.spider_name,
                                           '--pipelines',
                                           '-c', 'parse',
                                           '--verbose',
                                           self.url('/html')])
        self.assertIn("INFO: It Works!", _textmode(stderr))

    @defer.inlineCallbacks
    def test_parse_items(self):
        status, out, stderr = yield self.execute(
            ['--spider', self.spider_name, '-c', 'parse', self.url('/html')]
        )
        self.assertIn("""[{}, {'foo': 'bar'}]""", _textmode(out))

    @defer.inlineCallbacks
    def test_parse_items_no_callback_passed(self):
        status, out, stderr = yield self.execute(
            ['--spider', self.spider_name, self.url('/html')]
        )
        self.assertIn("""[{}, {'foo': 'bar'}]""", _textmode(out))

    @defer.inlineCallbacks
    def test_wrong_callback_passed(self):
        status, out, stderr = yield self.execute(
            ['--spider', self.spider_name, '-c', 'dummy', self.url('/html')]
        )
        self.assertRegex(_textmode(out), r"""# Scraped Items -+\n\[\]""")
        self.assertIn("""Cannot find callback""", _textmode(stderr))

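    # The tests below use -r/--rules, which resolves the callback from the
    # spider's CrawlSpider rules instead of an explicit -c callback.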
    @defer.inlineCallbacks
    def test_crawlspider_matching_rule_callback_set(self):
        """If a rule matches the URL, use its defined callback."""
        status, out, stderr = yield self.execute(
            ['--spider', 'goodcrawl' + self.spider_name, '-r', self.url('/html')]
        )
        self.assertIn("""[{}, {'foo': 'bar'}]""", _textmode(out))

    @defer.inlineCallbacks
    def test_crawlspider_matching_rule_default_callback(self):
        """If a rule matches but has no callback set, use the 'parse' callback."""
        status, out, stderr = yield self.execute(
            ['--spider', 'goodcrawl' + self.spider_name, '-r', self.url('/text')]
        )
        self.assertIn("""[{}, {'nomatch': 'default'}]""", _textmode(out))

    @defer.inlineCallbacks
    def test_spider_with_no_rules_attribute(self):
        """Using -r with a spider that has no rules should not produce items."""
        status, out, stderr = yield self.execute(
            ['--spider', self.spider_name, '-r', self.url('/html')]
        )
        self.assertRegex(_textmode(out), r"""# Scraped Items -+\n\[\]""")
        self.assertIn("""No CrawlSpider rules found""", _textmode(stderr))

    @defer.inlineCallbacks
    def test_crawlspider_missing_callback(self):
        status, out, stderr = yield self.execute(
            ['--spider', 'badcrawl' + self.spider_name, '-r', self.url('/html')]
        )
        self.assertRegex(_textmode(out), r"""# Scraped Items -+\n\[\]""")

    @defer.inlineCallbacks
    def test_crawlspider_no_matching_rule(self):
        """The requested URL has no matching rule, so no items should be scraped."""
        status, out, stderr = yield self.execute(
            ['--spider', 'badcrawl' + self.spider_name, '-r', self.url('/enc-gb18030')]
        )
        self.assertRegex(_textmode(out), r"""# Scraped Items -+\n\[\]""")
        self.assertIn("""Cannot find a rule that matches""", _textmode(stderr))