# Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-23 23:23:44 +00:00)
# File: scrapy/tests/test_command_parse.py (157 lines, 5.7 KiB, Python)
from os.path import join, abspath
from twisted.trial import unittest
from twisted.internet import defer
from scrapy.utils.testsite import SiteTest
from scrapy.utils.testproc import ProcessTest
from scrapy.utils.python import to_native_str
from tests.test_commands import CommandTest
class ParseCommandTest(ProcessTest, SiteTest, CommandTest):
    """Functional tests for the ``scrapy parse`` command.

    ``setUp`` writes a throwaway project on disk (one spider module, one
    pipeline, and an ITEM_PIPELINES setting), then each test invokes the
    ``parse`` command as a subprocess via ``ProcessTest.execute`` and
    inspects its stdout/stderr.
    """

    command = 'parse'

    def setUp(self):
        super(ParseCommandTest, self).setUp()
        self.spider_name = 'parse_spider'
        # Generate the spiders module used by every test.  The {0}
        # placeholder embeds self.spider_name into the three spider names.
        fname = abspath(join(self.proj_mod_path, 'spiders', 'myspider.py'))
        with open(fname, 'w') as f:
            f.write("""
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class MySpider(scrapy.Spider):
    name = '{0}'

    def parse(self, response):
        if getattr(self, 'test_arg', None):
            self.logger.debug('It Works!')
        return [scrapy.Item(), dict(foo='bar')]


class MyGoodCrawlSpider(CrawlSpider):
    name = 'goodcrawl{0}'

    rules = (
        Rule(LinkExtractor(allow=r'/html'), callback='parse_item', follow=True),
        Rule(LinkExtractor(allow=r'/text'), follow=True),
    )

    def parse_item(self, response):
        return [scrapy.Item(), dict(foo='bar')]

    def parse(self, response):
        return [scrapy.Item(), dict(nomatch='default')]


class MyBadCrawlSpider(CrawlSpider):
    '''Spider which doesn't define a parse_item callback while using it in a rule.'''
    name = 'badcrawl{0}'

    rules = (
        Rule(LinkExtractor(allow=r'/html'), callback='parse_item', follow=True),
    )

    def parse(self, response):
        return [scrapy.Item(), dict(foo='bar')]
""".format(self.spider_name))

        # A pipeline that logs at INFO level so tests can detect it ran.
        fname = abspath(join(self.proj_mod_path, 'pipelines.py'))
        with open(fname, 'w') as f:
            f.write("""
import logging


class MyPipeline(object):
    component_name = 'my_pipeline'

    def process_item(self, item, spider):
        logging.info('It Works!')
        return item
""")

        # Append (not overwrite) to the generated project settings.
        fname = abspath(join(self.proj_mod_path, 'settings.py'))
        with open(fname, 'a') as f:
            f.write("""
ITEM_PIPELINES = {'%s.pipelines.MyPipeline': 1}
""" % self.project_name)

    @defer.inlineCallbacks
    def test_spider_arguments(self):
        # -a arguments must reach the spider: MySpider only logs when
        # test_arg is set.
        _, _, stderr = yield self.execute(['--spider', self.spider_name,
                                           '-a', 'test_arg=1',
                                           '-c', 'parse',
                                           self.url('/html')])
        self.assertIn("DEBUG: It Works!", to_native_str(stderr))

    @defer.inlineCallbacks
    def test_pipelines(self):
        # --pipelines enables item pipelines during the parse run.
        _, _, stderr = yield self.execute(['--spider', self.spider_name,
                                           '--pipelines',
                                           '-c', 'parse',
                                           self.url('/html')])
        self.assertIn("INFO: It Works!", to_native_str(stderr))

    @defer.inlineCallbacks
    def test_parse_items(self):
        status, out, stderr = yield self.execute(
            ['--spider', self.spider_name, '-c', 'parse', self.url('/html')]
        )
        self.assertIn("""[{}, {'foo': 'bar'}]""", to_native_str(out))

    @defer.inlineCallbacks
    def test_parse_items_no_callback_passed(self):
        # Without -c the command falls back to the spider's 'parse' method.
        status, out, stderr = yield self.execute(
            ['--spider', self.spider_name, self.url('/html')]
        )
        self.assertIn("""[{}, {'foo': 'bar'}]""", to_native_str(out))

    @defer.inlineCallbacks
    def test_wrong_callback_passed(self):
        status, out, stderr = yield self.execute(
            ['--spider', self.spider_name, '-c', 'dummy', self.url('/html')]
        )
        # Raw string: '\[' is a regex escape, not a Python string escape.
        self.assertRegexpMatches(to_native_str(out), r"""# Scraped Items -+\n\[\]""")
        self.assertIn("""Cannot find callback""", to_native_str(stderr))

    @defer.inlineCallbacks
    def test_crawlspider_matching_rule_callback_set(self):
        """If a rule matches the URL, use its defined callback."""
        status, out, stderr = yield self.execute(
            ['--spider', 'goodcrawl' + self.spider_name, '-r', self.url('/html')]
        )
        self.assertIn("""[{}, {'foo': 'bar'}]""", to_native_str(out))

    @defer.inlineCallbacks
    def test_crawlspider_matching_rule_default_callback(self):
        """If a rule matches but has no callback set, use the 'parse' callback."""
        status, out, stderr = yield self.execute(
            ['--spider', 'goodcrawl' + self.spider_name, '-r', self.url('/text')]
        )
        self.assertIn("""[{}, {'nomatch': 'default'}]""", to_native_str(out))

    @defer.inlineCallbacks
    def test_spider_with_no_rules_attribute(self):
        """Using -r with a spider with no rule should not produce items."""
        status, out, stderr = yield self.execute(
            ['--spider', self.spider_name, '-r', self.url('/html')]
        )
        self.assertRegexpMatches(to_native_str(out), r"""# Scraped Items -+\n\[\]""")
        self.assertIn("""No CrawlSpider rules found""", to_native_str(stderr))

    @defer.inlineCallbacks
    def test_crawlspider_missing_callback(self):
        # The rule names a callback the spider doesn't define: no items.
        status, out, stderr = yield self.execute(
            ['--spider', 'badcrawl' + self.spider_name, '-r', self.url('/html')]
        )
        self.assertRegexpMatches(to_native_str(out), r"""# Scraped Items -+\n\[\]""")

    @defer.inlineCallbacks
    def test_crawlspider_no_matching_rule(self):
        """The requested URL has no matching rule, so no items should be scraped."""
        status, out, stderr = yield self.execute(
            ['--spider', 'badcrawl' + self.spider_name, '-r', self.url('/enc-gb18030')]
        )
        self.assertRegexpMatches(to_native_str(out), r"""# Scraped Items -+\n\[\]""")
        self.assertIn("""Cannot find a rule that matches""", to_native_str(stderr))