Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-23 16:03:56 +00:00)

Allow spiders to return dicts. See GH-1064.

This commit is contained in:
Mikhail Korobov 2015-03-18 07:26:56 +05:00
parent da90449edf
commit 39635e5f55
14 changed files with 310 additions and 167 deletions

View File

@@ -197,12 +197,17 @@ BaseItemExporter
Some exporters (like :class:`CsvItemExporter`) respect the order of the
fields defined in this attribute.
Some exporters may require fields_to_export list in order to export the
data properly when spiders return dicts (not :class:`~Item` instances).
.. attribute:: export_empty_fields
Whether to include empty/unpopulated item fields in the exported data.
Defaults to ``False``. Some exporters (like :class:`CsvItemExporter`)
ignore this attribute and always export all empty fields.
This option is ignored for dict items.
.. attribute:: encoding
The encoding that will be used to encode unicode values. This only

View File

@@ -107,7 +107,7 @@ class Command(ScrapyCommand):
items, requests = [], []
for x in iterate_spider_output(cb(response)):
if isinstance(x, BaseItem):
if isinstance(x, (BaseItem, dict)):
items.append(x)
elif isinstance(x, Request):
requests.append(x)

View File

@@ -35,8 +35,8 @@ class ReturnsContract(Contract):
objects = {
'request': Request,
'requests': Request,
'item': BaseItem,
'items': BaseItem,
'item': (BaseItem, dict),
'items': (BaseItem, dict),
}
def __init__(self, *args, **kwargs):
@@ -83,7 +83,7 @@ class ScrapesContract(Contract):
def post_process(self, output):
for x in output:
if isinstance(x, BaseItem):
if isinstance(x, (BaseItem, dict)):
for arg in self.args:
if not arg in x:
raise ContractFail("'%s' field is missing" % arg)

View File

@@ -9,6 +9,7 @@ import marshal
import six
from six.moves import cPickle as pickle
from xml.sax.saxutils import XMLGenerator
from scrapy.utils.serialize import ScrapyJSONEncoder
from scrapy.item import BaseItem
@ -50,13 +51,13 @@ class BaseItemExporter(object):
return value.encode(self.encoding) if isinstance(value, unicode) else value
def _get_serialized_fields(self, item, default_value=None, include_empty=None):
"""Return the fields to export as an iterable of tuples (name,
serialized_value)
"""Return the fields to export as an iterable of tuples
(name, serialized_value)
"""
if include_empty is None:
include_empty = self.export_empty_fields
if self.fields_to_export is None:
if include_empty:
if include_empty and not isinstance(item, dict):
field_iter = six.iterkeys(item.fields)
else:
field_iter = six.iterkeys(item)
@ -64,12 +65,11 @@ class BaseItemExporter(object):
if include_empty:
field_iter = self.fields_to_export
else:
nonempty_fields = set(item.keys())
field_iter = (x for x in self.fields_to_export if x in
nonempty_fields)
field_iter = (x for x in self.fields_to_export if x in item)
for field_name in field_iter:
if field_name in item:
field = item.fields[field_name]
field = {} if isinstance(item, dict) else item.fields[field_name]
value = self.serialize_field(field, field_name, item[field_name])
else:
value = default_value
@ -191,7 +191,12 @@ class CsvItemExporter(BaseItemExporter):
def _write_headers_and_set_fields_to_export(self, item):
if self.include_headers_line:
if not self.fields_to_export:
self.fields_to_export = item.fields.keys()
if isinstance(item, dict):
# for dicts try using fields of the first item
self.fields_to_export = list(item.keys())
else:
# use fields declared in Item
self.fields_to_export = list(item.fields.keys())
self.csv_writer.writerow(self.fields_to_export)

View File

@@ -267,7 +267,7 @@ class FilesPipeline(MediaPipeline):
return checksum
def item_completed(self, results, item, info):
if self.FILES_RESULT_FIELD in item.fields:
if isinstance(item, dict) or self.FILES_RESULT_FIELD in item.fields:
item[self.FILES_RESULT_FIELD] = [x for ok, x in results if ok]
return item

View File

@@ -109,7 +109,7 @@ class ImagesPipeline(FilesPipeline):
return [Request(x) for x in item.get(self.IMAGES_URLS_FIELD, [])]
def item_completed(self, results, item, info):
if self.IMAGES_RESULT_FIELD in item.fields:
if isinstance(item, dict) or self.IMAGES_RESULT_FIELD in item.fields:
item[self.IMAGES_RESULT_FIELD] = [x for ok, x in results if ok]
return item

View File

@@ -174,7 +174,7 @@ class Scraper(object):
"""
if isinstance(output, Request):
self.crawler.engine.crawl(request=output, spider=spider)
elif isinstance(output, BaseItem):
elif isinstance(output, (BaseItem, dict)):
self.slot.itemproc_size += 1
dfd = self.itemproc.process_item(output, spider)
dfd.addBoth(self._itemproc_finished, output, response, spider)
@ -183,7 +183,7 @@ class Scraper(object):
pass
else:
typename = type(output).__name__
log.msg(format='Spider must return Request, BaseItem or None, '
log.msg(format='Spider must return Request, BaseItem, dict or None, '
'got %(typename)r in %(request)s',
level=log.ERROR, spider=spider, request=request, typename=typename)

View File

@@ -85,6 +85,7 @@ class ItemSpider(FollowAllSpider):
for request in super(ItemSpider, self).parse(response):
yield request
yield Item()
yield {}
class DefaultError(Exception):

View File

@@ -127,6 +127,7 @@ class MiscCommandsTest(CommandTest):
def test_list(self):
self.assertEqual(0, self.call('list'))
class RunSpiderCommandTest(CommandTest):
def test_runspider(self):
@ -135,10 +136,10 @@ class RunSpiderCommandTest(CommandTest):
fname = abspath(join(tmpdir, 'myspider.py'))
with open(fname, 'w') as f:
f.write("""
import scrapy
from scrapy import log
from scrapy.spider import Spider
class MySpider(Spider):
class MySpider(scrapy.Spider):
name = 'myspider'
def start_requests(self):
@ -192,16 +193,15 @@ class ParseCommandTest(ProcessTest, SiteTest, CommandTest):
with open(fname, 'w') as f:
f.write("""
from scrapy import log
from scrapy.spider import Spider
from scrapy.item import Item
import scrapy
class MySpider(Spider):
class MySpider(scrapy.Spider):
name = '{0}'
def parse(self, response):
if getattr(self, 'test_arg', None):
self.log('It Works!')
return [Item()]
return [scrapy.Item(), dict(foo='bar')]
""".format(self.spider_name))
fname = abspath(join(self.proj_mod_path, 'pipelines.py'))
@ -239,6 +239,14 @@ ITEM_PIPELINES = {'%s.pipelines.MyPipeline': 1}
self.url('/html')])
self.assert_("[scrapy] INFO: It Works!" in stderr, stderr)
@defer.inlineCallbacks
def test_parse_items(self):
status, out, stderr = yield self.execute(
['--spider', self.spider_name, '-c', 'parse', self.url('/html')]
)
self.assertIn("""[{}, {'foo': 'bar'}]""", out)
class BenchCommandTest(CommandTest):

View File

@@ -39,6 +39,13 @@ class TestSpider(Spider):
"""
return TestItem(url=response.url)
def returns_dict_item(self, response):
""" method which returns item
@url http://scrapy.org
@returns items 1 1
"""
return {"url": response.url}
def returns_fail(self, response):
""" method which returns item
@url http://scrapy.org
@ -46,6 +53,13 @@ class TestSpider(Spider):
"""
return TestItem(url=response.url)
def returns_dict_fail(self, response):
""" method which returns item
@url http://scrapy.org
@returns items 0 0
"""
return {'url': response.url}
def scrapes_item_ok(self, response):
""" returns item with name and url
@url http://scrapy.org
@ -54,6 +68,14 @@ class TestSpider(Spider):
"""
return TestItem(name='test', url=response.url)
def scrapes_dict_item_ok(self, response):
""" returns item with name and url
@url http://scrapy.org
@returns items 1 1
@scrapes name url
"""
return {'name': 'test', 'url': response.url}
def scrapes_item_fail(self, response):
""" returns item with no name
@url http://scrapy.org
@ -62,6 +84,14 @@ class TestSpider(Spider):
"""
return TestItem(url=response.url)
def scrapes_dict_item_fail(self, response):
""" returns item with no name
@url http://scrapy.org
@returns items 1 1
@scrapes name url
"""
return {'url': response.url}
def parse_no_url(self, response):
""" method with no url
@returns items 1 1
@ -110,6 +140,11 @@ class ContractsManagerTest(unittest.TestCase):
request.callback(response)
self.should_succeed()
# returns_dict_item
request = self.conman.from_method(spider.returns_dict_item, self.results)
request.callback(response)
self.should_succeed()
# returns_request
request = self.conman.from_method(spider.returns_request, self.results)
request.callback(response)
@ -120,6 +155,11 @@ class ContractsManagerTest(unittest.TestCase):
request.callback(response)
self.should_fail()
# returns_dict_fail
request = self.conman.from_method(spider.returns_dict_fail, self.results)
request.callback(response)
self.should_fail()
def test_scrapes(self):
spider = TestSpider()
response = ResponseMock()
@ -129,8 +169,19 @@ class ContractsManagerTest(unittest.TestCase):
request.callback(response)
self.should_succeed()
# scrapes_dict_item_ok
request = self.conman.from_method(spider.scrapes_dict_item_ok, self.results)
request.callback(response)
self.should_succeed()
# scrapes_item_fail
request = self.conman.from_method(spider.scrapes_item_fail,
self.results)
request.callback(response)
self.should_fail()
# scrapes_dict_item_fail
request = self.conman.from_method(spider.scrapes_dict_item_fail,
self.results)
request.callback(response)
self.should_fail()

View File

@@ -1,14 +1,19 @@
import unittest, json
from __future__ import absolute_import
import re
import json
import unittest
from io import BytesIO
from six.moves import cPickle as pickle
import lxml.etree
import re
from scrapy.item import Item, Field
from scrapy.utils.python import str_to_unicode
from scrapy.contrib.exporter import BaseItemExporter, PprintItemExporter, \
PickleItemExporter, CsvItemExporter, XmlItemExporter, JsonLinesItemExporter, \
JsonItemExporter, PythonItemExporter
from scrapy.contrib.exporter import (
BaseItemExporter, PprintItemExporter, PickleItemExporter, CsvItemExporter,
XmlItemExporter, JsonLinesItemExporter, JsonItemExporter, PythonItemExporter
)
class TestItem(Item):
name = Field()
@ -33,21 +38,28 @@ class BaseItemExporterTest(unittest.TestCase):
exported_dict[k] = str_to_unicode(v)
self.assertEqual(self.i, exported_dict)
def test_export_item(self):
def assertItemExportWorks(self, item):
self.ie.start_exporting()
try:
self.ie.export_item(self.i)
self.ie.export_item(item)
except NotImplementedError:
if self.ie.__class__ is not BaseItemExporter:
raise
self.ie.finish_exporting()
self._check_output()
def test_export_item(self):
self.assertItemExportWorks(self.i)
def test_export_dict_item(self):
self.assertItemExportWorks(dict(self.i))
def test_serialize_field(self):
self.assertEqual(self.ie.serialize_field( \
self.i.fields['name'], 'name', self.i['name']), 'John\xc2\xa3')
self.assertEqual( \
self.ie.serialize_field(self.i.fields['age'], 'age', self.i['age']), '22')
res = self.ie.serialize_field(self.i.fields['name'], 'name', self.i['name'])
self.assertEqual(res, 'John\xc2\xa3')
res = self.ie.serialize_field(self.i.fields['age'], 'age', self.i['age'])
self.assertEqual(res, '22')
def test_fields_to_export(self):
ie = self._get_exporter(fields_to_export=['name'])
@ -72,13 +84,14 @@ class BaseItemExporterTest(unittest.TestCase):
self.assertEqual(ie.serialize_field(i.fields['name'], 'name', i['name']), 'John\xc2\xa3')
self.assertEqual(ie.serialize_field(i.fields['age'], 'age', i['age']), '24')
class PythonItemExporterTest(BaseItemExporterTest):
def _get_exporter(self, **kwargs):
return PythonItemExporter(**kwargs)
def test_nested_item(self):
i1 = TestItem(name=u'Joseph', age='22')
i2 = TestItem(name=u'Maria', age=i1)
i2 = dict(name=u'Maria', age=i1)
i3 = TestItem(name=u'Jesus', age=i2)
ie = self._get_exporter()
exported = ie.export_item(i3)
@ -107,6 +120,7 @@ class PythonItemExporterTest(BaseItemExporterTest):
self.assertEqual(type(exported['age'][0]), dict)
self.assertEqual(type(exported['age'][0]['age'][0]), dict)
class PprintItemExporterTest(BaseItemExporterTest):
def _get_exporter(self, **kwargs):
@ -115,6 +129,7 @@ class PprintItemExporterTest(BaseItemExporterTest):
def _check_output(self):
self._assert_expected_item(eval(self.output.getvalue()))
class PickleItemExporterTest(BaseItemExporterTest):
def _get_exporter(self, **kwargs):
@ -150,48 +165,65 @@ class CsvItemExporterTest(BaseItemExporterTest):
def _check_output(self):
self.assertCsvEqual(self.output.getvalue(), 'age,name\r\n22,John\xc2\xa3\r\n')
def test_header(self):
output = BytesIO()
ie = CsvItemExporter(output, fields_to_export=self.i.fields.keys())
def assertExportResult(self, item, expected, **kwargs):
fp = BytesIO()
ie = CsvItemExporter(fp, **kwargs)
ie.start_exporting()
ie.export_item(self.i)
ie.export_item(item)
ie.finish_exporting()
self.assertCsvEqual(output.getvalue(), 'age,name\r\n22,John\xc2\xa3\r\n')
self.assertCsvEqual(fp.getvalue(), expected)
output = BytesIO()
ie = CsvItemExporter(output, fields_to_export=['age'])
ie.start_exporting()
ie.export_item(self.i)
ie.finish_exporting()
self.assertCsvEqual(output.getvalue(), 'age\r\n22\r\n')
def test_header_export_all(self):
self.assertExportResult(
item=self.i,
fields_to_export=self.i.fields.keys(),
expected='age,name\r\n22,John\xc2\xa3\r\n',
)
output = BytesIO()
ie = CsvItemExporter(output)
ie.start_exporting()
ie.export_item(self.i)
ie.export_item(self.i)
ie.finish_exporting()
self.assertCsvEqual(output.getvalue(), 'age,name\r\n22,John\xc2\xa3\r\n22,John\xc2\xa3\r\n')
def test_header_export_all_dict(self):
self.assertExportResult(
item=dict(self.i),
expected='age,name\r\n22,John\xc2\xa3\r\n',
)
output = BytesIO()
ie = CsvItemExporter(output, include_headers_line=False)
ie.start_exporting()
ie.export_item(self.i)
ie.finish_exporting()
self.assertCsvEqual(output.getvalue(), '22,John\xc2\xa3\r\n')
def test_header_export_single_field(self):
for item in [self.i, dict(self.i)]:
self.assertExportResult(
item=item,
fields_to_export=['age'],
expected='age\r\n22\r\n',
)
def test_header_export_two_items(self):
for item in [self.i, dict(self.i)]:
output = BytesIO()
ie = CsvItemExporter(output)
ie.start_exporting()
ie.export_item(item)
ie.export_item(item)
ie.finish_exporting()
self.assertCsvEqual(output.getvalue(), 'age,name\r\n22,John\xc2\xa3\r\n22,John\xc2\xa3\r\n')
def test_header_no_header_line(self):
for item in [self.i, dict(self.i)]:
self.assertExportResult(
item=item,
include_headers_line=False,
expected='22,John\xc2\xa3\r\n',
)
def test_join_multivalue(self):
class TestItem2(Item):
name = Field()
friends = Field()
i = TestItem2(name='John', friends=['Mary', 'Paul'])
output = BytesIO()
ie = CsvItemExporter(output, include_headers_line=False)
ie.start_exporting()
ie.export_item(i)
ie.finish_exporting()
self.assertCsvEqual(output.getvalue(), '"Mary,Paul",John\r\n')
for cls in TestItem2, dict:
self.assertExportResult(
item=cls(name='John', friends=['Mary', 'Paul']),
include_headers_line=False,
expected='"Mary,Paul",John\r\n',
)
class XmlItemExporterTest(BaseItemExporterTest):
@ -211,60 +243,62 @@ class XmlItemExporterTest(BaseItemExporterTest):
return xmltuple(doc)
return self.assertEqual(xmlsplit(first), xmlsplit(second), msg)
def assertExportResult(self, item, expected_value):
fp = BytesIO()
ie = XmlItemExporter(fp)
ie.start_exporting()
ie.export_item(item)
ie.finish_exporting()
self.assertXmlEquivalent(fp.getvalue(), expected_value)
def _check_output(self):
expected_value = '<?xml version="1.0" encoding="utf-8"?>\n<items><item><age>22</age><name>John\xc2\xa3</name></item></items>'
self.assertXmlEquivalent(self.output.getvalue(), expected_value)
def test_multivalued_fields(self):
output = BytesIO()
item = TestItem(name=[u'John\xa3', u'Doe'])
ie = XmlItemExporter(output)
ie.start_exporting()
ie.export_item(item)
ie.finish_exporting()
expected_value = '<?xml version="1.0" encoding="utf-8"?>\n<items><item><name><value>John\xc2\xa3</value><value>Doe</value></name></item></items>'
self.assertXmlEquivalent(output.getvalue(), expected_value)
self.assertExportResult(
TestItem(name=[u'John\xa3', u'Doe']),
'<?xml version="1.0" encoding="utf-8"?>\n<items><item><name><value>John\xc2\xa3</value><value>Doe</value></name></item></items>'
)
def test_nested_item(self):
output = BytesIO()
i1 = TestItem(name=u'foo\xa3hoo', age='22')
i2 = TestItem(name=u'bar', age=i1)
i2 = dict(name=u'bar', age=i1)
i3 = TestItem(name=u'buz', age=i2)
ie = XmlItemExporter(output)
ie.start_exporting()
ie.export_item(i3)
ie.finish_exporting()
expected_value = '<?xml version="1.0" encoding="utf-8"?>\n'\
'<items><item>'\
'<age>'\
'<age>'\
'<age>22</age>'\
'<name>foo\xc2\xa3hoo</name>'\
'</age>'\
'<name>bar</name>'\
'</age>'\
'<name>buz</name>'\
'</item></items>'
self.assertXmlEquivalent(output.getvalue(), expected_value)
self.assertExportResult(i3,
'<?xml version="1.0" encoding="utf-8"?>\n'
'<items>'
'<item>'
'<age>'
'<age>'
'<age>22</age>'
'<name>foo\xc2\xa3hoo</name>'
'</age>'
'<name>bar</name>'
'</age>'
'<name>buz</name>'
'</item>'
'</items>'
)
def test_nested_list_item(self):
output = BytesIO()
i1 = TestItem(name=u'foo')
i2 = TestItem(name=u'bar')
i2 = dict(name=u'bar', v2={"egg": ["spam"]})
i3 = TestItem(name=u'buz', age=[i1, i2])
ie = XmlItemExporter(output)
ie.start_exporting()
ie.export_item(i3)
ie.finish_exporting()
expected_value = '<?xml version="1.0" encoding="utf-8"?>\n'\
'<items><item>'\
'<age>'\
'<value><name>foo</name></value>'\
'<value><name>bar</name></value>'\
'</age>'\
'<name>buz</name>'\
'</item></items>'
self.assertXmlEquivalent(output.getvalue(), expected_value)
self.assertExportResult(i3,
'<?xml version="1.0" encoding="utf-8"?>\n'
'<items>'
'<item>'
'<age>'
'<value><name>foo</name></value>'
'<value><name>bar</name><v2><egg><value>spam</value></egg></v2></value>'
'</age>'
'<name>buz</name>'
'</item>'
'</items>'
)
class JsonLinesItemExporterTest(BaseItemExporterTest):
@ -280,7 +314,7 @@ class JsonLinesItemExporterTest(BaseItemExporterTest):
def test_nested_item(self):
i1 = TestItem(name=u'Joseph', age='22')
i2 = TestItem(name=u'Maria', age=i1)
i2 = dict(name=u'Maria', age=i1)
i3 = TestItem(name=u'Jesus', age=i2)
self.ie.start_exporting()
self.ie.export_item(i3)
@ -306,13 +340,19 @@ class JsonItemExporterTest(JsonLinesItemExporterTest):
exported = json.loads(self.output.getvalue().strip())
self.assertEqual(exported, [dict(self.i)])
def test_two_items(self):
def assertTwoItemsExported(self, item):
self.ie.start_exporting()
self.ie.export_item(self.i)
self.ie.export_item(self.i)
self.ie.export_item(item)
self.ie.export_item(item)
self.ie.finish_exporting()
exported = json.loads(self.output.getvalue())
self.assertEqual(exported, [dict(self.i), dict(self.i)])
self.assertEqual(exported, [dict(item), dict(item)])
def test_two_items(self):
self.assertTwoItemsExported(self.i)
def test_two_dict_items(self):
self.assertTwoItemsExported(dict(self.i))
def test_nested_item(self):
i1 = TestItem(name=u'Joseph\xa3', age='22')
@ -325,6 +365,18 @@ class JsonItemExporterTest(JsonLinesItemExporterTest):
expected = {'name': u'Jesus', 'age': {'name': 'Maria', 'age': dict(i1)}}
self.assertEqual(exported, [expected])
def test_nested_dict_item(self):
i1 = dict(name=u'Joseph\xa3', age='22')
i2 = TestItem(name=u'Maria', age=i1)
i3 = dict(name=u'Jesus', age=i2)
self.ie.start_exporting()
self.ie.export_item(i3)
self.ie.finish_exporting()
exported = json.loads(self.output.getvalue())
expected = {'name': u'Jesus', 'age': {'name': 'Maria', 'age': i1}}
self.assertEqual(exported, [expected])
class CustomItemExporterTest(unittest.TestCase):
def test_exporter_custom_serializer(self):
@ -333,16 +385,17 @@ class CustomItemExporterTest(unittest.TestCase):
if name == 'age':
return str(int(value) + 1)
else:
return super(CustomItemExporter, self).serialize_field(field, \
name, value)
return super(CustomItemExporter, self).serialize_field(field, name, value)
i = TestItem(name=u'John', age='22')
ie = CustomItemExporter()
self.assertEqual( \
ie.serialize_field(i.fields['name'], 'name', i['name']), 'John')
self.assertEqual(
ie.serialize_field(i.fields['age'], 'age', i['age']), '23')
self.assertEqual(ie.serialize_field(i.fields['name'], 'name', i['name']), 'John')
self.assertEqual(ie.serialize_field(i.fields['age'], 'age', i['age']), '23')
i2 = {'name': u'John', 'age': '22'}
self.assertEqual(ie.serialize_field({}, 'name', i2['name']), 'John')
self.assertEqual(ie.serialize_field({}, 'age', i2['age']), '23')
if __name__ == '__main__':

View File

@@ -28,11 +28,13 @@ from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.http import Request
from scrapy.utils.signal import disconnect_all
class TestItem(Item):
name = Field()
url = Field()
price = Field()
class TestSpider(Spider):
name = "scrapytest.org"
allowed_domains = ["scrapytest.org", "localhost"]
@ -41,6 +43,8 @@ class TestSpider(Spider):
name_re = re.compile("<h1>(.*?)</h1>", re.M)
price_re = re.compile(">Price: \$(.*?)<", re.M)
item_cls = TestItem
def parse(self, response):
xlink = LinkExtractor()
itemre = re.compile(self.itemurl_re)
@ -49,7 +53,7 @@ class TestSpider(Spider):
yield Request(url=link.url, callback=self.parse_item)
def parse_item(self, response):
item = TestItem()
item = self.item_cls()
m = self.name_re.search(response.body)
if m:
item['name'] = m.group(1)
@ -65,6 +69,10 @@ class TestDupeFilterSpider(TestSpider):
return Request(url) # dont_filter=False
class DictItemsSpider(TestSpider):
item_cls = dict
def start_test_site(debug=False):
root_dir = os.path.join(tests_datadir, "test_site")
r = static.File(root_dir)
@ -81,15 +89,14 @@ def start_test_site(debug=False):
class CrawlerRun(object):
"""A class to run the crawler and keep track of events occurred"""
def __init__(self, with_dupefilter=False):
def __init__(self, spider_class):
self.spider = None
self.respplug = []
self.reqplug = []
self.reqdropped = []
self.itemresp = []
self.signals_catched = {}
self.spider_class = TestSpider if not with_dupefilter else \
TestDupeFilterSpider
self.spider_class = spider_class
def run(self):
self.port = start_test_site()
@ -152,14 +159,17 @@ class EngineTest(unittest.TestCase):
@defer.inlineCallbacks
def test_crawler(self):
self.run = CrawlerRun()
yield self.run.run()
self._assert_visited_urls()
self._assert_scheduled_requests(urls_to_visit=8)
self._assert_downloaded_responses()
self._assert_scraped_items()
self._assert_signals_catched()
self.run = CrawlerRun(with_dupefilter=True)
for spider in TestSpider, DictItemsSpider:
self.run = CrawlerRun(spider)
yield self.run.run()
self._assert_visited_urls()
self._assert_scheduled_requests(urls_to_visit=8)
self._assert_downloaded_responses()
self._assert_scraped_items()
self._assert_signals_catched()
self.run = CrawlerRun(TestDupeFilterSpider)
yield self.run.run()
self._assert_scheduled_requests(urls_to_visit=7)
self._assert_dropped_requests()

View File

@@ -142,35 +142,40 @@ class DeprecatedFilesPipelineTestCase(unittest.TestCase):
class FilesPipelineTestCaseFields(unittest.TestCase):
def test_item_fields_default(self):
from scrapy.contrib.pipeline.files import FilesPipeline
class TestItem(Item):
name = Field()
file_urls = Field()
files = Field()
url = 'http://www.example.com/files/1.txt'
item = TestItem({'name': 'item1', 'file_urls': [url]})
pipeline = FilesPipeline.from_settings(Settings({'FILES_STORE': 's3://example/files/'}))
requests = list(pipeline.get_media_requests(item, None))
self.assertEqual(requests[0].url, url)
results = [(True, {'url': url})]
pipeline.item_completed(results, item, None)
self.assertEqual(item['files'], [results[0][1]])
for cls in TestItem, dict:
url = 'http://www.example.com/files/1.txt'
item = cls({'name': 'item1', 'file_urls': [url]})
pipeline = FilesPipeline.from_settings(Settings({'FILES_STORE': 's3://example/files/'}))
requests = list(pipeline.get_media_requests(item, None))
self.assertEqual(requests[0].url, url)
results = [(True, {'url': url})]
pipeline.item_completed(results, item, None)
self.assertEqual(item['files'], [results[0][1]])
def test_item_fields_override_settings(self):
from scrapy.contrib.pipeline.files import FilesPipeline
class TestItem(Item):
name = Field()
files = Field()
stored_file = Field()
url = 'http://www.example.com/files/1.txt'
item = TestItem({'name': 'item1', 'files': [url]})
pipeline = FilesPipeline.from_settings(Settings({'FILES_STORE': 's3://example/files/',
'FILES_URLS_FIELD': 'files', 'FILES_RESULT_FIELD': 'stored_file'}))
requests = list(pipeline.get_media_requests(item, None))
self.assertEqual(requests[0].url, url)
results = [(True, {'url': url})]
pipeline.item_completed(results, item, None)
self.assertEqual(item['stored_file'], [results[0][1]])
for cls in TestItem, dict:
url = 'http://www.example.com/files/1.txt'
item = cls({'name': 'item1', 'files': [url]})
pipeline = FilesPipeline.from_settings(Settings({
'FILES_STORE': 's3://example/files/',
'FILES_URLS_FIELD': 'files',
'FILES_RESULT_FIELD': 'stored_file'
}))
requests = list(pipeline.get_media_requests(item, None))
self.assertEqual(requests[0].url, url)
results = [(True, {'url': url})]
pipeline.item_completed(results, item, None)
self.assertEqual(item['stored_file'], [results[0][1]])
class ItemWithFiles(Item):

View File

@@ -168,35 +168,40 @@ class DeprecatedImagesPipelineTestCase(unittest.TestCase):
class ImagesPipelineTestCaseFields(unittest.TestCase):
def test_item_fields_default(self):
from scrapy.contrib.pipeline.images import ImagesPipeline
class TestItem(Item):
name = Field()
image_urls = Field()
images = Field()
url = 'http://www.example.com/images/1.jpg'
item = TestItem({'name': 'item1', 'image_urls': [url]})
pipeline = ImagesPipeline.from_settings(Settings({'IMAGES_STORE': 's3://example/images/'}))
requests = list(pipeline.get_media_requests(item, None))
self.assertEqual(requests[0].url, url)
results = [(True, {'url': url})]
pipeline.item_completed(results, item, None)
self.assertEqual(item['images'], [results[0][1]])
for cls in TestItem, dict:
url = 'http://www.example.com/images/1.jpg'
item = cls({'name': 'item1', 'image_urls': [url]})
pipeline = ImagesPipeline.from_settings(Settings({'IMAGES_STORE': 's3://example/images/'}))
requests = list(pipeline.get_media_requests(item, None))
self.assertEqual(requests[0].url, url)
results = [(True, {'url': url})]
pipeline.item_completed(results, item, None)
self.assertEqual(item['images'], [results[0][1]])
def test_item_fields_override_settings(self):
from scrapy.contrib.pipeline.images import ImagesPipeline
class TestItem(Item):
name = Field()
image = Field()
stored_image = Field()
url = 'http://www.example.com/images/1.jpg'
item = TestItem({'name': 'item1', 'image': [url]})
pipeline = ImagesPipeline.from_settings(Settings({'IMAGES_STORE': 's3://example/images/',
'IMAGES_URLS_FIELD': 'image', 'IMAGES_RESULT_FIELD': 'stored_image'}))
requests = list(pipeline.get_media_requests(item, None))
self.assertEqual(requests[0].url, url)
results = [(True, {'url': url})]
pipeline.item_completed(results, item, None)
self.assertEqual(item['stored_image'], [results[0][1]])
for cls in TestItem, dict:
url = 'http://www.example.com/images/1.jpg'
item = cls({'name': 'item1', 'image': [url]})
pipeline = ImagesPipeline.from_settings(Settings({
'IMAGES_STORE': 's3://example/images/',
'IMAGES_URLS_FIELD': 'image',
'IMAGES_RESULT_FIELD': 'stored_image'
}))
requests = list(pipeline.get_media_requests(item, None))
self.assertEqual(requests[0].url, url)
results = [(True, {'url': url})]
pipeline.item_completed(results, item, None)
self.assertEqual(item['stored_image'], [results[0][1]])
def _create_image(format, *a, **kw):