mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-23 16:03:56 +00:00)

commit 39635e5f55 (parent da90449edf)

    Allow spiders to return dicts. See GH-1064.
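Note (not part of the patch): a minimal sketch of what this commit enables. Spider callbacks may now yield plain dicts instead of Item instances; the spider name and URL below are illustrative.

    import scrapy

    class DictSpider(scrapy.Spider):
        name = 'dict_spider'
        start_urls = ['http://scrapy.org']

        def parse(self, response):
            # with this change the engine accepts (BaseItem, dict) from callbacks
            yield {'url': response.url,
                   'title': response.xpath('//title/text()').extract()}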
@@ -197,12 +197,17 @@ BaseItemExporter
       Some exporters (like :class:`CsvItemExporter`) respect the order of the
       fields defined in this attribute.
 
+      Some exporters may require fields_to_export list in order to export the
+      data properly when spiders return dicts (not :class:`~Item` instances).
+
    .. attribute:: export_empty_fields
 
       Whether to include empty/unpopulated item fields in the exported data.
       Defaults to ``False``. Some exporters (like :class:`CsvItemExporter`)
       ignore this attribute and always export all empty fields.
 
+      This option is ignored for dict items.
+
    .. attribute:: encoding
 
       The encoding that will be used to encode unicode values. This only
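Note (illustrative, not from the patch): because a dict carries no declared schema, an explicit fields_to_export keeps CSV columns stable across dict items, as the documentation hunk above recommends. Field names and the output file are assumptions.

    from scrapy.contrib.exporter import CsvItemExporter

    f = open('output.csv', 'wb')
    exporter = CsvItemExporter(f, fields_to_export=['name', 'age'])
    exporter.start_exporting()
    exporter.export_item({'name': 'John', 'age': '22'})
    exporter.export_item({'age': '23', 'name': 'Jane'})  # key order is irrelevant
    exporter.finish_exporting()
    f.close()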
@@ -107,7 +107,7 @@ class Command(ScrapyCommand):
         items, requests = [], []
 
         for x in iterate_spider_output(cb(response)):
-            if isinstance(x, BaseItem):
+            if isinstance(x, (BaseItem, dict)):
                 items.append(x)
             elif isinstance(x, Request):
                 requests.append(x)
@@ -35,8 +35,8 @@ class ReturnsContract(Contract):
     objects = {
         'request': Request,
         'requests': Request,
-        'item': BaseItem,
-        'items': BaseItem,
+        'item': (BaseItem, dict),
+        'items': (BaseItem, dict),
     }
 
     def __init__(self, *args, **kwargs):
@@ -83,7 +83,7 @@ class ScrapesContract(Contract):
 
     def post_process(self, output):
         for x in output:
-            if isinstance(x, BaseItem):
+            if isinstance(x, (BaseItem, dict)):
                 for arg in self.args:
                     if not arg in x:
                         raise ContractFail("'%s' field is missing" % arg)
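Note (sketch mirroring the contract tests added later in this commit): with (BaseItem, dict) registered in ReturnsContract.objects, a dict-returning callback satisfies @returns items, and ScrapesContract checks dict keys via the same `arg in x` test.

    def returns_dict(self, response):
        """ returns a dict with name and url
        @url http://scrapy.org
        @returns items 1 1
        @scrapes name url
        """
        return {'name': 'test', 'url': response.url}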
@@ -9,6 +9,7 @@ import marshal
 import six
 from six.moves import cPickle as pickle
 from xml.sax.saxutils import XMLGenerator
 
 from scrapy.utils.serialize import ScrapyJSONEncoder
+from scrapy.item import BaseItem
 
@@ -50,13 +51,13 @@ class BaseItemExporter(object):
         return value.encode(self.encoding) if isinstance(value, unicode) else value
 
     def _get_serialized_fields(self, item, default_value=None, include_empty=None):
-        """Return the fields to export as an iterable of tuples (name,
-        serialized_value)
+        """Return the fields to export as an iterable of tuples
+        (name, serialized_value)
         """
         if include_empty is None:
             include_empty = self.export_empty_fields
         if self.fields_to_export is None:
-            if include_empty:
+            if include_empty and not isinstance(item, dict):
                 field_iter = six.iterkeys(item.fields)
             else:
                 field_iter = six.iterkeys(item)
@@ -64,12 +65,11 @@ class BaseItemExporter(object):
             if include_empty:
                 field_iter = self.fields_to_export
             else:
-                nonempty_fields = set(item.keys())
-                field_iter = (x for x in self.fields_to_export if x in
-                              nonempty_fields)
+                field_iter = (x for x in self.fields_to_export if x in item)
 
         for field_name in field_iter:
             if field_name in item:
-                field = item.fields[field_name]
+                field = {} if isinstance(item, dict) else item.fields[field_name]
                 value = self.serialize_field(field, field_name, item[field_name])
             else:
                 value = default_value
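Note (a hedged consequence of the two hunks above, inferred from the code rather than stated in the patch): a dict item is iterated by its own keys and has no Field metadata, so serialize_field() receives an empty dict for `field` and per-field serializers declared on an Item cannot apply; export_empty_fields is likewise a no-op for dicts.

    from scrapy.contrib.exporter import PythonItemExporter

    ie = PythonItemExporter()
    ie.export_item({'name': 'John'})  # -> {'name': 'John'}: only keys present are exported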
@@ -191,7 +191,12 @@ class CsvItemExporter(BaseItemExporter):
     def _write_headers_and_set_fields_to_export(self, item):
         if self.include_headers_line:
             if not self.fields_to_export:
-                self.fields_to_export = item.fields.keys()
+                if isinstance(item, dict):
+                    # for dicts try using fields of the first item
+                    self.fields_to_export = list(item.keys())
+                else:
+                    # use fields declared in Item
+                    self.fields_to_export = list(item.fields.keys())
             self.csv_writer.writerow(self.fields_to_export)
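Note (illustrative sketch of the header logic above; values are made up): with no explicit fields_to_export, the first exported dict fixes the CSV columns, and keys appearing only in later items gain no columns.

    from io import BytesIO
    from scrapy.contrib.exporter import CsvItemExporter

    fp = BytesIO()
    ie = CsvItemExporter(fp)
    ie.start_exporting()
    ie.export_item({'age': '22', 'name': 'John'})    # header inferred from this dict
    ie.export_item({'name': 'Jane', 'city': 'Oslo'})  # 'city' is silently dropped
    ie.finish_exporting()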
@@ -267,7 +267,7 @@ class FilesPipeline(MediaPipeline):
         return checksum
 
     def item_completed(self, results, item, info):
-        if self.FILES_RESULT_FIELD in item.fields:
+        if isinstance(item, dict) or self.FILES_RESULT_FIELD in item.fields:
             item[self.FILES_RESULT_FIELD] = [x for ok, x in results if ok]
         return item
@@ -109,7 +109,7 @@ class ImagesPipeline(FilesPipeline):
         return [Request(x) for x in item.get(self.IMAGES_URLS_FIELD, [])]
 
     def item_completed(self, results, item, info):
-        if self.IMAGES_RESULT_FIELD in item.fields:
+        if isinstance(item, dict) or self.IMAGES_RESULT_FIELD in item.fields:
             item[self.IMAGES_RESULT_FIELD] = [x for ok, x in results if ok]
         return item
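Note (modeled directly on the pipeline tests at the bottom of this commit; the store URI and file URL are the tests' own placeholders): a dict item needs no declared fields for the media pipelines to attach results.

    from scrapy.contrib.pipeline.files import FilesPipeline
    from scrapy.settings import Settings

    pipeline = FilesPipeline.from_settings(Settings({'FILES_STORE': 's3://example/files/'}))
    url = 'http://www.example.com/files/1.txt'
    item = {'name': 'item1', 'file_urls': [url]}
    results = [(True, {'url': url})]
    pipeline.item_completed(results, item, None)
    assert item['files'] == [{'url': url}]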
@@ -174,7 +174,7 @@ class Scraper(object):
         """
         if isinstance(output, Request):
             self.crawler.engine.crawl(request=output, spider=spider)
-        elif isinstance(output, BaseItem):
+        elif isinstance(output, (BaseItem, dict)):
             self.slot.itemproc_size += 1
             dfd = self.itemproc.process_item(output, spider)
             dfd.addBoth(self._itemproc_finished, output, response, spider)
@@ -183,7 +183,7 @@ class Scraper(object):
             pass
         else:
             typename = type(output).__name__
-            log.msg(format='Spider must return Request, BaseItem or None, '
+            log.msg(format='Spider must return Request, BaseItem, dict or None, '
                           'got %(typename)r in %(request)s',
                    level=log.ERROR, spider=spider, request=request, typename=typename)
@@ -85,6 +85,7 @@ class ItemSpider(FollowAllSpider):
         for request in super(ItemSpider, self).parse(response):
             yield request
             yield Item()
+            yield {}
 
 
 class DefaultError(Exception):
@@ -127,6 +127,7 @@ class MiscCommandsTest(CommandTest):
     def test_list(self):
         self.assertEqual(0, self.call('list'))
 
+
 class RunSpiderCommandTest(CommandTest):
 
     def test_runspider(self):
@@ -135,10 +136,10 @@ class RunSpiderCommandTest(CommandTest):
         fname = abspath(join(tmpdir, 'myspider.py'))
         with open(fname, 'w') as f:
             f.write("""
+import scrapy
 from scrapy import log
-from scrapy.spider import Spider
 
-class MySpider(Spider):
+class MySpider(scrapy.Spider):
     name = 'myspider'
 
     def start_requests(self):
@@ -192,16 +193,15 @@ class ParseCommandTest(ProcessTest, SiteTest, CommandTest):
         with open(fname, 'w') as f:
             f.write("""
 from scrapy import log
-from scrapy.spider import Spider
-from scrapy.item import Item
+import scrapy
 
-class MySpider(Spider):
+class MySpider(scrapy.Spider):
     name = '{0}'
 
     def parse(self, response):
         if getattr(self, 'test_arg', None):
             self.log('It Works!')
-        return [Item()]
+        return [scrapy.Item(), dict(foo='bar')]
 """.format(self.spider_name))
 
         fname = abspath(join(self.proj_mod_path, 'pipelines.py'))
@@ -239,6 +239,14 @@ ITEM_PIPELINES = {'%s.pipelines.MyPipeline': 1}
             self.url('/html')])
         self.assert_("[scrapy] INFO: It Works!" in stderr, stderr)
 
+    @defer.inlineCallbacks
+    def test_parse_items(self):
+        status, out, stderr = yield self.execute(
+            ['--spider', self.spider_name, '-c', 'parse', self.url('/html')]
+        )
+        self.assertIn("""[{}, {'foo': 'bar'}]""", out)
+
 
 class BenchCommandTest(CommandTest):
@@ -39,6 +39,13 @@ class TestSpider(Spider):
         """
         return TestItem(url=response.url)
 
+    def returns_dict_item(self, response):
+        """ method which returns item
+        @url http://scrapy.org
+        @returns items 1 1
+        """
+        return {"url": response.url}
+
     def returns_fail(self, response):
         """ method which returns item
         @url http://scrapy.org
@@ -46,6 +53,13 @@ class TestSpider(Spider):
         """
         return TestItem(url=response.url)
 
+    def returns_dict_fail(self, response):
+        """ method which returns item
+        @url http://scrapy.org
+        @returns items 0 0
+        """
+        return {'url': response.url}
+
     def scrapes_item_ok(self, response):
         """ returns item with name and url
         @url http://scrapy.org
@@ -54,6 +68,14 @@ class TestSpider(Spider):
         """
         return TestItem(name='test', url=response.url)
 
+    def scrapes_dict_item_ok(self, response):
+        """ returns item with name and url
+        @url http://scrapy.org
+        @returns items 1 1
+        @scrapes name url
+        """
+        return {'name': 'test', 'url': response.url}
+
     def scrapes_item_fail(self, response):
         """ returns item with no name
         @url http://scrapy.org
@@ -62,6 +84,14 @@ class TestSpider(Spider):
         """
         return TestItem(url=response.url)
 
+    def scrapes_dict_item_fail(self, response):
+        """ returns item with no name
+        @url http://scrapy.org
+        @returns items 1 1
+        @scrapes name url
+        """
+        return {'url': response.url}
+
     def parse_no_url(self, response):
         """ method with no url
         @returns items 1 1
@@ -110,6 +140,11 @@ class ContractsManagerTest(unittest.TestCase):
         request.callback(response)
         self.should_succeed()
 
+        # returns_dict_item
+        request = self.conman.from_method(spider.returns_dict_item, self.results)
+        request.callback(response)
+        self.should_succeed()
+
         # returns_request
         request = self.conman.from_method(spider.returns_request, self.results)
         request.callback(response)
@@ -120,6 +155,11 @@ class ContractsManagerTest(unittest.TestCase):
         request.callback(response)
         self.should_fail()
 
+        # returns_dict_fail
+        request = self.conman.from_method(spider.returns_dict_fail, self.results)
+        request.callback(response)
+        self.should_fail()
+
     def test_scrapes(self):
         spider = TestSpider()
         response = ResponseMock()
@@ -129,8 +169,19 @@ class ContractsManagerTest(unittest.TestCase):
         request.callback(response)
         self.should_succeed()
 
+        # scrapes_dict_item_ok
+        request = self.conman.from_method(spider.scrapes_dict_item_ok, self.results)
+        request.callback(response)
+        self.should_succeed()
+
         # scrapes_item_fail
         request = self.conman.from_method(spider.scrapes_item_fail,
                                           self.results)
         request.callback(response)
         self.should_fail()
+
+        # scrapes_dict_item_fail
+        request = self.conman.from_method(spider.scrapes_dict_item_fail,
+                                          self.results)
+        request.callback(response)
+        self.should_fail()
@@ -1,14 +1,19 @@
-import unittest, json
+from __future__ import absolute_import
+import re
+import json
+import unittest
 from io import BytesIO
 from six.moves import cPickle as pickle
 
 import lxml.etree
-import re
 
 from scrapy.item import Item, Field
 from scrapy.utils.python import str_to_unicode
-from scrapy.contrib.exporter import BaseItemExporter, PprintItemExporter, \
-    PickleItemExporter, CsvItemExporter, XmlItemExporter, JsonLinesItemExporter, \
-    JsonItemExporter, PythonItemExporter
+from scrapy.contrib.exporter import (
+    BaseItemExporter, PprintItemExporter, PickleItemExporter, CsvItemExporter,
+    XmlItemExporter, JsonLinesItemExporter, JsonItemExporter, PythonItemExporter
+)
 
 
 class TestItem(Item):
     name = Field()
@@ -33,21 +38,28 @@ class BaseItemExporterTest(unittest.TestCase):
             exported_dict[k] = str_to_unicode(v)
         self.assertEqual(self.i, exported_dict)
 
-    def test_export_item(self):
+    def assertItemExportWorks(self, item):
         self.ie.start_exporting()
         try:
-            self.ie.export_item(self.i)
+            self.ie.export_item(item)
         except NotImplementedError:
             if self.ie.__class__ is not BaseItemExporter:
                 raise
         self.ie.finish_exporting()
         self._check_output()
 
+    def test_export_item(self):
+        self.assertItemExportWorks(self.i)
+
+    def test_export_dict_item(self):
+        self.assertItemExportWorks(dict(self.i))
+
     def test_serialize_field(self):
-        self.assertEqual(self.ie.serialize_field( \
-            self.i.fields['name'], 'name', self.i['name']), 'John\xc2\xa3')
-        self.assertEqual( \
-            self.ie.serialize_field(self.i.fields['age'], 'age', self.i['age']), '22')
+        res = self.ie.serialize_field(self.i.fields['name'], 'name', self.i['name'])
+        self.assertEqual(res, 'John\xc2\xa3')
+
+        res = self.ie.serialize_field(self.i.fields['age'], 'age', self.i['age'])
+        self.assertEqual(res, '22')
 
     def test_fields_to_export(self):
         ie = self._get_exporter(fields_to_export=['name'])
@@ -72,13 +84,14 @@ class BaseItemExporterTest(unittest.TestCase):
         self.assertEqual(ie.serialize_field(i.fields['name'], 'name', i['name']), 'John\xc2\xa3')
         self.assertEqual(ie.serialize_field(i.fields['age'], 'age', i['age']), '24')
 
+
 class PythonItemExporterTest(BaseItemExporterTest):
     def _get_exporter(self, **kwargs):
         return PythonItemExporter(**kwargs)
 
     def test_nested_item(self):
         i1 = TestItem(name=u'Joseph', age='22')
-        i2 = TestItem(name=u'Maria', age=i1)
+        i2 = dict(name=u'Maria', age=i1)
         i3 = TestItem(name=u'Jesus', age=i2)
         ie = self._get_exporter()
         exported = ie.export_item(i3)
@@ -107,6 +120,7 @@ class PythonItemExporterTest(BaseItemExporterTest):
         self.assertEqual(type(exported['age'][0]), dict)
         self.assertEqual(type(exported['age'][0]['age'][0]), dict)
 
+
 class PprintItemExporterTest(BaseItemExporterTest):
 
     def _get_exporter(self, **kwargs):
@@ -115,6 +129,7 @@ class PprintItemExporterTest(BaseItemExporterTest):
     def _check_output(self):
         self._assert_expected_item(eval(self.output.getvalue()))
 
+
 class PickleItemExporterTest(BaseItemExporterTest):
 
     def _get_exporter(self, **kwargs):
@@ -150,48 +165,65 @@ class CsvItemExporterTest(BaseItemExporterTest):
     def _check_output(self):
         self.assertCsvEqual(self.output.getvalue(), 'age,name\r\n22,John\xc2\xa3\r\n')
 
-    def test_header(self):
-        output = BytesIO()
-        ie = CsvItemExporter(output, fields_to_export=self.i.fields.keys())
+    def assertExportResult(self, item, expected, **kwargs):
+        fp = BytesIO()
+        ie = CsvItemExporter(fp, **kwargs)
         ie.start_exporting()
-        ie.export_item(self.i)
+        ie.export_item(item)
         ie.finish_exporting()
-        self.assertCsvEqual(output.getvalue(), 'age,name\r\n22,John\xc2\xa3\r\n')
+        self.assertCsvEqual(fp.getvalue(), expected)
 
-        output = BytesIO()
-        ie = CsvItemExporter(output, fields_to_export=['age'])
-        ie.start_exporting()
-        ie.export_item(self.i)
-        ie.finish_exporting()
-        self.assertCsvEqual(output.getvalue(), 'age\r\n22\r\n')
+    def test_header_export_all(self):
+        self.assertExportResult(
+            item=self.i,
+            fields_to_export=self.i.fields.keys(),
+            expected='age,name\r\n22,John\xc2\xa3\r\n',
+        )
 
-        output = BytesIO()
-        ie = CsvItemExporter(output)
-        ie.start_exporting()
-        ie.export_item(self.i)
-        ie.export_item(self.i)
-        ie.finish_exporting()
-        self.assertCsvEqual(output.getvalue(), 'age,name\r\n22,John\xc2\xa3\r\n22,John\xc2\xa3\r\n')
+    def test_header_export_all_dict(self):
+        self.assertExportResult(
+            item=dict(self.i),
+            expected='age,name\r\n22,John\xc2\xa3\r\n',
+        )
 
-        output = BytesIO()
-        ie = CsvItemExporter(output, include_headers_line=False)
-        ie.start_exporting()
-        ie.export_item(self.i)
-        ie.finish_exporting()
-        self.assertCsvEqual(output.getvalue(), '22,John\xc2\xa3\r\n')
+    def test_header_export_single_field(self):
+        for item in [self.i, dict(self.i)]:
+            self.assertExportResult(
+                item=item,
+                fields_to_export=['age'],
+                expected='age\r\n22\r\n',
+            )
+
+    def test_header_export_two_items(self):
+        for item in [self.i, dict(self.i)]:
+            output = BytesIO()
+            ie = CsvItemExporter(output)
+            ie.start_exporting()
+            ie.export_item(item)
+            ie.export_item(item)
+            ie.finish_exporting()
+            self.assertCsvEqual(output.getvalue(), 'age,name\r\n22,John\xc2\xa3\r\n22,John\xc2\xa3\r\n')
+
+    def test_header_no_header_line(self):
+        for item in [self.i, dict(self.i)]:
+            self.assertExportResult(
+                item=item,
+                include_headers_line=False,
+                expected='22,John\xc2\xa3\r\n',
+            )
 
     def test_join_multivalue(self):
         class TestItem2(Item):
             name = Field()
             friends = Field()
 
-        i = TestItem2(name='John', friends=['Mary', 'Paul'])
-        output = BytesIO()
-        ie = CsvItemExporter(output, include_headers_line=False)
-        ie.start_exporting()
-        ie.export_item(i)
-        ie.finish_exporting()
-        self.assertCsvEqual(output.getvalue(), '"Mary,Paul",John\r\n')
+        for cls in TestItem2, dict:
+            self.assertExportResult(
+                item=cls(name='John', friends=['Mary', 'Paul']),
+                include_headers_line=False,
+                expected='"Mary,Paul",John\r\n',
+            )
 
 
 class XmlItemExporterTest(BaseItemExporterTest):
@@ -211,60 +243,62 @@ class XmlItemExporterTest(BaseItemExporterTest):
             return xmltuple(doc)
         return self.assertEqual(xmlsplit(first), xmlsplit(second), msg)
 
+    def assertExportResult(self, item, expected_value):
+        fp = BytesIO()
+        ie = XmlItemExporter(fp)
+        ie.start_exporting()
+        ie.export_item(item)
+        ie.finish_exporting()
+        self.assertXmlEquivalent(fp.getvalue(), expected_value)
+
     def _check_output(self):
         expected_value = '<?xml version="1.0" encoding="utf-8"?>\n<items><item><age>22</age><name>John\xc2\xa3</name></item></items>'
         self.assertXmlEquivalent(self.output.getvalue(), expected_value)
 
     def test_multivalued_fields(self):
-        output = BytesIO()
-        item = TestItem(name=[u'John\xa3', u'Doe'])
-        ie = XmlItemExporter(output)
-        ie.start_exporting()
-        ie.export_item(item)
-        ie.finish_exporting()
-        expected_value = '<?xml version="1.0" encoding="utf-8"?>\n<items><item><name><value>John\xc2\xa3</value><value>Doe</value></name></item></items>'
-        self.assertXmlEquivalent(output.getvalue(), expected_value)
+        self.assertExportResult(
+            TestItem(name=[u'John\xa3', u'Doe']),
+            '<?xml version="1.0" encoding="utf-8"?>\n<items><item><name><value>John\xc2\xa3</value><value>Doe</value></name></item></items>'
+        )
 
     def test_nested_item(self):
-        output = BytesIO()
         i1 = TestItem(name=u'foo\xa3hoo', age='22')
-        i2 = TestItem(name=u'bar', age=i1)
+        i2 = dict(name=u'bar', age=i1)
         i3 = TestItem(name=u'buz', age=i2)
-        ie = XmlItemExporter(output)
-        ie.start_exporting()
-        ie.export_item(i3)
-        ie.finish_exporting()
-        expected_value = '<?xml version="1.0" encoding="utf-8"?>\n'\
-            '<items><item>'\
-            '<age>'\
-            '<age>'\
-            '<age>22</age>'\
-            '<name>foo\xc2\xa3hoo</name>'\
-            '</age>'\
-            '<name>bar</name>'\
-            '</age>'\
-            '<name>buz</name>'\
-            '</item></items>'
-        self.assertXmlEquivalent(output.getvalue(), expected_value)
+
+        self.assertExportResult(i3,
+            '<?xml version="1.0" encoding="utf-8"?>\n'
+            '<items>'
+            '<item>'
+            '<age>'
+            '<age>'
+            '<age>22</age>'
+            '<name>foo\xc2\xa3hoo</name>'
+            '</age>'
+            '<name>bar</name>'
+            '</age>'
+            '<name>buz</name>'
+            '</item>'
+            '</items>'
+        )
 
     def test_nested_list_item(self):
-        output = BytesIO()
         i1 = TestItem(name=u'foo')
-        i2 = TestItem(name=u'bar')
+        i2 = dict(name=u'bar', v2={"egg": ["spam"]})
         i3 = TestItem(name=u'buz', age=[i1, i2])
-        ie = XmlItemExporter(output)
-        ie.start_exporting()
-        ie.export_item(i3)
-        ie.finish_exporting()
-        expected_value = '<?xml version="1.0" encoding="utf-8"?>\n'\
-            '<items><item>'\
-            '<age>'\
-            '<value><name>foo</name></value>'\
-            '<value><name>bar</name></value>'\
-            '</age>'\
-            '<name>buz</name>'\
-            '</item></items>'
-        self.assertXmlEquivalent(output.getvalue(), expected_value)
+
+        self.assertExportResult(i3,
+            '<?xml version="1.0" encoding="utf-8"?>\n'
+            '<items>'
+            '<item>'
+            '<age>'
+            '<value><name>foo</name></value>'
+            '<value><name>bar</name><v2><egg><value>spam</value></egg></v2></value>'
+            '</age>'
+            '<name>buz</name>'
+            '</item>'
+            '</items>'
        )
 
 
 class JsonLinesItemExporterTest(BaseItemExporterTest):
@@ -280,7 +314,7 @@ class JsonLinesItemExporterTest(BaseItemExporterTest):
 
     def test_nested_item(self):
         i1 = TestItem(name=u'Joseph', age='22')
-        i2 = TestItem(name=u'Maria', age=i1)
+        i2 = dict(name=u'Maria', age=i1)
         i3 = TestItem(name=u'Jesus', age=i2)
         self.ie.start_exporting()
         self.ie.export_item(i3)
@@ -306,13 +340,19 @@ class JsonItemExporterTest(JsonLinesItemExporterTest):
         exported = json.loads(self.output.getvalue().strip())
         self.assertEqual(exported, [dict(self.i)])
 
-    def test_two_items(self):
+    def assertTwoItemsExported(self, item):
         self.ie.start_exporting()
-        self.ie.export_item(self.i)
-        self.ie.export_item(self.i)
+        self.ie.export_item(item)
+        self.ie.export_item(item)
         self.ie.finish_exporting()
         exported = json.loads(self.output.getvalue())
-        self.assertEqual(exported, [dict(self.i), dict(self.i)])
+        self.assertEqual(exported, [dict(item), dict(item)])
+
+    def test_two_items(self):
+        self.assertTwoItemsExported(self.i)
+
+    def test_two_dict_items(self):
+        self.assertTwoItemsExported(dict(self.i))
 
     def test_nested_item(self):
         i1 = TestItem(name=u'Joseph\xa3', age='22')
@@ -325,6 +365,18 @@ class JsonItemExporterTest(JsonLinesItemExporterTest):
         expected = {'name': u'Jesus', 'age': {'name': 'Maria', 'age': dict(i1)}}
         self.assertEqual(exported, [expected])
 
+    def test_nested_dict_item(self):
+        i1 = dict(name=u'Joseph\xa3', age='22')
+        i2 = TestItem(name=u'Maria', age=i1)
+        i3 = dict(name=u'Jesus', age=i2)
+        self.ie.start_exporting()
+        self.ie.export_item(i3)
+        self.ie.finish_exporting()
+        exported = json.loads(self.output.getvalue())
+        expected = {'name': u'Jesus', 'age': {'name': 'Maria', 'age': i1}}
+        self.assertEqual(exported, [expected])
+
 
 class CustomItemExporterTest(unittest.TestCase):
 
     def test_exporter_custom_serializer(self):
@@ -333,16 +385,17 @@ class CustomItemExporterTest(unittest.TestCase):
             if name == 'age':
                 return str(int(value) + 1)
             else:
-                return super(CustomItemExporter, self).serialize_field(field, \
-                    name, value)
+                return super(CustomItemExporter, self).serialize_field(field, name, value)
 
         i = TestItem(name=u'John', age='22')
         ie = CustomItemExporter()
 
-        self.assertEqual( \
-            ie.serialize_field(i.fields['name'], 'name', i['name']), 'John')
-        self.assertEqual(
-            ie.serialize_field(i.fields['age'], 'age', i['age']), '23')
+        self.assertEqual(ie.serialize_field(i.fields['name'], 'name', i['name']), 'John')
+        self.assertEqual(ie.serialize_field(i.fields['age'], 'age', i['age']), '23')
+
+        i2 = {'name': u'John', 'age': '22'}
+        self.assertEqual(ie.serialize_field({}, 'name', i2['name']), 'John')
+        self.assertEqual(ie.serialize_field({}, 'age', i2['age']), '23')
 
 
 if __name__ == '__main__':
@@ -28,11 +28,13 @@ from scrapy.contrib.linkextractors import LinkExtractor
 from scrapy.http import Request
 from scrapy.utils.signal import disconnect_all
 
+
 class TestItem(Item):
     name = Field()
     url = Field()
     price = Field()
 
+
 class TestSpider(Spider):
     name = "scrapytest.org"
     allowed_domains = ["scrapytest.org", "localhost"]
@@ -41,6 +43,8 @@ class TestSpider(Spider):
     name_re = re.compile("<h1>(.*?)</h1>", re.M)
     price_re = re.compile(">Price: \$(.*?)<", re.M)
 
+    item_cls = TestItem
+
     def parse(self, response):
         xlink = LinkExtractor()
         itemre = re.compile(self.itemurl_re)
@@ -49,7 +53,7 @@ class TestSpider(Spider):
             yield Request(url=link.url, callback=self.parse_item)
 
     def parse_item(self, response):
-        item = TestItem()
+        item = self.item_cls()
         m = self.name_re.search(response.body)
         if m:
             item['name'] = m.group(1)
@@ -65,6 +69,10 @@ class TestDupeFilterSpider(TestSpider):
         return Request(url)  # dont_filter=False
 
 
+class DictItemsSpider(TestSpider):
+    item_cls = dict
+
+
 def start_test_site(debug=False):
     root_dir = os.path.join(tests_datadir, "test_site")
     r = static.File(root_dir)
@@ -81,15 +89,14 @@ def start_test_site(debug=False):
 class CrawlerRun(object):
     """A class to run the crawler and keep track of events occurred"""
 
-    def __init__(self, with_dupefilter=False):
+    def __init__(self, spider_class):
         self.spider = None
         self.respplug = []
         self.reqplug = []
         self.reqdropped = []
         self.itemresp = []
         self.signals_catched = {}
-        self.spider_class = TestSpider if not with_dupefilter else \
-            TestDupeFilterSpider
+        self.spider_class = spider_class
 
     def run(self):
         self.port = start_test_site()
@@ -152,14 +159,17 @@ class EngineTest(unittest.TestCase):
 
     @defer.inlineCallbacks
     def test_crawler(self):
-        self.run = CrawlerRun()
-        yield self.run.run()
-        self._assert_visited_urls()
-        self._assert_scheduled_requests(urls_to_visit=8)
-        self._assert_downloaded_responses()
-        self._assert_scraped_items()
-        self._assert_signals_catched()
-        self.run = CrawlerRun(with_dupefilter=True)
+
+        for spider in TestSpider, DictItemsSpider:
+            self.run = CrawlerRun(spider)
+            yield self.run.run()
+            self._assert_visited_urls()
+            self._assert_scheduled_requests(urls_to_visit=8)
+            self._assert_downloaded_responses()
+            self._assert_scraped_items()
+            self._assert_signals_catched()
+
+        self.run = CrawlerRun(TestDupeFilterSpider)
         yield self.run.run()
         self._assert_scheduled_requests(urls_to_visit=7)
         self._assert_dropped_requests()
@@ -142,35 +142,40 @@ class DeprecatedFilesPipelineTestCase(unittest.TestCase):
 class FilesPipelineTestCaseFields(unittest.TestCase):
 
     def test_item_fields_default(self):
         from scrapy.contrib.pipeline.files import FilesPipeline
         class TestItem(Item):
             name = Field()
             file_urls = Field()
             files = Field()
-        url = 'http://www.example.com/files/1.txt'
-        item = TestItem({'name': 'item1', 'file_urls': [url]})
-        pipeline = FilesPipeline.from_settings(Settings({'FILES_STORE': 's3://example/files/'}))
-        requests = list(pipeline.get_media_requests(item, None))
-        self.assertEqual(requests[0].url, url)
-        results = [(True, {'url': url})]
-        pipeline.item_completed(results, item, None)
-        self.assertEqual(item['files'], [results[0][1]])
+        for cls in TestItem, dict:
+            url = 'http://www.example.com/files/1.txt'
+            item = cls({'name': 'item1', 'file_urls': [url]})
+            pipeline = FilesPipeline.from_settings(Settings({'FILES_STORE': 's3://example/files/'}))
+            requests = list(pipeline.get_media_requests(item, None))
+            self.assertEqual(requests[0].url, url)
+            results = [(True, {'url': url})]
+            pipeline.item_completed(results, item, None)
+            self.assertEqual(item['files'], [results[0][1]])
 
     def test_item_fields_override_settings(self):
         from scrapy.contrib.pipeline.files import FilesPipeline
         class TestItem(Item):
             name = Field()
             files = Field()
             stored_file = Field()
-        url = 'http://www.example.com/files/1.txt'
-        item = TestItem({'name': 'item1', 'files': [url]})
-        pipeline = FilesPipeline.from_settings(Settings({'FILES_STORE': 's3://example/files/',
-            'FILES_URLS_FIELD': 'files', 'FILES_RESULT_FIELD': 'stored_file'}))
-        requests = list(pipeline.get_media_requests(item, None))
-        self.assertEqual(requests[0].url, url)
-        results = [(True, {'url': url})]
-        pipeline.item_completed(results, item, None)
-        self.assertEqual(item['stored_file'], [results[0][1]])
+        for cls in TestItem, dict:
+            url = 'http://www.example.com/files/1.txt'
+            item = cls({'name': 'item1', 'files': [url]})
+            pipeline = FilesPipeline.from_settings(Settings({
+                'FILES_STORE': 's3://example/files/',
+                'FILES_URLS_FIELD': 'files',
+                'FILES_RESULT_FIELD': 'stored_file'
+            }))
+            requests = list(pipeline.get_media_requests(item, None))
+            self.assertEqual(requests[0].url, url)
+            results = [(True, {'url': url})]
+            pipeline.item_completed(results, item, None)
+            self.assertEqual(item['stored_file'], [results[0][1]])
 
 
 class ItemWithFiles(Item):
@@ -168,35 +168,40 @@ class DeprecatedImagesPipelineTestCase(unittest.TestCase):
 class ImagesPipelineTestCaseFields(unittest.TestCase):
 
     def test_item_fields_default(self):
         from scrapy.contrib.pipeline.images import ImagesPipeline
         class TestItem(Item):
             name = Field()
             image_urls = Field()
             images = Field()
-        url = 'http://www.example.com/images/1.jpg'
-        item = TestItem({'name': 'item1', 'image_urls': [url]})
-        pipeline = ImagesPipeline.from_settings(Settings({'IMAGES_STORE': 's3://example/images/'}))
-        requests = list(pipeline.get_media_requests(item, None))
-        self.assertEqual(requests[0].url, url)
-        results = [(True, {'url': url})]
-        pipeline.item_completed(results, item, None)
-        self.assertEqual(item['images'], [results[0][1]])
+        for cls in TestItem, dict:
+            url = 'http://www.example.com/images/1.jpg'
+            item = cls({'name': 'item1', 'image_urls': [url]})
+            pipeline = ImagesPipeline.from_settings(Settings({'IMAGES_STORE': 's3://example/images/'}))
+            requests = list(pipeline.get_media_requests(item, None))
+            self.assertEqual(requests[0].url, url)
+            results = [(True, {'url': url})]
+            pipeline.item_completed(results, item, None)
+            self.assertEqual(item['images'], [results[0][1]])
 
     def test_item_fields_override_settings(self):
         from scrapy.contrib.pipeline.images import ImagesPipeline
         class TestItem(Item):
             name = Field()
             image = Field()
             stored_image = Field()
-        url = 'http://www.example.com/images/1.jpg'
-        item = TestItem({'name': 'item1', 'image': [url]})
-        pipeline = ImagesPipeline.from_settings(Settings({'IMAGES_STORE': 's3://example/images/',
-            'IMAGES_URLS_FIELD': 'image', 'IMAGES_RESULT_FIELD': 'stored_image'}))
-        requests = list(pipeline.get_media_requests(item, None))
-        self.assertEqual(requests[0].url, url)
-        results = [(True, {'url': url})]
-        pipeline.item_completed(results, item, None)
-        self.assertEqual(item['stored_image'], [results[0][1]])
+        for cls in TestItem, dict:
+            url = 'http://www.example.com/images/1.jpg'
+            item = cls({'name': 'item1', 'image': [url]})
+            pipeline = ImagesPipeline.from_settings(Settings({
+                'IMAGES_STORE': 's3://example/images/',
+                'IMAGES_URLS_FIELD': 'image',
+                'IMAGES_RESULT_FIELD': 'stored_image'
+            }))
+            requests = list(pipeline.get_media_requests(item, None))
+            self.assertEqual(requests[0].url, url)
+            results = [(True, {'url': url})]
+            pipeline.item_completed(results, item, None)
+            self.assertEqual(item['stored_image'], [results[0][1]])
 
 
 def _create_image(format, *a, **kw):