1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 02:04:22 +00:00

added FEED_EXPORT_ENCODING setting to allow encoding specification

This commit is contained in:
Dracony 2016-06-08 17:24:08 +02:00
parent b7925e4220
commit 33a39b368f
5 changed files with 69 additions and 7 deletions

View File

@ -207,6 +207,7 @@ These are the settings used for configuring the feed exports:
* :setting:`FEED_STORAGES`
* :setting:`FEED_EXPORTERS`
* :setting:`FEED_STORE_EMPTY`
* :setting:`FEED_EXPORT_ENCODING`
* :setting:`FEED_EXPORT_FIELDS`
.. currentmodule:: scrapy.extensions.feedexport
@ -231,6 +232,20 @@ FEED_FORMAT
The serialization format to be used for the feed. See
:ref:`topics-feed-format` for possible values.
.. setting:: FEED_EXPORT_ENCODING
FEED_EXPORT_ENCODING
--------------------
Default: ``None``
The encoding to be used for the feed.
If unset or set to ``None`` (default), it uses UTF-8 for everything except JSON output,
which uses safe numeric encoding (``\uXXXX`` sequences) for historical reasons.
Set it to ``utf-8`` if you want UTF-8 for JSON output too.
.. setting:: FEED_EXPORT_FIELDS
FEED_EXPORT_FIELDS

View File

@ -33,9 +33,9 @@ class BaseItemExporter(object):
If dont_fail is set, it won't raise an exception on unexpected options
(useful for using with keyword arguments in subclasses constructors)
"""
self.encoding = options.pop('encoding', None)
self.fields_to_export = options.pop('fields_to_export', None)
self.export_empty_fields = options.pop('export_empty_fields', False)
self.encoding = options.pop('encoding', 'utf-8')
if not dont_fail and options:
raise TypeError("Unexpected options: %s" % ', '.join(options.keys()))
@ -84,11 +84,13 @@ class JsonLinesItemExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
self._configure(kwargs, dont_fail=True)
self.file = file
kwargs.setdefault('ensure_ascii', not self.encoding)
self.encoder = ScrapyJSONEncoder(**kwargs)
def export_item(self, item):
itemdict = dict(self._get_serialized_fields(item))
self.file.write(to_bytes(self.encoder.encode(itemdict) + '\n'))
data = self.encoder.encode(itemdict) + '\n'
self.file.write(to_bytes(data, self.encoding))
class JsonItemExporter(BaseItemExporter):
@ -96,6 +98,7 @@ class JsonItemExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
self._configure(kwargs, dont_fail=True)
self.file = file
kwargs.setdefault('ensure_ascii', not self.encoding)
self.encoder = ScrapyJSONEncoder(**kwargs)
self.first_item = True
@ -111,7 +114,8 @@ class JsonItemExporter(BaseItemExporter):
else:
self.file.write(b',\n')
itemdict = dict(self._get_serialized_fields(item))
self.file.write(to_bytes(self.encoder.encode(itemdict)))
data = self.encoder.encode(itemdict)
self.file.write(to_bytes(data, self.encoding))
class XmlItemExporter(BaseItemExporter):
@ -120,6 +124,8 @@ class XmlItemExporter(BaseItemExporter):
self.item_element = kwargs.pop('item_element', 'item')
self.root_element = kwargs.pop('root_element', 'items')
self._configure(kwargs)
if not self.encoding:
self.encoding = 'utf-8'
self.xg = XMLGenerator(file, encoding=self.encoding)
def start_exporting(self):
@ -169,9 +175,16 @@ class CsvItemExporter(BaseItemExporter):
def __init__(self, file, include_headers_line=True, join_multivalued=',', **kwargs):
self._configure(kwargs, dont_fail=True)
if not self.encoding:
self.encoding = 'utf-8'
self.include_headers_line = include_headers_line
file = file if six.PY2 else io.TextIOWrapper(file, line_buffering=True)
self.csv_writer = csv.writer(file, **kwargs)
self.stream = io.TextIOWrapper(
file,
line_buffering=False,
write_through=True,
encoding=self.encoding
) if six.PY3 else file
self.csv_writer = csv.writer(self.stream, **kwargs)
self._headers_not_written = True
self._join_multivalued = join_multivalued
@ -200,7 +213,7 @@ class CsvItemExporter(BaseItemExporter):
def _build_row(self, values):
for s in values:
try:
yield to_native_str(s)
yield to_native_str(s, self.encoding)
except TypeError:
yield s
@ -263,6 +276,8 @@ class PythonItemExporter(BaseItemExporter):
warnings.warn(
"PythonItemExporter will drop support for binary export in the future",
ScrapyDeprecationWarning)
if not self.encoding:
self.encoding = 'utf-8'
def serialize_field(self, field, name, value):
serializer = field.get('serializer', self._serialize_value)

View File

@ -162,6 +162,7 @@ class FeedExporter(object):
if not self.urifmt:
raise NotConfigured
self.format = settings['FEED_FORMAT'].lower()
self.export_encoding = settings['FEED_EXPORT_ENCODING']
self.storages = self._load_components('FEED_STORAGES')
self.exporters = self._load_components('FEED_EXPORTERS')
if not self._storage_supported(self.urifmt):
@ -185,7 +186,8 @@ class FeedExporter(object):
uri = self.urifmt % self._get_uri_params(spider)
storage = self._get_storage(uri)
file = storage.open(spider)
exporter = self._get_exporter(file, fields_to_export=self.export_fields)
exporter = self._get_exporter(file, fields_to_export=self.export_fields,
encoding=self.export_encoding)
exporter.start_exporting()
self.slot = SpiderSlot(file, exporter, storage, uri)

View File

@ -139,6 +139,7 @@ FEED_URI = None
FEED_URI_PARAMS = None # a function to extend uri arguments
FEED_FORMAT = 'jsonlines'
FEED_STORE_EMPTY = False
FEED_EXPORT_ENCODING = None
FEED_EXPORT_FIELDS = None
FEED_STORAGES = {}
FEED_STORAGES_BASE = {

View File

@ -371,3 +371,32 @@ class FeedExportTest(unittest.TestCase):
]
yield self.assertExported(items, ['egg', 'baz'], rows,
settings=settings, ordered=True)
@defer.inlineCallbacks
def test_export_encoding(self):
    """Check the exported bytes of every feed format, with and without
    the FEED_EXPORT_ENCODING setting.

    Without the setting, the JSON formats are expected to escape the
    non-ASCII character as an ASCII-only sequence while XML and CSV are
    expected as UTF-8. With the setting at ``latin-1``, every format is
    expected to carry the raw character encoded as latin-1.
    """
    items = [{'foo': u'Test\xd6'}]

    # Expected output when FEED_EXPORT_ENCODING is left out of the settings.
    expected_by_default = {
        'json': u'[\n{"foo": "Test\\u00d6"}\n]'.encode('utf-8'),
        'jsonlines': u'{"foo": "Test\\u00d6"}\n'.encode('utf-8'),
        'xml': u'<?xml version="1.0" encoding="utf-8"?>\n<items><item><foo>Test\xd6</foo></item></items>'.encode('utf-8'),
        'csv': u'foo\r\nTest\xd6\r\n'.encode('utf-8'),
    }
    for feed_format, expected in expected_by_default.items():
        settings = {'FEED_FORMAT': feed_format}
        data = yield self.exported_data(items, settings)
        self.assertEqual(expected, data)

    # Expected output when an explicit encoding is requested.
    expected_latin1 = {
        'json': u'[\n{"foo": "Test\xd6"}\n]'.encode('latin-1'),
        'jsonlines': u'{"foo": "Test\xd6"}\n'.encode('latin-1'),
        'xml': u'<?xml version="1.0" encoding="latin-1"?>\n<items><item><foo>Test\xd6</foo></item></items>'.encode('latin-1'),
        'csv': u'foo\r\nTest\xd6\r\n'.encode('latin-1'),
    }
    for feed_format, expected in expected_latin1.items():
        settings = {'FEED_FORMAT': feed_format, 'FEED_EXPORT_ENCODING': 'latin-1'}
        data = yield self.exported_data(items, settings)
        self.assertEqual(expected, data)