mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-24 02:04:22 +00:00
added FEED_EXPORT_ENCODING setting to allow encoding specification
This commit is contained in:
parent
b7925e4220
commit
33a39b368f
@ -207,6 +207,7 @@ These are the settings used for configuring the feed exports:
|
||||
* :setting:`FEED_STORAGES`
|
||||
* :setting:`FEED_EXPORTERS`
|
||||
* :setting:`FEED_STORE_EMPTY`
|
||||
* :setting:`FEED_EXPORT_ENCODING`
|
||||
* :setting:`FEED_EXPORT_FIELDS`
|
||||
|
||||
.. currentmodule:: scrapy.extensions.feedexport
|
||||
@ -231,6 +232,20 @@ FEED_FORMAT
|
||||
The serialization format to be used for the feed. See
|
||||
:ref:`topics-feed-format` for possible values.
|
||||
|
||||
.. setting:: FEED_EXPORT_ENCODING
|
||||
|
||||
FEED_EXPORT_ENCODING
|
||||
--------------------
|
||||
|
||||
Default: ``None``
|
||||
|
||||
The encoding to be used for the feed.
|
||||
|
||||
If unset or set to ``None`` (default), it uses UTF-8 for everything except JSON output,
|
||||
which uses safe numeric encoding (``\uXXXX`` sequences) for historical reasons.
|
||||
|
||||
Use ``utf-8`` if you want UTF-8 for JSON too.
|
||||
|
||||
.. setting:: FEED_EXPORT_FIELDS
|
||||
|
||||
FEED_EXPORT_FIELDS
|
||||
|
@ -33,9 +33,9 @@ class BaseItemExporter(object):
|
||||
If dont_fail is set, it won't raise an exception on unexpected options
|
||||
(useful for using with keyword arguments in subclasses constructors)
|
||||
"""
|
||||
self.encoding = options.pop('encoding', None)
|
||||
self.fields_to_export = options.pop('fields_to_export', None)
|
||||
self.export_empty_fields = options.pop('export_empty_fields', False)
|
||||
self.encoding = options.pop('encoding', 'utf-8')
|
||||
if not dont_fail and options:
|
||||
raise TypeError("Unexpected options: %s" % ', '.join(options.keys()))
|
||||
|
||||
@ -84,11 +84,13 @@ class JsonLinesItemExporter(BaseItemExporter):
|
||||
def __init__(self, file, **kwargs):
|
||||
self._configure(kwargs, dont_fail=True)
|
||||
self.file = file
|
||||
kwargs.setdefault('ensure_ascii', not self.encoding)
|
||||
self.encoder = ScrapyJSONEncoder(**kwargs)
|
||||
|
||||
def export_item(self, item):
|
||||
itemdict = dict(self._get_serialized_fields(item))
|
||||
self.file.write(to_bytes(self.encoder.encode(itemdict) + '\n'))
|
||||
data = self.encoder.encode(itemdict) + '\n'
|
||||
self.file.write(to_bytes(data, self.encoding))
|
||||
|
||||
|
||||
class JsonItemExporter(BaseItemExporter):
|
||||
@ -96,6 +98,7 @@ class JsonItemExporter(BaseItemExporter):
|
||||
def __init__(self, file, **kwargs):
|
||||
self._configure(kwargs, dont_fail=True)
|
||||
self.file = file
|
||||
kwargs.setdefault('ensure_ascii', not self.encoding)
|
||||
self.encoder = ScrapyJSONEncoder(**kwargs)
|
||||
self.first_item = True
|
||||
|
||||
@ -111,7 +114,8 @@ class JsonItemExporter(BaseItemExporter):
|
||||
else:
|
||||
self.file.write(b',\n')
|
||||
itemdict = dict(self._get_serialized_fields(item))
|
||||
self.file.write(to_bytes(self.encoder.encode(itemdict)))
|
||||
data = self.encoder.encode(itemdict)
|
||||
self.file.write(to_bytes(data, self.encoding))
|
||||
|
||||
|
||||
class XmlItemExporter(BaseItemExporter):
|
||||
@ -120,6 +124,8 @@ class XmlItemExporter(BaseItemExporter):
|
||||
self.item_element = kwargs.pop('item_element', 'item')
|
||||
self.root_element = kwargs.pop('root_element', 'items')
|
||||
self._configure(kwargs)
|
||||
if not self.encoding:
|
||||
self.encoding = 'utf-8'
|
||||
self.xg = XMLGenerator(file, encoding=self.encoding)
|
||||
|
||||
def start_exporting(self):
|
||||
@ -169,9 +175,16 @@ class CsvItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, include_headers_line=True, join_multivalued=',', **kwargs):
|
||||
self._configure(kwargs, dont_fail=True)
|
||||
if not self.encoding:
|
||||
self.encoding = 'utf-8'
|
||||
self.include_headers_line = include_headers_line
|
||||
file = file if six.PY2 else io.TextIOWrapper(file, line_buffering=True)
|
||||
self.csv_writer = csv.writer(file, **kwargs)
|
||||
self.stream = io.TextIOWrapper(
|
||||
file,
|
||||
line_buffering=False,
|
||||
write_through=True,
|
||||
encoding=self.encoding
|
||||
) if six.PY3 else file
|
||||
self.csv_writer = csv.writer(self.stream, **kwargs)
|
||||
self._headers_not_written = True
|
||||
self._join_multivalued = join_multivalued
|
||||
|
||||
@ -200,7 +213,7 @@ class CsvItemExporter(BaseItemExporter):
|
||||
def _build_row(self, values):
|
||||
for s in values:
|
||||
try:
|
||||
yield to_native_str(s)
|
||||
yield to_native_str(s, self.encoding)
|
||||
except TypeError:
|
||||
yield s
|
||||
|
||||
@ -263,6 +276,8 @@ class PythonItemExporter(BaseItemExporter):
|
||||
warnings.warn(
|
||||
"PythonItemExporter will drop support for binary export in the future",
|
||||
ScrapyDeprecationWarning)
|
||||
if not self.encoding:
|
||||
self.encoding = 'utf-8'
|
||||
|
||||
def serialize_field(self, field, name, value):
|
||||
serializer = field.get('serializer', self._serialize_value)
|
||||
|
@ -162,6 +162,7 @@ class FeedExporter(object):
|
||||
if not self.urifmt:
|
||||
raise NotConfigured
|
||||
self.format = settings['FEED_FORMAT'].lower()
|
||||
self.export_encoding = settings['FEED_EXPORT_ENCODING']
|
||||
self.storages = self._load_components('FEED_STORAGES')
|
||||
self.exporters = self._load_components('FEED_EXPORTERS')
|
||||
if not self._storage_supported(self.urifmt):
|
||||
@ -185,7 +186,8 @@ class FeedExporter(object):
|
||||
uri = self.urifmt % self._get_uri_params(spider)
|
||||
storage = self._get_storage(uri)
|
||||
file = storage.open(spider)
|
||||
exporter = self._get_exporter(file, fields_to_export=self.export_fields)
|
||||
exporter = self._get_exporter(file, fields_to_export=self.export_fields,
|
||||
encoding=self.export_encoding)
|
||||
exporter.start_exporting()
|
||||
self.slot = SpiderSlot(file, exporter, storage, uri)
|
||||
|
||||
|
@ -139,6 +139,7 @@ FEED_URI = None
|
||||
FEED_URI_PARAMS = None # a function to extend uri arguments
|
||||
FEED_FORMAT = 'jsonlines'
|
||||
FEED_STORE_EMPTY = False
|
||||
FEED_EXPORT_ENCODING = None
|
||||
FEED_EXPORT_FIELDS = None
|
||||
FEED_STORAGES = {}
|
||||
FEED_STORAGES_BASE = {
|
||||
|
@ -371,3 +371,32 @@ class FeedExportTest(unittest.TestCase):
|
||||
]
|
||||
yield self.assertExported(items, ['egg', 'baz'], rows,
|
||||
settings=settings, ordered=True)
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def test_export_encoding(self):
|
||||
items = [dict({'foo': u'Test\xd6'})]
|
||||
header = ['foo']
|
||||
|
||||
formats = {
|
||||
'json': u'[\n{"foo": "Test\\u00d6"}\n]'.encode('utf-8'),
|
||||
'jsonlines': u'{"foo": "Test\\u00d6"}\n'.encode('utf-8'),
|
||||
'xml': u'<?xml version="1.0" encoding="utf-8"?>\n<items><item><foo>Test\xd6</foo></item></items>'.encode('utf-8'),
|
||||
'csv': u'foo\r\nTest\xd6\r\n'.encode('utf-8'),
|
||||
}
|
||||
|
||||
for format in formats:
|
||||
settings = {'FEED_FORMAT': format}
|
||||
data = yield self.exported_data(items, settings)
|
||||
self.assertEqual(formats[format], data)
|
||||
|
||||
formats = {
|
||||
'json': u'[\n{"foo": "Test\xd6"}\n]'.encode('latin-1'),
|
||||
'jsonlines': u'{"foo": "Test\xd6"}\n'.encode('latin-1'),
|
||||
'xml': u'<?xml version="1.0" encoding="latin-1"?>\n<items><item><foo>Test\xd6</foo></item></items>'.encode('latin-1'),
|
||||
'csv': u'foo\r\nTest\xd6\r\n'.encode('latin-1'),
|
||||
}
|
||||
|
||||
for format in formats:
|
||||
settings = {'FEED_FORMAT': format, 'FEED_EXPORT_ENCODING': 'latin-1'}
|
||||
data = yield self.exported_data(items, settings)
|
||||
self.assertEqual(formats[format], data)
|
||||
|
Loading…
x
Reference in New Issue
Block a user