diff --git a/docs/topics/feed-exports.rst b/docs/topics/feed-exports.rst index faf9abc1c..d9444e34a 100644 --- a/docs/topics/feed-exports.rst +++ b/docs/topics/feed-exports.rst @@ -236,14 +236,20 @@ The serialization format to be used for the feed. See FEED_EXPORT_FIELDS ------------------ +Default: ``None`` + A list of fields to export, optional. Example: ``FEED_EXPORT_FIELDS = ["foo", "bar", "baz"]``. Use FEED_EXPORT_FIELDS option to define fields to export and their order. -When omitted, Scrapy uses fields defined in :class:`~.Item` subclasses a spider -is yielding. If raw dicts are used as items Scrapy tries to infer field names -from the exported data - currently it uses field names from the first item. +When FEED_EXPORT_FIELDS is empty or None (default), Scrapy uses fields +defined in dicts or :class:`~.Item` subclasses a spider is yielding. + +If an exporter requires a fixed set of fields (this is the case for +:ref:`CSV <topics-feed-format-csv>` export format) and FEED_EXPORT_FIELDS +is empty or None, then Scrapy tries to infer field names from the +exported data - currently it uses field names from the first item. .. 
setting:: FEED_STORE_EMPTY diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index 35ddc0fd1..7560e89d3 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -21,7 +21,6 @@ from scrapy import signals from scrapy.utils.ftp import ftp_makedirs_cwd from scrapy.exceptions import NotConfigured from scrapy.utils.misc import load_object -from scrapy.utils.python import get_func_args from scrapy.utils.log import failure_to_exc_info logger = logging.getLogger(__name__) @@ -152,7 +151,7 @@ class FeedExporter(object): if not self._exporter_supported(self.format): raise NotConfigured self.store_empty = settings.getbool('FEED_STORE_EMPTY') - self.export_fields = settings.getlist('FEED_EXPORT_FIELDS') + self.export_fields = settings.getlist('FEED_EXPORT_FIELDS') or None uripar = settings['FEED_URI_PARAMS'] self._uripar = load_object(uripar) if uripar else lambda x, y: None diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index 41913e401..d6c96ca74 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -1,6 +1,7 @@ from __future__ import absolute_import import os import csv +import json from io import BytesIO import tempfile import shutil @@ -126,7 +127,6 @@ class FeedExportTest(unittest.TestCase): egg = scrapy.Field() baz = scrapy.Field() - @defer.inlineCallbacks def run_and_export(self, spider_cls, settings=None): """ Run spider with specified settings; return exported data. 
""" @@ -180,7 +180,21 @@ class FeedExportTest(unittest.TestCase): self.assertEqual(rows, got_rows) @defer.inlineCallbacks - def test_export_csv_items(self): + def assertExportedJsonLines(self, items, rows, settings=None): + settings = settings or {} + settings.update({'FEED_FORMAT': 'jl'}) + data = yield self.exported_data(items, settings) + parsed = [json.loads(line) for line in data.splitlines()] + rows = [{k: v for k, v in row.items() if v} for row in rows] + self.assertEqual(rows, parsed) + + @defer.inlineCallbacks + def assertExported(self, items, header, rows, settings=None, ordered=True): + yield self.assertExportedCsv(items, header, rows, settings, ordered) + yield self.assertExportedJsonLines(items, rows, settings) + + @defer.inlineCallbacks + def test_export_items(self): # feed exporters use field names from Item items = [ self.MyItem({'foo': 'bar1', 'egg': 'spam1'}), @@ -191,10 +205,10 @@ class FeedExportTest(unittest.TestCase): {'egg': 'spam2', 'foo': 'bar2', 'baz': 'quux2'} ] header = self.MyItem.fields.keys() - yield self.assertExportedCsv(items, header, rows, ordered=False) + yield self.assertExported(items, header, rows, ordered=False) @defer.inlineCallbacks - def test_export_csv_multiple_item_classes(self): + def test_export_multiple_item_classes(self): class MyItem2(scrapy.Item): foo = scrapy.Field() @@ -207,17 +221,25 @@ class FeedExportTest(unittest.TestCase): {'hello': 'world4', 'egg': 'spam4'}, ] - # by default, Scrapy uses fields of the first Item + # by default, Scrapy uses fields of the first Item for CSV and + # all fields for JSON Lines header = self.MyItem.fields.keys() - rows = [ + rows_csv = [ {'egg': 'spam1', 'foo': 'bar1', 'baz': ''}, {'egg': '', 'foo': 'bar2', 'baz': ''}, {'egg': 'spam3', 'foo': 'bar3', 'baz': 'quux3'}, {'egg': 'spam4', 'foo': '', 'baz': ''}, ] - yield self.assertExportedCsv(items, header, rows, ordered=False) + rows_jl = [dict(row) for row in items] + yield self.assertExportedCsv(items, header, rows_csv, 
ordered=False) + yield self.assertExportedJsonLines(items, rows_jl) - # but it is possible to override fields using FEED_EXPORT_FIELDS + # edge case: FEED_EXPORT_FIELDS==[] means the same as default None + settings = {'FEED_EXPORT_FIELDS': []} + yield self.assertExportedCsv(items, header, rows_csv, ordered=False) + yield self.assertExportedJsonLines(items, rows_jl, settings) + + # it is possible to override fields using FEED_EXPORT_FIELDS header = ["foo", "baz", "hello"] settings = {'FEED_EXPORT_FIELDS': header} rows = [ @@ -226,25 +248,27 @@ class FeedExportTest(unittest.TestCase): {'foo': 'bar3', 'baz': 'quux3', 'hello': ''}, {'foo': '', 'baz': '', 'hello': 'world4'}, ] - yield self.assertExportedCsv(items, header, rows, - settings=settings, ordered=True) + yield self.assertExported(items, header, rows, + settings=settings, ordered=True) @defer.inlineCallbacks - def test_export_csv_dicts(self): + def test_export_dicts(self): # When dicts are used, only keys from the first row are used as - # a header. + # a header for CSV, and all fields are used for JSON Lines. items = [ {'foo': 'bar', 'egg': 'spam'}, {'foo': 'bar', 'egg': 'spam', 'baz': 'quux'}, ] - rows = [ + rows_csv = [ {'egg': 'spam', 'foo': 'bar'}, {'egg': 'spam', 'foo': 'bar'} ] - yield self.assertExportedCsv(items, ['egg', 'foo'], rows, ordered=False) + rows_jl = items + yield self.assertExportedCsv(items, ['egg', 'foo'], rows_csv, ordered=False) + yield self.assertExportedJsonLines(items, rows_jl) @defer.inlineCallbacks - def test_export_csv_feed_export_fields(self): + def test_export_feed_export_fields(self): # FEED_EXPORT_FIELDS option allows to order export fields # and to select a subset of fields to export, both for Items and dicts. 
@@ -260,8 +284,8 @@ class FeedExportTest(unittest.TestCase): {'egg': 'spam1', 'foo': 'bar1', 'baz': ''}, {'egg': 'spam2', 'foo': 'bar2', 'baz': 'quux2'} ] - yield self.assertExportedCsv(items, ['foo', 'baz', 'egg'], rows, - settings=settings, ordered=True) + yield self.assertExported(items, ['foo', 'baz', 'egg'], rows, + settings=settings, ordered=True) # export a subset of columns settings = {'FEED_EXPORT_FIELDS': 'egg,baz'} @@ -269,5 +293,5 @@ class FeedExportTest(unittest.TestCase): {'egg': 'spam1', 'baz': ''}, {'egg': 'spam2', 'baz': 'quux2'} ] - yield self.assertExportedCsv(items, ['egg', 'baz'], rows, - settings=settings, ordered=True) + yield self.assertExported(items, ['egg', 'baz'], rows, + settings=settings, ordered=True)