From 40b5cfc0a4adbc51fa35018b09902255228e360c Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Tue, 23 Jul 2019 18:33:19 -0300 Subject: [PATCH] Item loaders: allow single-argument processors (unbound methods) --- docs/topics/loaders.rst | 16 +------------ scrapy/loader/__init__.py | 22 +++++++++++++----- tests/test_loader.py | 48 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 21 deletions(-) diff --git a/docs/topics/loaders.rst b/docs/topics/loaders.rst index 12a5e5c60..81c8dab03 100644 --- a/docs/topics/loaders.rst +++ b/docs/topics/loaders.rst @@ -142,20 +142,6 @@ accept one (and only one) positional argument, which will be an iterable. containing the collected values (for that field). The result of the output processors is the value that will be finally assigned to the item. -If you want to use a plain function as a processor, make sure it receives -``self`` as the first argument:: - - def lowercase_processor(self, values): - for v in values: - yield v.lower() - - class MyItemLoader(ItemLoader): - name_in = lowercase_processor - -This is because whenever a function is assigned as a class variable, it becomes -a method and would be passed the instance as the the first argument when being -called. See `this answer on stackoverflow`_ for more details. - The other thing you need to keep in mind is that the values returned by input processors are collected internally (in lists) and then passed to output processors to populate the fields. @@ -163,7 +149,7 @@ processors to populate the fields. Last, but not least, Scrapy comes with some :ref:`commonly used processors <topics-loaders-available-processors>` built-in for convenience. -.. 
_this answer on stackoverflow: https://stackoverflow.com/a/35322635 + Declaring Item Loaders ====================== diff --git a/scrapy/loader/__init__.py b/scrapy/loader/__init__.py index 60fd6d222..7cf67e29e 100644 --- a/scrapy/loader/__init__.py +++ b/scrapy/loader/__init__.py @@ -4,8 +4,7 @@ Item Loader See documentation in docs/topics/loaders.rst """ from collections import defaultdict - -import six +from contextlib import suppress from scrapy.item import Item from scrapy.loader.common import wrap_loader_context @@ -15,6 +14,17 @@ from scrapy.utils.misc import arg_to_iter, extract_regex from scrapy.utils.python import flatten +def unbound_method(method): + """ + Allow to use single-argument functions as input or output processors + (no need to define an unused first 'self' argument) + """ + with suppress(AttributeError): + if '.' not in method.__qualname__: + return method.__func__ + return method + + class ItemLoader(object): default_item_class = Item @@ -72,7 +82,7 @@ class ItemLoader(object): if value is None: return if not field_name: - for k, v in six.iteritems(value): + for k, v in value.items(): self._add_value(k, v) else: self._add_value(field_name, value) @@ -82,7 +92,7 @@ class ItemLoader(object): if value is None: return if not field_name: - for k, v in six.iteritems(value): + for k, v in value.items(): self._replace_value(k, v) else: self._replace_value(field_name, value) @@ -142,14 +152,14 @@ class ItemLoader(object): if not proc: proc = self._get_item_field_attr(field_name, 'input_processor', self.default_input_processor) - return proc + return unbound_method(proc) def get_output_processor(self, field_name): proc = getattr(self, '%s_out' % field_name, None) if not proc: proc = self._get_item_field_attr(field_name, 'output_processor', self.default_output_processor) - return proc + return unbound_method(proc) def _process_input_value(self, field_name, value): proc = self.get_input_processor(field_name) diff --git a/tests/test_loader.py 
b/tests/test_loader.py index b87602809..6bfc31dbf 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -994,5 +994,53 @@ class SelectJmesTestCase(unittest.TestCase): ) +# Functions as processors + +def function_processor_strip(iterable): + return [x.strip() for x in iterable] + + +def function_processor_upper(iterable): + return [x.upper() for x in iterable] + + +class FunctionProcessorItem(Item): + foo = Field( + input_processor=function_processor_strip, + output_processor=function_processor_upper, + ) + + +class FunctionProcessorItemLoader(ItemLoader): + default_item_class = FunctionProcessorItem + + +class FunctionProcessorDictLoader(ItemLoader): + default_item_class = dict + foo_in = function_processor_strip + foo_out = function_processor_upper + + +class FunctionProcessorTestCase(unittest.TestCase): + + def test_processor_defined_in_item(self): + lo = FunctionProcessorItemLoader() + lo.add_value('foo', ' bar ') + lo.add_value('foo', [' asdf ', ' qwerty ']) + self.assertEqual( + dict(lo.load_item()), + {'foo': ['BAR', 'ASDF', 'QWERTY']} + ) + + def test_processor_defined_in_item_loader(self): + lo = FunctionProcessorDictLoader() + lo.add_value('foo', ' bar ') + lo.add_value('foo', [' asdf ', ' qwerty ']) + self.assertEqual( + dict(lo.load_item()), + {'foo': ['BAR', 'ASDF', 'QWERTY']} + ) + + if __name__ == "__main__": unittest.main()