1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-21 07:32:44 +00:00

Item loaders: allow single-argument processors (unbound methods)

This commit is contained in:
Eugenio Lacuesta 2019-07-23 18:33:19 -03:00
parent 16e0636dcf
commit 40b5cfc0a4
No known key found for this signature in database
GPG Key ID: DA3EF2D0913E9810
3 changed files with 65 additions and 21 deletions

View File

@ -142,20 +142,6 @@ accept one (and only one) positional argument, which will be an iterable.
containing the collected values (for that field). The result of the output
processors is the value that will be finally assigned to the item.
If you want to use a plain function as a processor, make sure it receives
``self`` as the first argument::
def lowercase_processor(self, values):
for v in values:
yield v.lower()
class MyItemLoader(ItemLoader):
name_in = lowercase_processor
This is because whenever a function is assigned as a class variable, it becomes
a method and would be passed the instance as the first argument when being
called. See `this answer on stackoverflow`_ for more details.
The other thing you need to keep in mind is that the values returned by input
processors are collected internally (in lists) and then passed to output
processors to populate the fields.
@ -163,7 +149,7 @@ processors to populate the fields.
Last, but not least, Scrapy comes with some :ref:`commonly used processors
<topics-loaders-available-processors>` built-in for convenience.
.. _this answer on stackoverflow: https://stackoverflow.com/a/35322635
Declaring Item Loaders
======================

View File

@ -4,8 +4,7 @@ Item Loader
See documentation in docs/topics/loaders.rst
"""
from collections import defaultdict
import six
from contextlib import suppress
from scrapy.item import Item
from scrapy.loader.common import wrap_loader_context
@ -15,6 +14,17 @@ from scrapy.utils.misc import arg_to_iter, extract_regex
from scrapy.utils.python import flatten
def unbound_method(method):
    """
    Return the plain function behind ``method`` when it wraps one.

    This lets single-argument functions be used as input or output
    processors without declaring an unused leading ``self`` argument.

    If ``method.__qualname__`` contains no dot, ``method.__func__`` is
    returned. In every other case, including when either attribute is
    missing, ``method`` itself is returned unchanged.
    """
    try:
        qualified_name = method.__qualname__
        if '.' not in qualified_name:
            # Raises AttributeError for objects that are not bound methods;
            # those fall through and are returned as they are.
            return method.__func__
    except AttributeError:
        pass
    return method
class ItemLoader(object):
default_item_class = Item
@ -72,7 +82,7 @@ class ItemLoader(object):
if value is None:
return
if not field_name:
for k, v in six.iteritems(value):
for k, v in value.items():
self._add_value(k, v)
else:
self._add_value(field_name, value)
@ -82,7 +92,7 @@ class ItemLoader(object):
if value is None:
return
if not field_name:
for k, v in six.iteritems(value):
for k, v in value.items():
self._replace_value(k, v)
else:
self._replace_value(field_name, value)
@ -142,14 +152,14 @@ class ItemLoader(object):
if not proc:
proc = self._get_item_field_attr(field_name, 'input_processor',
self.default_input_processor)
return proc
return unbound_method(proc)
def get_output_processor(self, field_name):
proc = getattr(self, '%s_out' % field_name, None)
if not proc:
proc = self._get_item_field_attr(field_name, 'output_processor',
self.default_output_processor)
return proc
return unbound_method(proc)
def _process_input_value(self, field_name, value):
proc = self.get_input_processor(field_name)

View File

@ -994,5 +994,53 @@ class SelectJmesTestCase(unittest.TestCase):
)
# Functions as processors
def function_processor_strip(iterable):
    """Return a list with ``strip()`` applied to every element of ``iterable``."""
    stripped = []
    for value in iterable:
        stripped.append(value.strip())
    return stripped
def function_processor_upper(iterable):
    """Return a list with ``upper()`` applied to every element of ``iterable``."""
    uppercased = []
    for value in iterable:
        uppercased.append(value.upper())
    return uppercased
class FunctionProcessorItem(Item):
    """Item whose ``foo`` field declares plain functions as its processors."""

    # Both processors are module-level functions taking a single argument.
    foo = Field(
        input_processor=function_processor_strip,
        output_processor=function_processor_upper,
    )
class FunctionProcessorItemLoader(ItemLoader):
    """Loader that builds ``FunctionProcessorItem`` instances by default."""

    default_item_class = FunctionProcessorItem
class FunctionProcessorDictLoader(ItemLoader):
    """Loader that builds plain dicts and sets the ``foo`` processors itself."""

    default_item_class = dict

    # Plain functions assigned directly as class attributes of the loader.
    foo_in = function_processor_strip
    foo_out = function_processor_upper
class FunctionProcessorTestCase(unittest.TestCase):
    """Check that plain single-argument functions work as processors."""

    def test_processor_defined_in_item(self):
        # Processors are declared in the item's ``Field``.
        loader = FunctionProcessorItemLoader()
        loader.add_value('foo', ' bar ')
        loader.add_value('foo', [' asdf ', ' qwerty '])
        loaded = dict(loader.load_item())
        expected = {'foo': ['BAR', 'ASDF', 'QWERTY']}
        self.assertEqual(loaded, expected)

    def test_processor_defined_in_item_loader(self):
        # Processors are declared as ``foo_in`` / ``foo_out`` on the loader.
        loader = FunctionProcessorDictLoader()
        loader.add_value('foo', ' bar ')
        loader.add_value('foo', [' asdf ', ' qwerty '])
        loaded = dict(loader.load_item())
        expected = {'foo': ['BAR', 'ASDF', 'QWERTY']}
        self.assertEqual(loaded, expected)
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()