1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-22 06:52:53 +00:00

Merge branch 'master' into remove-six-code

This commit is contained in:
Adrián Chaves 2019-11-25 10:34:21 +01:00 committed by GitHub
commit 6d9ed6146d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 63 additions and 17 deletions

View File

@ -142,20 +142,6 @@ accept one (and only one) positional argument, which will be an iterable.
containing the collected values (for that field). The result of the output containing the collected values (for that field). The result of the output
processors is the value that will be finally assigned to the item. processors is the value that will be finally assigned to the item.
If you want to use a plain function as a processor, make sure it receives
``self`` as the first argument::
def lowercase_processor(self, values):
for v in values:
yield v.lower()
class MyItemLoader(ItemLoader):
name_in = lowercase_processor
This is because whenever a function is assigned as a class variable, it becomes
a method and would be passed the instance as the first argument when being
called. See `this answer on stackoverflow`_ for more details.
The other thing you need to keep in mind is that the values returned by input The other thing you need to keep in mind is that the values returned by input
processors are collected internally (in lists) and then passed to output processors are collected internally (in lists) and then passed to output
processors to populate the fields. processors to populate the fields.
@ -163,7 +149,7 @@ processors to populate the fields.
Last, but not least, Scrapy comes with some :ref:`commonly used processors Last, but not least, Scrapy comes with some :ref:`commonly used processors
<topics-loaders-available-processors>` built-in for convenience. <topics-loaders-available-processors>` built-in for convenience.
.. _this answer on stackoverflow: https://stackoverflow.com/a/35322635
Declaring Item Loaders Declaring Item Loaders
====================== ======================

View File

@ -4,6 +4,7 @@ Item Loader
See documentation in docs/topics/loaders.rst See documentation in docs/topics/loaders.rst
""" """
from collections import defaultdict from collections import defaultdict
from contextlib import suppress
from scrapy.item import Item from scrapy.item import Item
from scrapy.loader.common import wrap_loader_context from scrapy.loader.common import wrap_loader_context
@ -13,6 +14,17 @@ from scrapy.utils.misc import arg_to_iter, extract_regex
from scrapy.utils.python import flatten from scrapy.utils.python import flatten
def unbound_method(method):
    """
    Return the plain function behind ``method`` when that function was
    defined at module level; otherwise return ``method`` unchanged.

    This lets single-argument functions be used as input or output
    processors without declaring an unused leading ``self`` parameter.

    A ``__qualname__`` without a dot is taken to mean a module-level
    function. Objects lacking ``__qualname__`` or ``__func__`` are returned
    as they are.
    """
    try:
        qualified_name = method.__qualname__
        if '.' in qualified_name:
            # Dotted name: defined inside a class or another function,
            # so leave it untouched.
            return method
        return method.__func__
    except AttributeError:
        # Not a bound method (or no qualified name available).
        return method
class ItemLoader(object): class ItemLoader(object):
default_item_class = Item default_item_class = Item
@ -140,14 +152,14 @@ class ItemLoader(object):
if not proc: if not proc:
proc = self._get_item_field_attr(field_name, 'input_processor', proc = self._get_item_field_attr(field_name, 'input_processor',
self.default_input_processor) self.default_input_processor)
return proc return unbound_method(proc)
def get_output_processor(self, field_name): def get_output_processor(self, field_name):
proc = getattr(self, '%s_out' % field_name, None) proc = getattr(self, '%s_out' % field_name, None)
if not proc: if not proc:
proc = self._get_item_field_attr(field_name, 'output_processor', proc = self._get_item_field_attr(field_name, 'output_processor',
self.default_output_processor) self.default_output_processor)
return proc return unbound_method(proc)
def _process_input_value(self, field_name, value): def _process_input_value(self, field_name, value):
proc = self.get_input_processor(field_name) proc = self.get_input_processor(field_name)

View File

@ -992,5 +992,53 @@ class SelectJmesTestCase(unittest.TestCase):
) )
# Functions as processors
def function_processor_strip(iterable):
    """Return a list with ``strip()`` applied to every element of ``iterable``."""
    stripped_values = [value.strip() for value in iterable]
    return stripped_values
def function_processor_upper(iterable):
    """Return a list with ``upper()`` applied to every element of ``iterable``."""
    uppercased_values = [value.upper() for value in iterable]
    return uppercased_values
class FunctionProcessorItem(Item):
    """Item whose ``foo`` field declares plain functions as its processors."""

    # Both processors are module-level functions taking a single iterable.
    foo = Field(input_processor=function_processor_strip,
                output_processor=function_processor_upper)
class FunctionProcessorItemLoader(ItemLoader):
    """Loader that declares no processors itself and loads ``FunctionProcessorItem``."""

    # The processors are declared on the item's ``foo`` field.
    default_item_class = FunctionProcessorItem
class FunctionProcessorDictLoader(ItemLoader):
    """Loader producing ``dict`` items, with plain functions set as class attributes."""

    default_item_class = dict

    # Plain functions assigned directly as the ``foo`` input/output processors.
    foo_in = function_processor_strip
    foo_out = function_processor_upper
class FunctionProcessorTestCase(unittest.TestCase):
    """Check that plain single-argument functions can act as processors."""

    def test_processor_defined_in_item(self):
        # Processors are declared on the ``foo`` field of the item class.
        loader = FunctionProcessorItemLoader()
        for value in (' bar ', [' asdf ', ' qwerty ']):
            loader.add_value('foo', value)
        loaded = dict(loader.load_item())
        self.assertEqual(loaded, {'foo': ['BAR', 'ASDF', 'QWERTY']})

    def test_processor_defined_in_item_loader(self):
        # Processors are declared as ``foo_in`` / ``foo_out`` on the loader.
        loader = FunctionProcessorDictLoader()
        for value in (' bar ', [' asdf ', ' qwerty ']):
            loader.add_value('foo', value)
        loaded = dict(loader.load_item())
        self.assertEqual(loaded, {'foo': ['BAR', 'ASDF', 'QWERTY']})
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()