1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-21 07:32:44 +00:00

Merge branch 'master' into remove-six-code

This commit is contained in:
Adrián Chaves 2019-11-25 10:34:21 +01:00 committed by GitHub
commit 6d9ed6146d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 63 additions and 17 deletions

View File

@ -142,20 +142,6 @@ accept one (and only one) positional argument, which will be an iterable.
containing the collected values (for that field). The result of the output
processors is the value that will be finally assigned to the item.
If you want to use a plain function as a processor, make sure it receives
``self`` as the first argument::
def lowercase_processor(self, values):
for v in values:
yield v.lower()
class MyItemLoader(ItemLoader):
name_in = lowercase_processor
This is because whenever a function is assigned as a class variable, it becomes
a method and would be passed the instance as the first argument when being
called. See `this answer on stackoverflow`_ for more details.
The other thing you need to keep in mind is that the values returned by input
processors are collected internally (in lists) and then passed to output
processors to populate the fields.
@ -163,7 +149,7 @@ processors to populate the fields.
Last, but not least, Scrapy comes with some :ref:`commonly used processors
<topics-loaders-available-processors>` built-in for convenience.
.. _this answer on stackoverflow: https://stackoverflow.com/a/35322635
Declaring Item Loaders
======================

View File

@ -4,6 +4,7 @@ Item Loader
See documentation in docs/topics/loaders.rst
"""
from collections import defaultdict
from contextlib import suppress
from scrapy.item import Item
from scrapy.loader.common import wrap_loader_context
@ -13,6 +14,17 @@ from scrapy.utils.misc import arg_to_iter, extract_regex
from scrapy.utils.python import flatten
def unbound_method(method):
    """
    Return the plain function behind a processor that got bound to a loader.

    This lets single-argument functions be used as input or output
    processors without declaring an unused leading ``self`` parameter.

    If *method* has a ``__qualname__`` without a dot and exposes a
    ``__func__`` attribute, that underlying function is returned. In every
    other case *method* is returned unchanged.
    """
    try:
        qualified_name = method.__qualname__
        # A dot in the qualified name means the callable was defined inside
        # a class or another function; those are left untouched.
        if '.' not in qualified_name:
            underlying = method.__func__
        else:
            underlying = method
    except AttributeError:
        # Either ``__qualname__`` or ``__func__`` is missing: nothing to
        # unwrap, so hand the object back as it came in.
        underlying = method
    return underlying
class ItemLoader(object):
default_item_class = Item
@ -140,14 +152,14 @@ class ItemLoader(object):
if not proc:
proc = self._get_item_field_attr(field_name, 'input_processor',
self.default_input_processor)
return proc
return unbound_method(proc)
def get_output_processor(self, field_name):
    """
    Return the output processor to use for *field_name*.

    The loader attribute ``<field_name>_out`` takes precedence. When it is
    missing or falsy, the ``output_processor`` declared for the item field
    is looked up, falling back to ``self.default_output_processor``.

    The result is passed through ``unbound_method`` so that a plain
    single-argument function can be used as a processor.
    """
    proc = getattr(self, '%s_out' % field_name, None)
    if not proc:
        proc = self._get_item_field_attr(field_name, 'output_processor',
                                         self.default_output_processor)
    # Single exit point: a stale ``return proc`` used to precede this line
    # and made the unbinding step unreachable.
    return unbound_method(proc)
def _process_input_value(self, field_name, value):
proc = self.get_input_processor(field_name)

View File

@ -992,5 +992,53 @@ class SelectJmesTestCase(unittest.TestCase):
)
# Functions as processors
def function_processor_strip(iterable):
    """Return a list holding ``strip()`` of every element of *iterable*."""
    stripped = []
    for value in iterable:
        stripped.append(value.strip())
    return stripped
def function_processor_upper(iterable):
    """Return a list holding ``upper()`` of every element of *iterable*."""
    uppercased = []
    for value in iterable:
        uppercased.append(value.upper())
    return uppercased
class FunctionProcessorItem(Item):
    """Item whose ``foo`` field declares plain functions as its processors."""

    foo = Field(input_processor=function_processor_strip,
                output_processor=function_processor_upper)
class FunctionProcessorItemLoader(ItemLoader):
    """Loader that declares no processors of its own.

    It only points at ``FunctionProcessorItem``, whose ``foo`` field
    carries the function processors.
    """
    default_item_class = FunctionProcessorItem
class FunctionProcessorDictLoader(ItemLoader):
    """Loader that produces ``dict`` items.

    The plain-function processors for ``foo`` are assigned directly as
    class attributes (``foo_in`` / ``foo_out``).
    """
    default_item_class = dict
    # Plain single-argument functions assigned as class attributes.
    foo_in = function_processor_strip
    foo_out = function_processor_upper
class FunctionProcessorTestCase(unittest.TestCase):
    """Check that plain single-argument functions work as processors."""

    def _assert_foo_processed(self, loader):
        # Feed one scalar and one list, then verify every collected value
        # was stripped on input and upper-cased on output.
        loader.add_value('foo', ' bar ')
        loader.add_value('foo', [' asdf ', ' qwerty '])
        loaded = dict(loader.load_item())
        self.assertEqual(loaded, {'foo': ['BAR', 'ASDF', 'QWERTY']})

    def test_processor_defined_in_item(self):
        self._assert_foo_processed(FunctionProcessorItemLoader())

    def test_processor_defined_in_item_loader(self):
        self._assert_foo_processed(FunctionProcessorDictLoader())
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()