mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-22 06:52:53 +00:00
Merge branch 'master' into remove-six-code
This commit is contained in:
commit
6d9ed6146d
@ -142,20 +142,6 @@ accept one (and only one) positional argument, which will be an iterable.
|
|||||||
containing the collected values (for that field). The result of the output
|
containing the collected values (for that field). The result of the output
|
||||||
processors is the value that will be finally assigned to the item.
|
processors is the value that will be finally assigned to the item.
|
||||||
|
|
||||||
If you want to use a plain function as a processor, make sure it receives
|
|
||||||
``self`` as the first argument::
|
|
||||||
|
|
||||||
def lowercase_processor(self, values):
|
|
||||||
for v in values:
|
|
||||||
yield v.lower()
|
|
||||||
|
|
||||||
class MyItemLoader(ItemLoader):
|
|
||||||
name_in = lowercase_processor
|
|
||||||
|
|
||||||
This is because whenever a function is assigned as a class variable, it becomes
|
|
||||||
a method and would be passed the instance as the first argument when being
|
|
||||||
called. See `this answer on stackoverflow`_ for more details.
|
|
||||||
|
|
||||||
The other thing you need to keep in mind is that the values returned by input
|
The other thing you need to keep in mind is that the values returned by input
|
||||||
processors are collected internally (in lists) and then passed to output
|
processors are collected internally (in lists) and then passed to output
|
||||||
processors to populate the fields.
|
processors to populate the fields.
|
||||||
@ -163,7 +149,7 @@ processors to populate the fields.
|
|||||||
Last, but not least, Scrapy comes with some :ref:`commonly used processors
|
Last, but not least, Scrapy comes with some :ref:`commonly used processors
|
||||||
<topics-loaders-available-processors>` built-in for convenience.
|
<topics-loaders-available-processors>` built-in for convenience.
|
||||||
|
|
||||||
.. _this answer on stackoverflow: https://stackoverflow.com/a/35322635
|
|
||||||
|
|
||||||
Declaring Item Loaders
|
Declaring Item Loaders
|
||||||
======================
|
======================
|
||||||
|
@ -4,6 +4,7 @@ Item Loader
|
|||||||
See documentation in docs/topics/loaders.rst
|
See documentation in docs/topics/loaders.rst
|
||||||
"""
|
"""
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
from contextlib import suppress
|
||||||
|
|
||||||
from scrapy.item import Item
|
from scrapy.item import Item
|
||||||
from scrapy.loader.common import wrap_loader_context
|
from scrapy.loader.common import wrap_loader_context
|
||||||
@ -13,6 +14,17 @@ from scrapy.utils.misc import arg_to_iter, extract_regex
|
|||||||
from scrapy.utils.python import flatten
|
from scrapy.utils.python import flatten
|
||||||
|
|
||||||
|
|
||||||
|
def unbound_method(method):
    """
    Allow single-argument functions to be used as input or output processors
    (no need to define an unused first 'self' argument).

    If ``method`` has a ``__qualname__`` that contains no dot and exposes a
    ``__func__`` attribute, the underlying ``__func__`` is returned.
    In every other case ``method`` is returned unchanged.
    """
    # Objects lacking ``__qualname__`` or ``__func__`` raise AttributeError;
    # suppressing it makes them fall through to the plain ``return method``.
    with suppress(AttributeError):
        # A dot in the qualified name indicates a nested definition (for
        # example a method written inside a class body); those are kept as is.
        if '.' not in method.__qualname__:
            return method.__func__
    return method
|
||||||
|
|
||||||
|
|
||||||
class ItemLoader(object):
|
class ItemLoader(object):
|
||||||
|
|
||||||
default_item_class = Item
|
default_item_class = Item
|
||||||
@ -140,14 +152,14 @@ class ItemLoader(object):
|
|||||||
if not proc:
|
if not proc:
|
||||||
proc = self._get_item_field_attr(field_name, 'input_processor',
|
proc = self._get_item_field_attr(field_name, 'input_processor',
|
||||||
self.default_input_processor)
|
self.default_input_processor)
|
||||||
return proc
|
return unbound_method(proc)
|
||||||
|
|
||||||
def get_output_processor(self, field_name):
    """
    Return the output processor to use for ``field_name``.

    The loader attribute ``<field_name>_out`` is used when it is set and
    truthy; otherwise the value comes from
    ``self._get_item_field_attr(field_name, 'output_processor', ...)`` with
    ``self.default_output_processor`` passed as its last argument.
    The result is passed through ``unbound_method`` before being returned.
    """
    proc = getattr(self, '%s_out' % field_name, None)
    if not proc:
        proc = self._get_item_field_attr(field_name, 'output_processor',
                                         self.default_output_processor)
    return unbound_method(proc)
|
||||||
|
|
||||||
def _process_input_value(self, field_name, value):
|
def _process_input_value(self, field_name, value):
|
||||||
proc = self.get_input_processor(field_name)
|
proc = self.get_input_processor(field_name)
|
||||||
|
@ -992,5 +992,53 @@ class SelectJmesTestCase(unittest.TestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Functions as processors
|
||||||
|
|
||||||
|
def function_processor_strip(iterable):
    """Return a list with ``strip()`` applied to every element of ``iterable``."""
    return [x.strip() for x in iterable]
|
||||||
|
|
||||||
|
|
||||||
|
def function_processor_upper(iterable):
    """Return a list with ``upper()`` applied to every element of ``iterable``."""
    return [x.upper() for x in iterable]
|
||||||
|
|
||||||
|
|
||||||
|
class FunctionProcessorItem(Item):
    """Item whose ``foo`` field declares plain functions as its processors."""

    foo = Field(
        input_processor=function_processor_strip,
        output_processor=function_processor_upper,
    )
|
||||||
|
|
||||||
|
|
||||||
|
class FunctionProcessorItemLoader(ItemLoader):
    """Loader whose default item class is ``FunctionProcessorItem``."""

    default_item_class = FunctionProcessorItem
|
||||||
|
|
||||||
|
|
||||||
|
class FunctionProcessorDictLoader(ItemLoader):
    """Loader producing ``dict`` items, with plain functions set as the
    ``foo`` input and output processors directly on the loader class."""

    default_item_class = dict
    foo_in = function_processor_strip
    foo_out = function_processor_upper
|
||||||
|
|
||||||
|
|
||||||
|
class FunctionProcessorTestCase(unittest.TestCase):
    """Check that plain single-argument functions work as processors."""

    def test_processor_defined_in_item(self):
        # Processors come from the ``foo`` field declared on the item class.
        lo = FunctionProcessorItemLoader()
        lo.add_value('foo', ' bar ')
        lo.add_value('foo', [' asdf ', ' qwerty '])
        self.assertEqual(
            dict(lo.load_item()),
            {'foo': ['BAR', 'ASDF', 'QWERTY']}
        )

    def test_processor_defined_in_item_loader(self):
        # Processors come from ``foo_in`` / ``foo_out`` on the loader class.
        lo = FunctionProcessorDictLoader()
        lo.add_value('foo', ' bar ')
        lo.add_value('foo', [' asdf ', ' qwerty '])
        self.assertEqual(
            dict(lo.load_item()),
            {'foo': ['BAR', 'ASDF', 'QWERTY']}
        )
|
||||||
|
|
||||||
|
|
||||||
# Run the test suite when this module is executed as a script.
if __name__ == "__main__":
    unittest.main()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user