mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-24 07:43:48 +00:00
Merge pull request #461 from redapple/selectorloader
Add "unified" SelectorItemLoader (supports .add_css() and .add_xpath())
This commit is contained in:
commit
36c8da2ad6
@ -39,15 +39,15 @@ Here is a typical Item Loader usage in a :ref:`Spider <topics-spiders>`, using
|
|||||||
the :ref:`Product item <topics-items-declaring>` declared in the :ref:`Items
|
the :ref:`Product item <topics-items-declaring>` declared in the :ref:`Items
|
||||||
chapter <topics-items>`::
|
chapter <topics-items>`::
|
||||||
|
|
||||||
from scrapy.contrib.loader import XPathItemLoader
|
from scrapy.contrib.loader import ItemLoader
|
||||||
from myproject.items import Product
|
from myproject.items import Product
|
||||||
|
|
||||||
def parse(self, response):
|
def parse(self, response):
|
||||||
l = XPathItemLoader(item=Product(), response=response)
|
l = ItemLoader(item=Product(), response=response)
|
||||||
l.add_xpath('name', '//div[@class="product_name"]')
|
l.add_xpath('name', '//div[@class="product_name"]')
|
||||||
l.add_xpath('name', '//div[@class="product_title"]')
|
l.add_xpath('name', '//div[@class="product_title"]')
|
||||||
l.add_xpath('price', '//p[@id="price"]')
|
l.add_xpath('price', '//p[@id="price"]')
|
||||||
l.add_xpath('stock', '//p[@id="stock"]')
|
l.add_css('stock', 'p#stock')
|
||||||
l.add_value('last_updated', 'today') # you can also use literal values
|
l.add_value('last_updated', 'today') # you can also use literal values
|
||||||
return l.load_item()
|
return l.load_item()
|
||||||
|
|
||||||
@ -58,17 +58,18 @@ extracted from two different XPath locations in the page:
|
|||||||
2. ``//div[@class="product_title"]``
|
2. ``//div[@class="product_title"]``
|
||||||
|
|
||||||
In other words, data is being collected by extracting it from two XPath
|
In other words, data is being collected by extracting it from two XPath
|
||||||
locations, using the :meth:`~XPathItemLoader.add_xpath` method. This is the
|
locations, using the :meth:`~ItemLoader.add_xpath` method. This is the
|
||||||
data that will be assigned to the ``name`` field later.
|
data that will be assigned to the ``name`` field later.
|
||||||
|
|
||||||
Afterwards, similar calls are used for ``price`` and ``stock`` fields, and
|
Afterwards, similar calls are used for ``price`` and ``stock`` fields
|
||||||
finally the ``last_updated`` field is populated directly with a literal value
|
(the latter using a CSS selector with the :meth:`~ItemLoader.add_css` method),
|
||||||
|
and finally the ``last_updated`` field is populated directly with a literal value
|
||||||
(``today``) using a different method: :meth:`~ItemLoader.add_value`.
|
(``today``) using a different method: :meth:`~ItemLoader.add_value`.
|
||||||
|
|
||||||
Finally, when all data is collected, the :meth:`ItemLoader.load_item` method is
|
Finally, when all data is collected, the :meth:`ItemLoader.load_item` method is
|
||||||
called which actually populates and returns the item populated with the data
|
called which actually populates and returns the item populated with the data
|
||||||
previously extracted and collected with the :meth:`~XPathItemLoader.add_xpath`
|
previously extracted and collected with the :meth:`~ItemLoader.add_xpath`,
|
||||||
and :meth:`~ItemLoader.add_value` calls.
|
:meth:`~ItemLoader.add_css`, and :meth:`~ItemLoader.add_value` calls.
|
||||||
|
|
||||||
.. _topics-loaders-processors:
|
.. _topics-loaders-processors:
|
||||||
|
|
||||||
@ -77,7 +78,7 @@ Input and Output processors
|
|||||||
|
|
||||||
An Item Loader contains one input processor and one output processor for each
|
An Item Loader contains one input processor and one output processor for each
|
||||||
(item) field. The input processor processes the extracted data as soon as it's
|
(item) field. The input processor processes the extracted data as soon as it's
|
||||||
received (through the :meth:`~XPathItemLoader.add_xpath` or
|
received (through the :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css` or
|
||||||
:meth:`~ItemLoader.add_value` methods) and the result of the input processor is
|
:meth:`~ItemLoader.add_value` methods) and the result of the input processor is
|
||||||
collected and kept inside the ItemLoader. After collecting all data, the
|
collected and kept inside the ItemLoader. After collecting all data, the
|
||||||
:meth:`ItemLoader.load_item` method is called to populate and get the populated
|
:meth:`ItemLoader.load_item` method is called to populate and get the populated
|
||||||
@ -89,11 +90,12 @@ assigned to the item.
|
|||||||
Let's see an example to illustrate how the input and output processors are
|
Let's see an example to illustrate how the input and output processors are
|
||||||
called for a particular field (the same applies for any other field)::
|
called for a particular field (the same applies for any other field)::
|
||||||
|
|
||||||
l = XPathItemLoader(Product(), some_xpath_selector)
|
l = ItemLoader(Product(), some_selector)
|
||||||
l.add_xpath('name', xpath1) # (1)
|
l.add_xpath('name', xpath1) # (1)
|
||||||
l.add_xpath('name', xpath2) # (2)
|
l.add_xpath('name', xpath2) # (2)
|
||||||
l.add_value('name', 'test') # (3)
|
l.add_css('name', css) # (3)
|
||||||
return l.load_item() # (4)
|
l.add_value('name', 'test') # (4)
|
||||||
|
return l.load_item() # (5)
|
||||||
|
|
||||||
So what happens is:
|
So what happens is:
|
||||||
|
|
||||||
@ -105,16 +107,23 @@ So what happens is:
|
|||||||
processor* used in (1). The result of the input processor is appended to the
|
processor* used in (1). The result of the input processor is appended to the
|
||||||
data collected in (1) (if any).
|
data collected in (1) (if any).
|
||||||
|
|
||||||
3. This case is similar to the previous ones, except that the value to be
|
3. This case is similar to the previous ones, except that the data is extracted
|
||||||
collected is assigned directly, instead of being extracted from an XPath.
|
from the ``css`` CSS selector, and passed through the same *input
|
||||||
|
processor* used in (1) and (2). The result of the input processor is appended to the
|
||||||
|
data collected in (1) and (2) (if any).
|
||||||
|
|
||||||
|
4. This case is also similar to the previous ones, except that the value to be
|
||||||
|
collected is assigned directly, instead of being extracted from an XPath
|
||||||
|
expression or a CSS selector.
|
||||||
However, the value is still passed through the input processors. In this
|
However, the value is still passed through the input processors. In this
|
||||||
case, since the value is not iterable it is converted to an iterable of a
|
case, since the value is not iterable it is converted to an iterable of a
|
||||||
single element before passing it to the input processor, because input
|
single element before passing it to the input processor, because input
|
||||||
processors always receive iterables.
|
processors always receive iterables.
|
||||||
|
|
||||||
4. The data collected in (1) and (2) is passed through the *output processor* of
|
5. The data collected in steps (1), (2), (3) and (4) is passed through
|
||||||
the ``name`` field. The result of the output processor is the value assigned to
|
the *output processor* of the ``name`` field.
|
||||||
the ``name`` field in the item.
|
The result of the output processor is the value assigned to the ``name``
|
||||||
|
field in the item.
|
||||||
|
|
||||||
It's worth noticing that processors are just callable objects, which are called
|
It's worth noticing that processors are just callable objects, which are called
|
||||||
with the data to be parsed, and return a parsed value. So you can use any
|
with the data to be parsed, and return a parsed value. So you can use any
|
||||||
@ -246,14 +255,35 @@ There are several ways to modify Item Loader context values:
|
|||||||
ItemLoader objects
|
ItemLoader objects
|
||||||
==================
|
==================
|
||||||
|
|
||||||
.. class:: ItemLoader([item], \**kwargs)
|
.. class:: ItemLoader([item, selector, response], \**kwargs)
|
||||||
|
|
||||||
Return a new Item Loader for populating the given Item. If no item is
|
Return a new Item Loader for populating the given Item. If no item is
|
||||||
given, one is instantiated automatically using the class in
|
given, one is instantiated automatically using the class in
|
||||||
:attr:`default_item_class`.
|
:attr:`default_item_class`.
|
||||||
|
|
||||||
The item and the remaining keyword arguments are assigned to the Loader
|
When instantiated with a ``selector`` or a ``response`` parameter,
|
||||||
context (accessible through the :attr:`context` attribute).
|
the :class:`ItemLoader` class provides convenient mechanisms for extracting
|
||||||
|
data from web pages using :ref:`selectors <topics-selectors>`.
|
||||||
|
|
||||||
|
:param item: The item instance to populate using subsequent calls to
|
||||||
|
:meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`,
|
||||||
|
or :meth:`~ItemLoader.add_value`.
|
||||||
|
:type item: :class:`~scrapy.item.Item` object
|
||||||
|
|
||||||
|
:param selector: The selector to extract data from, when using the
|
||||||
|
:meth:`add_xpath` (resp. :meth:`add_css`) or :meth:`replace_xpath`
|
||||||
|
(resp. :meth:`replace_css`) method.
|
||||||
|
:type selector: :class:`~scrapy.selector.Selector` object
|
||||||
|
|
||||||
|
:param response: The response used to construct the selector using the
|
||||||
|
:attr:`default_selector_class`, unless the selector argument is given,
|
||||||
|
in which case this argument is ignored.
|
||||||
|
:type response: :class:`~scrapy.http.Response` object
|
||||||
|
|
||||||
|
The item, selector, response and the remaining keyword arguments are
|
||||||
|
assigned to the Loader context (accessible through the :attr:`context` attribute).
|
||||||
|
|
||||||
|
:class:`ItemLoader` instances have the following methods:
|
||||||
|
|
||||||
.. method:: get_value(value, \*processors, \**kwargs)
|
.. method:: get_value(value, \*processors, \**kwargs)
|
||||||
|
|
||||||
@ -299,6 +329,91 @@ ItemLoader objects
|
|||||||
|
|
||||||
Similar to :meth:`add_value` but replaces the collected data with the
|
Similar to :meth:`add_value` but replaces the collected data with the
|
||||||
new value instead of adding it.
|
new value instead of adding it.
|
||||||
|
.. method:: get_xpath(xpath, \*processors, \**kwargs)
|
||||||
|
|
||||||
|
Similar to :meth:`ItemLoader.get_value` but receives an XPath instead of a
|
||||||
|
value, which is used to extract a list of unicode strings from the
|
||||||
|
selector associated with this :class:`ItemLoader`.
|
||||||
|
|
||||||
|
:param xpath: the XPath to extract data from
|
||||||
|
:type xpath: str
|
||||||
|
|
||||||
|
:param re: a regular expression to use for extracting data from the
|
||||||
|
selected XPath region
|
||||||
|
:type re: str or compiled regex
|
||||||
|
|
||||||
|
Examples::
|
||||||
|
|
||||||
|
# HTML snippet: <p class="product-name">Color TV</p>
|
||||||
|
loader.get_xpath('//p[@class="product-name"]')
|
||||||
|
# HTML snippet: <p id="price">the price is $1200</p>
|
||||||
|
loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)')
|
||||||
|
|
||||||
|
.. method:: add_xpath(field_name, xpath, \*processors, \**kwargs)
|
||||||
|
|
||||||
|
Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
|
||||||
|
value, which is used to extract a list of unicode strings from the
|
||||||
|
selector associated with this :class:`ItemLoader`.
|
||||||
|
|
||||||
|
See :meth:`get_xpath` for ``kwargs``.
|
||||||
|
|
||||||
|
:param xpath: the XPath to extract data from
|
||||||
|
:type xpath: str
|
||||||
|
|
||||||
|
Examples::
|
||||||
|
|
||||||
|
# HTML snippet: <p class="product-name">Color TV</p>
|
||||||
|
loader.add_xpath('name', '//p[@class="product-name"]')
|
||||||
|
# HTML snippet: <p id="price">the price is $1200</p>
|
||||||
|
loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')
|
||||||
|
|
||||||
|
.. method:: replace_xpath(field_name, xpath, \*processors, \**kwargs)
|
||||||
|
|
||||||
|
Similar to :meth:`add_xpath` but replaces collected data instead of
|
||||||
|
adding it.
|
||||||
|
|
||||||
|
.. method:: get_css(css, \*processors, \**kwargs)
|
||||||
|
|
||||||
|
Similar to :meth:`ItemLoader.get_value` but receives a CSS selector
|
||||||
|
instead of a value, which is used to extract a list of unicode strings
|
||||||
|
from the selector associated with this :class:`ItemLoader`.
|
||||||
|
|
||||||
|
:param css: the CSS selector to extract data from
|
||||||
|
:type css: str
|
||||||
|
|
||||||
|
:param re: a regular expression to use for extracting data from the
|
||||||
|
selected CSS region
|
||||||
|
:type re: str or compiled regex
|
||||||
|
|
||||||
|
Examples::
|
||||||
|
|
||||||
|
# HTML snippet: <p class="product-name">Color TV</p>
|
||||||
|
loader.get_css('p.product-name')
|
||||||
|
# HTML snippet: <p id="price">the price is $1200</p>
|
||||||
|
loader.get_css('p#price', TakeFirst(), re='the price is (.*)')
|
||||||
|
|
||||||
|
.. method:: add_css(field_name, css, \*processors, \**kwargs)
|
||||||
|
|
||||||
|
Similar to :meth:`ItemLoader.add_value` but receives a CSS selector
|
||||||
|
instead of a value, which is used to extract a list of unicode strings
|
||||||
|
from the selector associated with this :class:`ItemLoader`.
|
||||||
|
|
||||||
|
See :meth:`get_css` for ``kwargs``.
|
||||||
|
|
||||||
|
:param css: the CSS selector to extract data from
|
||||||
|
:type css: str
|
||||||
|
|
||||||
|
Examples::
|
||||||
|
|
||||||
|
# HTML snippet: <p class="product-name">Color TV</p>
|
||||||
|
loader.add_css('name', 'p.product-name')
|
||||||
|
# HTML snippet: <p id="price">the price is $1200</p>
|
||||||
|
loader.add_css('price', 'p#price', re='the price is (.*)')
|
||||||
|
|
||||||
|
.. method:: replace_css(field_name, css, \*processors, \**kwargs)
|
||||||
|
|
||||||
|
Similar to :meth:`add_css` but replaces collected data instead of
|
||||||
|
adding it.
|
||||||
|
|
||||||
.. method:: load_item()
|
.. method:: load_item()
|
||||||
|
|
||||||
@ -324,6 +439,8 @@ ItemLoader objects
|
|||||||
|
|
||||||
Return the output processor for the given field.
|
Return the output processor for the given field.
|
||||||
|
|
||||||
|
:class:`ItemLoader` instances have the following attributes:
|
||||||
|
|
||||||
.. attribute:: item
|
.. attribute:: item
|
||||||
|
|
||||||
The :class:`~scrapy.item.Item` object being parsed by this Item Loader.
|
The :class:`~scrapy.item.Item` object being parsed by this Item Loader.
|
||||||
@ -348,71 +465,10 @@ ItemLoader objects
|
|||||||
The default output processor to use for those fields which don't specify
|
The default output processor to use for those fields which don't specify
|
||||||
one.
|
one.
|
||||||
|
|
||||||
.. class:: XPathItemLoader([item, selector, response], \**kwargs)
|
|
||||||
|
|
||||||
The :class:`XPathItemLoader` class extends the :class:`ItemLoader` class
|
|
||||||
providing more convenient mechanisms for extracting data from web pages
|
|
||||||
using :ref:`selectors <topics-selectors>`.
|
|
||||||
|
|
||||||
:class:`XPathItemLoader` objects accept two more additional parameters in
|
|
||||||
their constructors:
|
|
||||||
|
|
||||||
:param selector: The selector to extract data from, when using the
|
|
||||||
:meth:`add_xpath` or :meth:`replace_xpath` method.
|
|
||||||
:type selector: :class:`~scrapy.selector.Selector` object
|
|
||||||
|
|
||||||
:param response: The response used to construct the selector using the
|
|
||||||
:attr:`default_selector_class`, unless the selector argument is given,
|
|
||||||
in which case this argument is ignored.
|
|
||||||
:type response: :class:`~scrapy.http.Response` object
|
|
||||||
|
|
||||||
.. method:: get_xpath(xpath, \*processors, \**kwargs)
|
|
||||||
|
|
||||||
Similar to :meth:`ItemLoader.get_value` but receives an XPath instead of a
|
|
||||||
value, which is used to extract a list of unicode strings from the
|
|
||||||
selector associated with this :class:`XPathItemLoader`.
|
|
||||||
|
|
||||||
:param xpath: the XPath to extract data from
|
|
||||||
:type xpath: str
|
|
||||||
|
|
||||||
:param re: a regular expression to use for extracting data from the
|
|
||||||
selected XPath region
|
|
||||||
:type re: str or compiled regex
|
|
||||||
|
|
||||||
Examples::
|
|
||||||
|
|
||||||
# HTML snippet: <p class="product-name">Color TV</p>
|
|
||||||
loader.get_xpath('//p[@class="product-name"]')
|
|
||||||
# HTML snippet: <p id="price">the price is $1200</p>
|
|
||||||
loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)')
|
|
||||||
|
|
||||||
.. method:: add_xpath(field_name, xpath, \*processors, \**kwargs)
|
|
||||||
|
|
||||||
Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
|
|
||||||
value, which is used to extract a list of unicode strings from the
|
|
||||||
selector associated with this :class:`XPathItemLoader`.
|
|
||||||
|
|
||||||
See :meth:`get_xpath` for ``kwargs``.
|
|
||||||
|
|
||||||
:param xpath: the XPath to extract data from
|
|
||||||
:type xpath: str
|
|
||||||
|
|
||||||
Examples::
|
|
||||||
|
|
||||||
# HTML snippet: <p class="product-name">Color TV</p>
|
|
||||||
loader.add_xpath('name', '//p[@class="product-name"]')
|
|
||||||
# HTML snippet: <p id="price">the price is $1200</p>
|
|
||||||
loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')
|
|
||||||
|
|
||||||
.. method:: replace_xpath(field_name, xpath, \*processors, \**kwargs)
|
|
||||||
|
|
||||||
Similar to :meth:`add_xpath` but replaces collected data instead of
|
|
||||||
adding it.
|
|
||||||
|
|
||||||
.. attribute:: default_selector_class
|
.. attribute:: default_selector_class
|
||||||
|
|
||||||
The class used to construct the :attr:`selector` of this
|
The class used to construct the :attr:`selector` of this
|
||||||
:class:`XPathItemLoader`, if only a response is given in the constructor.
|
:class:`ItemLoader`, if only a response is given in the constructor.
|
||||||
If a selector is given in the constructor this attribute is ignored.
|
If a selector is given in the constructor this attribute is ignored.
|
||||||
This attribute is sometimes overridden in subclasses.
|
This attribute is sometimes overridden in subclasses.
|
||||||
|
|
||||||
|
@ -11,16 +11,23 @@ from scrapy.item import Item
|
|||||||
from scrapy.selector import Selector
|
from scrapy.selector import Selector
|
||||||
from scrapy.utils.misc import arg_to_iter, extract_regex
|
from scrapy.utils.misc import arg_to_iter, extract_regex
|
||||||
from scrapy.utils.python import flatten
|
from scrapy.utils.python import flatten
|
||||||
|
from scrapy.utils.decorator import deprecated
|
||||||
from .common import wrap_loader_context
|
from .common import wrap_loader_context
|
||||||
from .processor import Identity
|
from .processor import Identity
|
||||||
|
|
||||||
|
|
||||||
class ItemLoader(object):
|
class ItemLoader(object):
|
||||||
|
|
||||||
default_item_class = Item
|
default_item_class = Item
|
||||||
default_input_processor = Identity()
|
default_input_processor = Identity()
|
||||||
default_output_processor = Identity()
|
default_output_processor = Identity()
|
||||||
|
default_selector_class = Selector
|
||||||
|
|
||||||
def __init__(self, item=None, **context):
|
def __init__(self, item=None, selector=None, response=None, **context):
|
||||||
|
if selector is None and response is not None:
|
||||||
|
selector = self.default_selector_class(response)
|
||||||
|
self.selector = selector
|
||||||
|
context.update(selector=selector, response=response)
|
||||||
if item is None:
|
if item is None:
|
||||||
item = self.default_item_class()
|
item = self.default_item_class()
|
||||||
self.item = context['item'] = item
|
self.item = context['item'] = item
|
||||||
@ -114,32 +121,56 @@ class ItemLoader(object):
|
|||||||
value = default
|
value = default
|
||||||
return value
|
return value
|
||||||
|
|
||||||
class XPathItemLoader(ItemLoader):
|
def _check_selector_method(self):
|
||||||
|
if self.selector is None:
|
||||||
default_selector_class = Selector
|
raise RuntimeError("To use XPath or CSS selectors, "
|
||||||
|
"%s must be instantiated with a selector "
|
||||||
def __init__(self, item=None, selector=None, response=None, **context):
|
"or a response" % self.__class__.__name__)
|
||||||
if selector is None and response is None:
|
|
||||||
raise RuntimeError("%s must be instantiated with a selector " \
|
|
||||||
"or response" % self.__class__.__name__)
|
|
||||||
if selector is None:
|
|
||||||
selector = self.default_selector_class(response)
|
|
||||||
self.selector = selector
|
|
||||||
context.update(selector=selector, response=response)
|
|
||||||
super(XPathItemLoader, self).__init__(item, **context)
|
|
||||||
|
|
||||||
def add_xpath(self, field_name, xpath, *processors, **kw):
|
def add_xpath(self, field_name, xpath, *processors, **kw):
|
||||||
values = self._get_values(xpath, **kw)
|
values = self._get_xpathvalues(xpath, **kw)
|
||||||
self.add_value(field_name, values, *processors, **kw)
|
self.add_value(field_name, values, *processors, **kw)
|
||||||
|
|
||||||
def replace_xpath(self, field_name, xpath, *processors, **kw):
|
def replace_xpath(self, field_name, xpath, *processors, **kw):
|
||||||
values = self._get_values(xpath, **kw)
|
values = self._get_xpathvalues(xpath, **kw)
|
||||||
self.replace_value(field_name, values, *processors, **kw)
|
self.replace_value(field_name, values, *processors, **kw)
|
||||||
|
|
||||||
def get_xpath(self, xpath, *processors, **kw):
|
def get_xpath(self, xpath, *processors, **kw):
|
||||||
values = self._get_values(xpath, **kw)
|
values = self._get_xpathvalues(xpath, **kw)
|
||||||
return self.get_value(values, *processors, **kw)
|
return self.get_value(values, *processors, **kw)
|
||||||
|
|
||||||
|
@deprecated(use_instead='._get_xpathvalues()')
|
||||||
def _get_values(self, xpaths, **kw):
|
def _get_values(self, xpaths, **kw):
|
||||||
|
return self._get_xpathvalues(xpaths, **kw)
|
||||||
|
|
||||||
|
def _get_xpathvalues(self, xpaths, **kw):
|
||||||
|
self._check_selector_method()
|
||||||
xpaths = arg_to_iter(xpaths)
|
xpaths = arg_to_iter(xpaths)
|
||||||
return flatten([self.selector.xpath(xpath).extract() for xpath in xpaths])
|
return flatten([self.selector.xpath(xpath).extract() for xpath in xpaths])
|
||||||
|
|
||||||
|
def add_css(self, field_name, css, *processors, **kw):
|
||||||
|
values = self._get_cssvalues(css, **kw)
|
||||||
|
self.add_value(field_name, values, *processors, **kw)
|
||||||
|
|
||||||
|
def replace_css(self, field_name, css, *processors, **kw):
|
||||||
|
values = self._get_cssvalues(css, **kw)
|
||||||
|
self.replace_value(field_name, values, *processors, **kw)
|
||||||
|
|
||||||
|
def get_css(self, css, *processors, **kw):
|
||||||
|
values = self._get_cssvalues(css, **kw)
|
||||||
|
return self.get_value(values, *processors, **kw)
|
||||||
|
|
||||||
|
def _get_cssvalues(self, csss, **kw):
|
||||||
|
self._check_selector_method()
|
||||||
|
csss = arg_to_iter(csss)
|
||||||
|
return flatten([self.selector.css(css).extract() for css in csss])
|
||||||
|
|
||||||
|
|
||||||
|
class XPathItemLoader(ItemLoader):
|
||||||
|
def __init__(self, *a, **kw):
|
||||||
|
import warnings
|
||||||
|
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||||
|
warnings.warn('%s is deprecated, instanciate scrapy.contrib.loader.ItemLoader '
|
||||||
|
'instead' % type(self).__name__,
|
||||||
|
category=ScrapyDeprecationWarning, stacklevel=1)
|
||||||
|
super(XPathItemLoader, self).__init__(*a, **kw)
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from scrapy.contrib.loader import ItemLoader, XPathItemLoader
|
from scrapy.contrib.loader import ItemLoader
|
||||||
from scrapy.contrib.loader.processor import Join, Identity, TakeFirst, \
|
from scrapy.contrib.loader.processor import Join, Identity, TakeFirst, \
|
||||||
Compose, MapCompose
|
Compose, MapCompose
|
||||||
from scrapy.item import Item, Field
|
from scrapy.item import Item, Field
|
||||||
@ -38,7 +38,7 @@ def processor_with_args(value, other=None, loader_context=None):
|
|||||||
return value
|
return value
|
||||||
|
|
||||||
|
|
||||||
class ItemLoaderTest(unittest.TestCase):
|
class BasicItemLoaderTest(unittest.TestCase):
|
||||||
|
|
||||||
def test_load_item_using_default_loader(self):
|
def test_load_item_using_default_loader(self):
|
||||||
i = TestItem()
|
i = TestItem()
|
||||||
@ -367,37 +367,78 @@ class ProcessorsTest(unittest.TestCase):
|
|||||||
[u'HELLO', u'THIS', u'IS', u'SCRAPY'])
|
[u'HELLO', u'THIS', u'IS', u'SCRAPY'])
|
||||||
|
|
||||||
|
|
||||||
class TestXPathItemLoader(XPathItemLoader):
|
class SelectortemLoaderTest(unittest.TestCase):
|
||||||
default_item_class = TestItem
|
response = HtmlResponse(url="", body="""
|
||||||
name_in = MapCompose(lambda v: v.title())
|
<html>
|
||||||
|
<body>
|
||||||
|
<div id="id">marta</div>
|
||||||
|
<p>paragraph</p>
|
||||||
|
<a href="http://www.scrapy.org">homepage</a>
|
||||||
|
<img src="/images/logo.png" width="244" height="65" alt="Scrapy">
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
""")
|
||||||
|
|
||||||
|
def test_constructor(self):
|
||||||
class XPathItemLoaderTest(unittest.TestCase):
|
l = TestItemLoader()
|
||||||
response = HtmlResponse(url="", body='<html><body><div id="id">marta</div><p>paragraph</p></body></html>')
|
self.assertEqual(l.selector, None)
|
||||||
|
|
||||||
def test_constructor_errors(self):
|
def test_constructor_errors(self):
|
||||||
self.assertRaises(RuntimeError, XPathItemLoader)
|
l = TestItemLoader()
|
||||||
|
self.assertRaises(RuntimeError, l.add_xpath, 'url', '//a/@href')
|
||||||
|
self.assertRaises(RuntimeError, l.replace_xpath, 'url', '//a/@href')
|
||||||
|
self.assertRaises(RuntimeError, l.get_xpath, '//a/@href')
|
||||||
|
self.assertRaises(RuntimeError, l.add_css, 'name', '#name::text')
|
||||||
|
self.assertRaises(RuntimeError, l.replace_css, 'name', '#name::text')
|
||||||
|
self.assertRaises(RuntimeError, l.get_css, '#name::text')
|
||||||
|
|
||||||
def test_constructor_with_selector(self):
|
def test_constructor_with_selector(self):
|
||||||
sel = Selector(text=u"<html><body><div>marta</div></body></html>")
|
sel = Selector(text=u"<html><body><div>marta</div></body></html>")
|
||||||
l = TestXPathItemLoader(selector=sel)
|
l = TestItemLoader(selector=sel)
|
||||||
self.assert_(l.selector is sel)
|
self.assert_(l.selector is sel)
|
||||||
|
|
||||||
l.add_xpath('name', '//div/text()')
|
l.add_xpath('name', '//div/text()')
|
||||||
self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
||||||
|
|
||||||
|
def test_constructor_with_selector_css(self):
|
||||||
|
sel = Selector(text=u"<html><body><div>marta</div></body></html>")
|
||||||
|
l = TestItemLoader(selector=sel)
|
||||||
|
self.assert_(l.selector is sel)
|
||||||
|
|
||||||
|
l.add_css('name', 'div::text')
|
||||||
|
self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
||||||
|
|
||||||
def test_constructor_with_response(self):
|
def test_constructor_with_response(self):
|
||||||
l = TestXPathItemLoader(response=self.response)
|
l = TestItemLoader(response=self.response)
|
||||||
self.assert_(l.selector)
|
self.assert_(l.selector)
|
||||||
|
|
||||||
l.add_xpath('name', '//div/text()')
|
l.add_xpath('name', '//div/text()')
|
||||||
self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
||||||
|
|
||||||
|
def test_constructor_with_response_css(self):
|
||||||
|
l = TestItemLoader(response=self.response)
|
||||||
|
self.assert_(l.selector)
|
||||||
|
|
||||||
|
l.add_css('name', 'div::text')
|
||||||
|
self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
||||||
|
|
||||||
|
l.add_css('url', 'a::attr(href)')
|
||||||
|
self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org'])
|
||||||
|
|
||||||
|
# combining/accumulating CSS selectors and XPath expressions
|
||||||
|
l.add_xpath('name', '//div/text()')
|
||||||
|
self.assertEqual(l.get_output_value('name'), [u'Marta', u'Marta'])
|
||||||
|
|
||||||
|
l.add_xpath('url', '//img/@src')
|
||||||
|
self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org', u'/images/logo.png'])
|
||||||
|
|
||||||
def test_add_xpath_re(self):
|
def test_add_xpath_re(self):
|
||||||
l = TestXPathItemLoader(response=self.response)
|
l = TestItemLoader(response=self.response)
|
||||||
l.add_xpath('name', '//div/text()', re='ma')
|
l.add_xpath('name', '//div/text()', re='ma')
|
||||||
self.assertEqual(l.get_output_value('name'), [u'Ma'])
|
self.assertEqual(l.get_output_value('name'), [u'Ma'])
|
||||||
|
|
||||||
def test_replace_xpath(self):
|
def test_replace_xpath(self):
|
||||||
l = TestXPathItemLoader(response=self.response)
|
l = TestItemLoader(response=self.response)
|
||||||
self.assert_(l.selector)
|
self.assert_(l.selector)
|
||||||
l.add_xpath('name', '//div/text()')
|
l.add_xpath('name', '//div/text()')
|
||||||
self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
||||||
@ -408,7 +449,7 @@ class XPathItemLoaderTest(unittest.TestCase):
|
|||||||
self.assertEqual(l.get_output_value('name'), [u'Paragraph', 'Marta'])
|
self.assertEqual(l.get_output_value('name'), [u'Paragraph', 'Marta'])
|
||||||
|
|
||||||
def test_get_xpath(self):
|
def test_get_xpath(self):
|
||||||
l = TestXPathItemLoader(response=self.response)
|
l = TestItemLoader(response=self.response)
|
||||||
self.assertEqual(l.get_xpath('//p/text()'), [u'paragraph'])
|
self.assertEqual(l.get_xpath('//p/text()'), [u'paragraph'])
|
||||||
self.assertEqual(l.get_xpath('//p/text()', TakeFirst()), u'paragraph')
|
self.assertEqual(l.get_xpath('//p/text()', TakeFirst()), u'paragraph')
|
||||||
self.assertEqual(l.get_xpath('//p/text()', TakeFirst(), re='pa'), u'pa')
|
self.assertEqual(l.get_xpath('//p/text()', TakeFirst(), re='pa'), u'pa')
|
||||||
@ -416,20 +457,74 @@ class XPathItemLoaderTest(unittest.TestCase):
|
|||||||
self.assertEqual(l.get_xpath(['//p/text()', '//div/text()']), [u'paragraph', 'marta'])
|
self.assertEqual(l.get_xpath(['//p/text()', '//div/text()']), [u'paragraph', 'marta'])
|
||||||
|
|
||||||
def test_replace_xpath_multi_fields(self):
|
def test_replace_xpath_multi_fields(self):
|
||||||
l = TestXPathItemLoader(response=self.response)
|
l = TestItemLoader(response=self.response)
|
||||||
l.add_xpath(None, '//div/text()', TakeFirst(), lambda x: {'name': x})
|
l.add_xpath(None, '//div/text()', TakeFirst(), lambda x: {'name': x})
|
||||||
self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
||||||
l.replace_xpath(None, '//p/text()', TakeFirst(), lambda x: {'name': x})
|
l.replace_xpath(None, '//p/text()', TakeFirst(), lambda x: {'name': x})
|
||||||
self.assertEqual(l.get_output_value('name'), [u'Paragraph'])
|
self.assertEqual(l.get_output_value('name'), [u'Paragraph'])
|
||||||
|
|
||||||
def test_replace_xpath_re(self):
|
def test_replace_xpath_re(self):
|
||||||
l = TestXPathItemLoader(response=self.response)
|
l = TestItemLoader(response=self.response)
|
||||||
self.assert_(l.selector)
|
self.assert_(l.selector)
|
||||||
l.add_xpath('name', '//div/text()')
|
l.add_xpath('name', '//div/text()')
|
||||||
self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
||||||
l.replace_xpath('name', '//div/text()', re='ma')
|
l.replace_xpath('name', '//div/text()', re='ma')
|
||||||
self.assertEqual(l.get_output_value('name'), [u'Ma'])
|
self.assertEqual(l.get_output_value('name'), [u'Ma'])
|
||||||
|
|
||||||
|
def test_add_css_re(self):
|
||||||
|
l = TestItemLoader(response=self.response)
|
||||||
|
l.add_css('name', 'div::text', re='ma')
|
||||||
|
self.assertEqual(l.get_output_value('name'), [u'Ma'])
|
||||||
|
|
||||||
|
l.add_css('url', 'a::attr(href)', re='http://(.+)')
|
||||||
|
self.assertEqual(l.get_output_value('url'), [u'www.scrapy.org'])
|
||||||
|
|
||||||
|
def test_replace_css(self):
|
||||||
|
l = TestItemLoader(response=self.response)
|
||||||
|
self.assert_(l.selector)
|
||||||
|
l.add_css('name', 'div::text')
|
||||||
|
self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
||||||
|
l.replace_css('name', 'p::text')
|
||||||
|
self.assertEqual(l.get_output_value('name'), [u'Paragraph'])
|
||||||
|
|
||||||
|
l.replace_css('name', ['p::text', 'div::text'])
|
||||||
|
self.assertEqual(l.get_output_value('name'), [u'Paragraph', 'Marta'])
|
||||||
|
|
||||||
|
l.add_css('url', 'a::attr(href)', re='http://(.+)')
|
||||||
|
self.assertEqual(l.get_output_value('url'), [u'www.scrapy.org'])
|
||||||
|
l.replace_css('url', 'img::attr(src)')
|
||||||
|
self.assertEqual(l.get_output_value('url'), [u'/images/logo.png'])
|
||||||
|
|
||||||
|
def test_get_css(self):
|
||||||
|
l = TestItemLoader(response=self.response)
|
||||||
|
self.assertEqual(l.get_css('p::text'), [u'paragraph'])
|
||||||
|
self.assertEqual(l.get_css('p::text', TakeFirst()), u'paragraph')
|
||||||
|
self.assertEqual(l.get_css('p::text', TakeFirst(), re='pa'), u'pa')
|
||||||
|
|
||||||
|
self.assertEqual(l.get_css(['p::text', 'div::text']), [u'paragraph', 'marta'])
|
||||||
|
self.assertEqual(l.get_css(['a::attr(href)', 'img::attr(src)']),
|
||||||
|
[u'http://www.scrapy.org', u'/images/logo.png'])
|
||||||
|
|
||||||
|
def test_replace_css_multi_fields(self):
|
||||||
|
l = TestItemLoader(response=self.response)
|
||||||
|
l.add_css(None, 'div::text', TakeFirst(), lambda x: {'name': x})
|
||||||
|
self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
||||||
|
l.replace_css(None, 'p::text', TakeFirst(), lambda x: {'name': x})
|
||||||
|
self.assertEqual(l.get_output_value('name'), [u'Paragraph'])
|
||||||
|
|
||||||
|
l.add_css(None, 'a::attr(href)', TakeFirst(), lambda x: {'url': x})
|
||||||
|
self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org'])
|
||||||
|
l.replace_css(None, 'img::attr(src)', TakeFirst(), lambda x: {'url': x})
|
||||||
|
self.assertEqual(l.get_output_value('url'), [u'/images/logo.png'])
|
||||||
|
|
||||||
|
def test_replace_css_re(self):
|
||||||
|
l = TestItemLoader(response=self.response)
|
||||||
|
self.assert_(l.selector)
|
||||||
|
l.add_css('url', 'a::attr(href)')
|
||||||
|
self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org'])
|
||||||
|
l.replace_css('url', 'a::attr(href)', re='http://www\.(.+)')
|
||||||
|
self.assertEqual(l.get_output_value('url'), [u'scrapy.org'])
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user