mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-24 01:23:57 +00:00
Merge pull request #461 from redapple/selectorloader
Add "unified" SelectorItemLoader (supports .add_css() and .add_xpath())
This commit is contained in:
commit
36c8da2ad6
@ -39,15 +39,15 @@ Here is a typical Item Loader usage in a :ref:`Spider <topics-spiders>`, using
|
||||
the :ref:`Product item <topics-items-declaring>` declared in the :ref:`Items
|
||||
chapter <topics-items>`::
|
||||
|
||||
from scrapy.contrib.loader import XPathItemLoader
|
||||
from scrapy.contrib.loader import ItemLoader
|
||||
from myproject.items import Product
|
||||
|
||||
def parse(self, response):
|
||||
l = XPathItemLoader(item=Product(), response=response)
|
||||
l = ItemLoader(item=Product(), response=response)
|
||||
l.add_xpath('name', '//div[@class="product_name"]')
|
||||
l.add_xpath('name', '//div[@class="product_title"]')
|
||||
l.add_xpath('price', '//p[@id="price"]')
|
||||
l.add_xpath('stock', '//p[@id="stock"]')
|
||||
l.add_css('stock', 'p#stock')
|
||||
l.add_value('last_updated', 'today') # you can also use literal values
|
||||
return l.load_item()
|
||||
|
||||
@ -58,17 +58,18 @@ extracted from two different XPath locations in the page:
|
||||
2. ``//div[@class="product_title"]``
|
||||
|
||||
In other words, data is being collected by extracting it from two XPath
|
||||
locations, using the :meth:`~XPathItemLoader.add_xpath` method. This is the
|
||||
locations, using the :meth:`~ItemLoader.add_xpath` method. This is the
|
||||
data that will be assigned to the ``name`` field later.
|
||||
|
||||
Afterwords, similar calls are used for ``price`` and ``stock`` fields, and
|
||||
finally the ``last_update`` field is populated directly with a literal value
|
||||
Afterwards, similar calls are used for the ``price`` and ``stock`` fields
|
||||
(the latter using a CSS selector with the :meth:`~ItemLoader.add_css` method),
|
||||
and finally the ``last_updated`` field is populated directly with a literal value
|
||||
(``today``) using a different method: :meth:`~ItemLoader.add_value`.
|
||||
|
||||
Finally, when all data is collected, the :meth:`ItemLoader.load_item` method is
|
||||
called which actually populates and returns the item populated with the data
|
||||
previously extracted and collected with the :meth:`~XPathItemLoader.add_xpath`
|
||||
and :meth:`~ItemLoader.add_value` calls.
|
||||
previously extracted and collected with the :meth:`~ItemLoader.add_xpath`,
|
||||
:meth:`~ItemLoader.add_css`, and :meth:`~ItemLoader.add_value` calls.
|
||||
|
||||
.. _topics-loaders-processors:
|
||||
|
||||
@ -77,7 +78,7 @@ Input and Output processors
|
||||
|
||||
An Item Loader contains one input processor and one output processor for each
|
||||
(item) field. The input processor processes the extracted data as soon as it's
|
||||
received (through the :meth:`~XPathItemLoader.add_xpath` or
|
||||
received (through the :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css` or
|
||||
:meth:`~ItemLoader.add_value` methods) and the result of the input processor is
|
||||
collected and kept inside the ItemLoader. After collecting all data, the
|
||||
:meth:`ItemLoader.load_item` method is called to populate and get the populated
|
||||
@ -89,11 +90,12 @@ assigned to the item.
|
||||
Let's see an example to illustrate how the input and output processors are
|
||||
called for a particular field (the same applies for any other field)::
|
||||
|
||||
l = XPathItemLoader(Product(), some_xpath_selector)
|
||||
l = ItemLoader(Product(), some_selector)
|
||||
l.add_xpath('name', xpath1) # (1)
|
||||
l.add_xpath('name', xpath2) # (2)
|
||||
l.add_value('name', 'test') # (3)
|
||||
return l.load_item() # (4)
|
||||
l.add_css('name', css) # (3)
|
||||
l.add_value('name', 'test') # (4)
|
||||
return l.load_item() # (5)
|
||||
|
||||
So what happens is:
|
||||
|
||||
@ -105,16 +107,23 @@ So what happens is:
|
||||
processor* used in (1). The result of the input processor is appended to the
|
||||
data collected in (1) (if any).
|
||||
|
||||
3. This case is similar to the previous ones, except that the value to be
|
||||
collected is assigned directly, instead of being extracted from a XPath.
|
||||
3. This case is similar to the previous ones, except that the data is extracted
|
||||
from the ``css`` CSS selector, and passed through the same *input
|
||||
processor* used in (1) and (2). The result of the input processor is appended to the
|
||||
data collected in (1) and (2) (if any).
|
||||
|
||||
4. This case is also similar to the previous ones, except that the value to be
|
||||
collected is assigned directly, instead of being extracted from an XPath
|
||||
expression or a CSS selector.
|
||||
However, the value is still passed through the input processors. In this
|
||||
case, since the value is not iterable it is converted to an iterable of a
|
||||
single element before passing it to the input processor, because input
|
||||
processors always receive iterables.
|
||||
|
||||
4. The data collected in (1) and (2) is passed through the *output processor* of
|
||||
the ``name`` field. The result of the output processor is the value assigned to
|
||||
the ``name`` field in the item.
|
||||
5. The data collected in steps (1), (2), (3) and (4) is passed through
|
||||
the *output processor* of the ``name`` field.
|
||||
The result of the output processor is the value assigned to the ``name``
|
||||
field in the item.
|
||||
|
||||
It's worth noticing that processors are just callable objects, which are called
|
||||
with the data to be parsed, and return a parsed value. So you can use any
|
||||
@ -246,14 +255,35 @@ There are several ways to modify Item Loader context values:
|
||||
ItemLoader objects
|
||||
==================
|
||||
|
||||
.. class:: ItemLoader([item], \**kwargs)
|
||||
.. class:: ItemLoader([item, selector, response], \**kwargs)
|
||||
|
||||
Return a new Item Loader for populating the given Item. If no item is
|
||||
given, one is instantiated automatically using the class in
|
||||
:attr:`default_item_class`.
|
||||
|
||||
The item and the remaining keyword arguments are assigned to the Loader
|
||||
context (accessible through the :attr:`context` attribute).
|
||||
When instantiated with a ``selector`` or a ``response`` parameter,
|
||||
the :class:`ItemLoader` class provides convenient mechanisms for extracting
|
||||
data from web pages using :ref:`selectors <topics-selectors>`.
|
||||
|
||||
:param item: The item instance to populate using subsequent calls to
|
||||
:meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`,
|
||||
or :meth:`~ItemLoader.add_value`.
|
||||
:type item: :class:`~scrapy.item.Item` object
|
||||
|
||||
:param selector: The selector to extract data from, when using the
|
||||
:meth:`add_xpath` (resp. :meth:`add_css`) or :meth:`replace_xpath`
|
||||
(resp. :meth:`replace_css`) method.
|
||||
:type selector: :class:`~scrapy.selector.Selector` object
|
||||
|
||||
:param response: The response used to construct the selector using the
|
||||
:attr:`default_selector_class`, unless the selector argument is given,
|
||||
in which case this argument is ignored.
|
||||
:type response: :class:`~scrapy.http.Response` object
|
||||
|
||||
The item, selector, response and the remaining keyword arguments are
|
||||
assigned to the Loader context (accessible through the :attr:`context` attribute).
|
||||
|
||||
:class:`ItemLoader` instances have the following methods:
|
||||
|
||||
.. method:: get_value(value, \*processors, \**kwargs)
|
||||
|
||||
@ -299,6 +329,91 @@ ItemLoader objects
|
||||
|
||||
Similar to :meth:`add_value` but replaces the collected data with the
|
||||
new value instead of adding it.
|
||||
.. method:: get_xpath(xpath, \*processors, \**kwargs)
|
||||
|
||||
Similar to :meth:`ItemLoader.get_value` but receives an XPath instead of a
|
||||
value, which is used to extract a list of unicode strings from the
|
||||
selector associated with this :class:`ItemLoader`.
|
||||
|
||||
:param xpath: the XPath to extract data from
|
||||
:type xpath: str
|
||||
|
||||
:param re: a regular expression to use for extracting data from the
|
||||
selected XPath region
|
||||
:type re: str or compiled regex
|
||||
|
||||
Examples::
|
||||
|
||||
# HTML snippet: <p class="product-name">Color TV</p>
|
||||
loader.get_xpath('//p[@class="product-name"]')
|
||||
# HTML snippet: <p id="price">the price is $1200</p>
|
||||
loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)')
|
||||
|
||||
.. method:: add_xpath(field_name, xpath, \*processors, \**kwargs)
|
||||
|
||||
Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
|
||||
value, which is used to extract a list of unicode strings from the
|
||||
selector associated with this :class:`ItemLoader`.
|
||||
|
||||
See :meth:`get_xpath` for ``kwargs``.
|
||||
|
||||
:param xpath: the XPath to extract data from
|
||||
:type xpath: str
|
||||
|
||||
Examples::
|
||||
|
||||
# HTML snippet: <p class="product-name">Color TV</p>
|
||||
loader.add_xpath('name', '//p[@class="product-name"]')
|
||||
# HTML snippet: <p id="price">the price is $1200</p>
|
||||
loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')
|
||||
|
||||
.. method:: replace_xpath(field_name, xpath, \*processors, \**kwargs)
|
||||
|
||||
Similar to :meth:`add_xpath` but replaces collected data instead of
|
||||
adding it.
|
||||
|
||||
.. method:: get_css(css, \*processors, \**kwargs)
|
||||
|
||||
Similar to :meth:`ItemLoader.get_value` but receives a CSS selector
|
||||
instead of a value, which is used to extract a list of unicode strings
|
||||
from the selector associated with this :class:`ItemLoader`.
|
||||
|
||||
:param css: the CSS selector to extract data from
|
||||
:type css: str
|
||||
|
||||
:param re: a regular expression to use for extracting data from the
|
||||
selected CSS region
|
||||
:type re: str or compiled regex
|
||||
|
||||
Examples::
|
||||
|
||||
# HTML snippet: <p class="product-name">Color TV</p>
|
||||
loader.get_css('p.product-name')
|
||||
# HTML snippet: <p id="price">the price is $1200</p>
|
||||
loader.get_css('p#price', TakeFirst(), re='the price is (.*)')
|
||||
|
||||
.. method:: add_css(field_name, css, \*processors, \**kwargs)
|
||||
|
||||
Similar to :meth:`ItemLoader.add_value` but receives a CSS selector
|
||||
instead of a value, which is used to extract a list of unicode strings
|
||||
from the selector associated with this :class:`ItemLoader`.
|
||||
|
||||
See :meth:`get_css` for ``kwargs``.
|
||||
|
||||
:param css: the CSS selector to extract data from
|
||||
:type css: str
|
||||
|
||||
Examples::
|
||||
|
||||
# HTML snippet: <p class="product-name">Color TV</p>
|
||||
loader.add_css('name', 'p.product-name')
|
||||
# HTML snippet: <p id="price">the price is $1200</p>
|
||||
loader.add_css('price', 'p#price', re='the price is (.*)')
|
||||
|
||||
.. method:: replace_css(field_name, css, \*processors, \**kwargs)
|
||||
|
||||
Similar to :meth:`add_css` but replaces collected data instead of
|
||||
adding it.
|
||||
|
||||
.. method:: load_item()
|
||||
|
||||
@ -324,6 +439,8 @@ ItemLoader objects
|
||||
|
||||
Return the output processor for the given field.
|
||||
|
||||
:class:`ItemLoader` instances have the following attributes:
|
||||
|
||||
.. attribute:: item
|
||||
|
||||
The :class:`~scrapy.item.Item` object being parsed by this Item Loader.
|
||||
@ -348,71 +465,10 @@ ItemLoader objects
|
||||
The default output processor to use for those fields which don't specify
|
||||
one.
|
||||
|
||||
.. class:: XPathItemLoader([item, selector, response], \**kwargs)
|
||||
|
||||
The :class:`XPathItemLoader` class extends the :class:`ItemLoader` class
|
||||
providing more convenient mechanisms for extracting data from web pages
|
||||
using :ref:`selectors <topics-selectors>`.
|
||||
|
||||
:class:`XPathItemLoader` objects accept two more additional parameters in
|
||||
their constructors:
|
||||
|
||||
:param selector: The selector to extract data from, when using the
|
||||
:meth:`add_xpath` or :meth:`replace_xpath` method.
|
||||
:type selector: :class:`~scrapy.selector.Selector` object
|
||||
|
||||
:param response: The response used to construct the selector using the
|
||||
:attr:`default_selector_class`, unless the selector argument is given,
|
||||
in which case this argument is ignored.
|
||||
:type response: :class:`~scrapy.http.Response` object
|
||||
|
||||
.. method:: get_xpath(xpath, \*processors, \**kwargs)
|
||||
|
||||
Similar to :meth:`ItemLoader.get_value` but receives an XPath instead of a
|
||||
value, which is used to extract a list of unicode strings from the
|
||||
selector associated with this :class:`XPathItemLoader`.
|
||||
|
||||
:param xpath: the XPath to extract data from
|
||||
:type xpath: str
|
||||
|
||||
:param re: a regular expression to use for extracting data from the
|
||||
selected XPath region
|
||||
:type re: str or compiled regex
|
||||
|
||||
Examples::
|
||||
|
||||
# HTML snippet: <p class="product-name">Color TV</p>
|
||||
loader.get_xpath('//p[@class="product-name"]')
|
||||
# HTML snippet: <p id="price">the price is $1200</p>
|
||||
loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)')
|
||||
|
||||
.. method:: add_xpath(field_name, xpath, \*processors, \**kwargs)
|
||||
|
||||
Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
|
||||
value, which is used to extract a list of unicode strings from the
|
||||
selector associated with this :class:`XPathItemLoader`.
|
||||
|
||||
See :meth:`get_xpath` for ``kwargs``.
|
||||
|
||||
:param xpath: the XPath to extract data from
|
||||
:type xpath: str
|
||||
|
||||
Examples::
|
||||
|
||||
# HTML snippet: <p class="product-name">Color TV</p>
|
||||
loader.add_xpath('name', '//p[@class="product-name"]')
|
||||
# HTML snippet: <p id="price">the price is $1200</p>
|
||||
loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')
|
||||
|
||||
.. method:: replace_xpath(field_name, xpath, \*processors, \**kwargs)
|
||||
|
||||
Similar to :meth:`add_xpath` but replaces collected data instead of
|
||||
adding it.
|
||||
|
||||
.. attribute:: default_selector_class
|
||||
|
||||
The class used to construct the :attr:`selector` of this
|
||||
:class:`XPathItemLoader`, if only a response is given in the constructor.
|
||||
:class:`ItemLoader`, if only a response is given in the constructor.
|
||||
If a selector is given in the constructor this attribute is ignored.
|
||||
This attribute is sometimes overridden in subclasses.
|
||||
|
||||
|
@ -11,16 +11,23 @@ from scrapy.item import Item
|
||||
from scrapy.selector import Selector
|
||||
from scrapy.utils.misc import arg_to_iter, extract_regex
|
||||
from scrapy.utils.python import flatten
|
||||
from scrapy.utils.decorator import deprecated
|
||||
from .common import wrap_loader_context
|
||||
from .processor import Identity
|
||||
|
||||
|
||||
class ItemLoader(object):
|
||||
|
||||
default_item_class = Item
|
||||
default_input_processor = Identity()
|
||||
default_output_processor = Identity()
|
||||
default_selector_class = Selector
|
||||
|
||||
def __init__(self, item=None, **context):
|
||||
def __init__(self, item=None, selector=None, response=None, **context):
|
||||
if selector is None and response is not None:
|
||||
selector = self.default_selector_class(response)
|
||||
self.selector = selector
|
||||
context.update(selector=selector, response=response)
|
||||
if item is None:
|
||||
item = self.default_item_class()
|
||||
self.item = context['item'] = item
|
||||
@ -114,32 +121,56 @@ class ItemLoader(object):
|
||||
value = default
|
||||
return value
|
||||
|
||||
class XPathItemLoader(ItemLoader):
|
||||
|
||||
default_selector_class = Selector
|
||||
|
||||
def __init__(self, item=None, selector=None, response=None, **context):
|
||||
if selector is None and response is None:
|
||||
raise RuntimeError("%s must be instantiated with a selector " \
|
||||
"or response" % self.__class__.__name__)
|
||||
if selector is None:
|
||||
selector = self.default_selector_class(response)
|
||||
self.selector = selector
|
||||
context.update(selector=selector, response=response)
|
||||
super(XPathItemLoader, self).__init__(item, **context)
|
||||
def _check_selector_method(self):
    """Verify that this loader has a selector to query.

    Called before any XPath or CSS extraction. Raises ``RuntimeError``
    naming the loader class when ``self.selector`` is ``None``; otherwise
    returns ``None`` without side effects.
    """
    if self.selector is not None:
        return
    message = ("To use XPath or CSS selectors, "
               "%s must be instantiated with a selector "
               "or a response") % self.__class__.__name__
    raise RuntimeError(message)
|
||||
|
||||
def add_xpath(self, field_name, xpath, *processors, **kw):
|
||||
values = self._get_values(xpath, **kw)
|
||||
values = self._get_xpathvalues(xpath, **kw)
|
||||
self.add_value(field_name, values, *processors, **kw)
|
||||
|
||||
def replace_xpath(self, field_name, xpath, *processors, **kw):
|
||||
values = self._get_values(xpath, **kw)
|
||||
values = self._get_xpathvalues(xpath, **kw)
|
||||
self.replace_value(field_name, values, *processors, **kw)
|
||||
|
||||
def get_xpath(self, xpath, *processors, **kw):
|
||||
values = self._get_values(xpath, **kw)
|
||||
values = self._get_xpathvalues(xpath, **kw)
|
||||
return self.get_value(values, *processors, **kw)
|
||||
|
||||
@deprecated(use_instead='._get_xpathvalues()')
def _get_values(self, xpaths, **kw):
    """Deprecated alias that forwards unchanged to ``_get_xpathvalues``."""
    extracted = self._get_xpathvalues(xpaths, **kw)
    return extracted
|
||||
|
||||
def _get_xpathvalues(self, xpaths, **kw):
    """Extract the data matched by one or more XPath expressions.

    ``xpaths`` is normalised with ``arg_to_iter``; each expression is run
    against ``self.selector`` and the per-expression results are merged
    into a single flat list, in the order the expressions were given.
    ``kw`` is accepted for signature compatibility but not used here.

    Raises ``RuntimeError`` (via ``_check_selector_method``) when the
    loader has no selector.
    """
    self._check_selector_method()
    per_expression = []
    for expression in arg_to_iter(xpaths):
        per_expression.append(self.selector.xpath(expression).extract())
    return flatten(per_expression)
|
||||
|
||||
def add_css(self, field_name, css, *processors, **kw):
    """Extract data with the given CSS selector(s) and add it to a field.

    The extracted values, ``processors`` and ``kw`` are handed on to
    :meth:`add_value`.
    """
    self.add_value(field_name, self._get_cssvalues(css, **kw),
                   *processors, **kw)
|
||||
|
||||
def replace_css(self, field_name, css, *processors, **kw):
    """Extract data with the given CSS selector(s) and hand it to
    :meth:`replace_value` for ``field_name``.

    ``processors`` and ``kw`` are forwarded to :meth:`replace_value`.
    """
    self.replace_value(field_name, self._get_cssvalues(css, **kw),
                       *processors, **kw)
|
||||
|
||||
def get_css(self, css, *processors, **kw):
    """Extract data with the given CSS selector(s) and return the result of
    passing it through :meth:`get_value` with ``processors`` and ``kw``.
    """
    return self.get_value(self._get_cssvalues(css, **kw), *processors, **kw)
|
||||
|
||||
def _get_cssvalues(self, csss, **kw):
    """Extract the data matched by one or more CSS selectors.

    ``csss`` is normalised with ``arg_to_iter``; each CSS selector is run
    against ``self.selector`` and the per-selector results are merged into
    a single flat list, in the order the selectors were given. ``kw`` is
    accepted for signature compatibility but not used here.

    Raises ``RuntimeError`` (via ``_check_selector_method``) when the
    loader has no selector.
    """
    self._check_selector_method()
    per_selector = []
    for css_query in arg_to_iter(csss):
        per_selector.append(self.selector.css(css_query).extract())
    return flatten(per_selector)
|
||||
|
||||
|
||||
class XPathItemLoader(ItemLoader):
    """Deprecated alias of :class:`ItemLoader`.

    Behaves exactly like :class:`ItemLoader`; instantiating it only adds a
    ``ScrapyDeprecationWarning``.
    """

    def __init__(self, *a, **kw):
        # Imported locally, as in the original, so the block does not depend
        # on module-level imports that may not exist.
        import warnings
        from scrapy.exceptions import ScrapyDeprecationWarning
        # stacklevel=2 attributes the warning to the code that instantiates
        # the loader instead of to this __init__, so users can locate the
        # call site they need to migrate.
        warnings.warn('%s is deprecated, instantiate scrapy.contrib.loader.ItemLoader '
                      'instead' % type(self).__name__,
                      category=ScrapyDeprecationWarning, stacklevel=2)
        super(XPathItemLoader, self).__init__(*a, **kw)
|
||||
|
@ -1,6 +1,6 @@
|
||||
import unittest
|
||||
|
||||
from scrapy.contrib.loader import ItemLoader, XPathItemLoader
|
||||
from scrapy.contrib.loader import ItemLoader
|
||||
from scrapy.contrib.loader.processor import Join, Identity, TakeFirst, \
|
||||
Compose, MapCompose
|
||||
from scrapy.item import Item, Field
|
||||
@ -38,7 +38,7 @@ def processor_with_args(value, other=None, loader_context=None):
|
||||
return value
|
||||
|
||||
|
||||
class ItemLoaderTest(unittest.TestCase):
|
||||
class BasicItemLoaderTest(unittest.TestCase):
|
||||
|
||||
def test_load_item_using_default_loader(self):
|
||||
i = TestItem()
|
||||
@ -367,37 +367,78 @@ class ProcessorsTest(unittest.TestCase):
|
||||
[u'HELLO', u'THIS', u'IS', u'SCRAPY'])
|
||||
|
||||
|
||||
class TestXPathItemLoader(XPathItemLoader):
|
||||
default_item_class = TestItem
|
||||
name_in = MapCompose(lambda v: v.title())
|
||||
class SelectortemLoaderTest(unittest.TestCase):
|
||||
response = HtmlResponse(url="", body="""
|
||||
<html>
|
||||
<body>
|
||||
<div id="id">marta</div>
|
||||
<p>paragraph</p>
|
||||
<a href="http://www.scrapy.org">homepage</a>
|
||||
<img src="/images/logo.png" width="244" height="65" alt="Scrapy">
|
||||
</body>
|
||||
</html>
|
||||
""")
|
||||
|
||||
|
||||
class XPathItemLoaderTest(unittest.TestCase):
|
||||
response = HtmlResponse(url="", body='<html><body><div id="id">marta</div><p>paragraph</p></body></html>')
|
||||
def test_constructor(self):
    """A loader built without selector or response has no selector."""
    loader = TestItemLoader()
    self.assertEqual(loader.selector, None)
|
||||
|
||||
def test_constructor_errors(self):
|
||||
self.assertRaises(RuntimeError, XPathItemLoader)
|
||||
l = TestItemLoader()
|
||||
self.assertRaises(RuntimeError, l.add_xpath, 'url', '//a/@href')
|
||||
self.assertRaises(RuntimeError, l.replace_xpath, 'url', '//a/@href')
|
||||
self.assertRaises(RuntimeError, l.get_xpath, '//a/@href')
|
||||
self.assertRaises(RuntimeError, l.add_css, 'name', '#name::text')
|
||||
self.assertRaises(RuntimeError, l.replace_css, 'name', '#name::text')
|
||||
self.assertRaises(RuntimeError, l.get_css, '#name::text')
|
||||
|
||||
def test_constructor_with_selector(self):
|
||||
sel = Selector(text=u"<html><body><div>marta</div></body></html>")
|
||||
l = TestXPathItemLoader(selector=sel)
|
||||
l = TestItemLoader(selector=sel)
|
||||
self.assert_(l.selector is sel)
|
||||
|
||||
l.add_xpath('name', '//div/text()')
|
||||
self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
||||
|
||||
def test_constructor_with_selector_css(self):
    """A loader given a selector keeps that exact object and can run CSS
    queries against it."""
    sel = Selector(text=u"<html><body><div>marta</div></body></html>")
    l = TestItemLoader(selector=sel)
    # assertTrue replaces the deprecated ``assert_`` alias.
    self.assertTrue(l.selector is sel)

    l.add_css('name', 'div::text')
    self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
||||
|
||||
def test_constructor_with_response(self):
|
||||
l = TestXPathItemLoader(response=self.response)
|
||||
l = TestItemLoader(response=self.response)
|
||||
self.assert_(l.selector)
|
||||
|
||||
l.add_xpath('name', '//div/text()')
|
||||
self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
||||
|
||||
def test_constructor_with_response_css(self):
    """A loader built from a response gets a selector, supports CSS
    queries, and accumulates CSS and XPath results in the same field."""
    l = TestItemLoader(response=self.response)
    # assertTrue replaces the deprecated ``assert_`` alias.
    self.assertTrue(l.selector)

    l.add_css('name', 'div::text')
    self.assertEqual(l.get_output_value('name'), [u'Marta'])

    l.add_css('url', 'a::attr(href)')
    self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org'])

    # combining/accumulating CSS selectors and XPath expressions
    l.add_xpath('name', '//div/text()')
    self.assertEqual(l.get_output_value('name'), [u'Marta', u'Marta'])

    l.add_xpath('url', '//img/@src')
    self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org', u'/images/logo.png'])
|
||||
|
||||
def test_add_xpath_re(self):
|
||||
l = TestXPathItemLoader(response=self.response)
|
||||
l = TestItemLoader(response=self.response)
|
||||
l.add_xpath('name', '//div/text()', re='ma')
|
||||
self.assertEqual(l.get_output_value('name'), [u'Ma'])
|
||||
|
||||
def test_replace_xpath(self):
|
||||
l = TestXPathItemLoader(response=self.response)
|
||||
l = TestItemLoader(response=self.response)
|
||||
self.assert_(l.selector)
|
||||
l.add_xpath('name', '//div/text()')
|
||||
self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
||||
@ -408,7 +449,7 @@ class XPathItemLoaderTest(unittest.TestCase):
|
||||
self.assertEqual(l.get_output_value('name'), [u'Paragraph', 'Marta'])
|
||||
|
||||
def test_get_xpath(self):
|
||||
l = TestXPathItemLoader(response=self.response)
|
||||
l = TestItemLoader(response=self.response)
|
||||
self.assertEqual(l.get_xpath('//p/text()'), [u'paragraph'])
|
||||
self.assertEqual(l.get_xpath('//p/text()', TakeFirst()), u'paragraph')
|
||||
self.assertEqual(l.get_xpath('//p/text()', TakeFirst(), re='pa'), u'pa')
|
||||
@ -416,20 +457,74 @@ class XPathItemLoaderTest(unittest.TestCase):
|
||||
self.assertEqual(l.get_xpath(['//p/text()', '//div/text()']), [u'paragraph', 'marta'])
|
||||
|
||||
def test_replace_xpath_multi_fields(self):
|
||||
l = TestXPathItemLoader(response=self.response)
|
||||
l = TestItemLoader(response=self.response)
|
||||
l.add_xpath(None, '//div/text()', TakeFirst(), lambda x: {'name': x})
|
||||
self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
||||
l.replace_xpath(None, '//p/text()', TakeFirst(), lambda x: {'name': x})
|
||||
self.assertEqual(l.get_output_value('name'), [u'Paragraph'])
|
||||
|
||||
def test_replace_xpath_re(self):
|
||||
l = TestXPathItemLoader(response=self.response)
|
||||
l = TestItemLoader(response=self.response)
|
||||
self.assert_(l.selector)
|
||||
l.add_xpath('name', '//div/text()')
|
||||
self.assertEqual(l.get_output_value('name'), [u'Marta'])
|
||||
l.replace_xpath('name', '//div/text()', re='ma')
|
||||
self.assertEqual(l.get_output_value('name'), [u'Ma'])
|
||||
|
||||
def test_add_css_re(self):
    """``add_css`` applies the ``re`` keyword to the extracted data."""
    loader = TestItemLoader(response=self.response)

    loader.add_css('name', 'div::text', re='ma')
    name_output = loader.get_output_value('name')
    self.assertEqual(name_output, [u'Ma'])

    loader.add_css('url', 'a::attr(href)', re='http://(.+)')
    url_output = loader.get_output_value('url')
    self.assertEqual(url_output, [u'www.scrapy.org'])
|
||||
|
||||
def test_replace_css(self):
    """``replace_css`` discards previously collected data for the field,
    for both a single CSS selector and a list of selectors."""
    l = TestItemLoader(response=self.response)
    # assertTrue replaces the deprecated ``assert_`` alias.
    self.assertTrue(l.selector)
    l.add_css('name', 'div::text')
    self.assertEqual(l.get_output_value('name'), [u'Marta'])
    l.replace_css('name', 'p::text')
    self.assertEqual(l.get_output_value('name'), [u'Paragraph'])

    l.replace_css('name', ['p::text', 'div::text'])
    self.assertEqual(l.get_output_value('name'), [u'Paragraph', 'Marta'])

    l.add_css('url', 'a::attr(href)', re='http://(.+)')
    self.assertEqual(l.get_output_value('url'), [u'www.scrapy.org'])
    l.replace_css('url', 'img::attr(src)')
    self.assertEqual(l.get_output_value('url'), [u'/images/logo.png'])
|
||||
|
||||
def test_get_css(self):
    """``get_css`` returns extracted data, honouring processors, ``re`` and
    lists of CSS selectors."""
    loader = TestItemLoader(response=self.response)

    # single selector: plain, with a processor, and with a regex
    self.assertEqual(loader.get_css('p::text'), [u'paragraph'])
    self.assertEqual(loader.get_css('p::text', TakeFirst()), u'paragraph')
    self.assertEqual(loader.get_css('p::text', TakeFirst(), re='pa'), u'pa')

    # several selectors: results are returned in selector order
    text_values = loader.get_css(['p::text', 'div::text'])
    self.assertEqual(text_values, [u'paragraph', 'marta'])
    attr_values = loader.get_css(['a::attr(href)', 'img::attr(src)'])
    self.assertEqual(attr_values,
                     [u'http://www.scrapy.org', u'/images/logo.png'])
|
||||
|
||||
def test_replace_css_multi_fields(self):
    """With ``field_name=None`` the processed value is a dict mapping field
    names to values, for both ``add_css`` and ``replace_css``."""
    loader = TestItemLoader(response=self.response)

    loader.add_css(None, 'div::text', TakeFirst(), lambda x: {'name': x})
    self.assertEqual(loader.get_output_value('name'), [u'Marta'])
    loader.replace_css(None, 'p::text', TakeFirst(), lambda x: {'name': x})
    self.assertEqual(loader.get_output_value('name'), [u'Paragraph'])

    loader.add_css(None, 'a::attr(href)', TakeFirst(), lambda x: {'url': x})
    self.assertEqual(loader.get_output_value('url'),
                     [u'http://www.scrapy.org'])
    loader.replace_css(None, 'img::attr(src)', TakeFirst(), lambda x: {'url': x})
    self.assertEqual(loader.get_output_value('url'), [u'/images/logo.png'])
|
||||
|
||||
def test_replace_css_re(self):
    """``replace_css`` applies the ``re`` keyword to the replacement data."""
    l = TestItemLoader(response=self.response)
    # assertTrue replaces the deprecated ``assert_`` alias.
    self.assertTrue(l.selector)
    l.add_css('url', 'a::attr(href)')
    self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org'])
    # Raw string: ``\.`` in a plain literal is an invalid escape sequence.
    l.replace_css('url', 'a::attr(href)', re=r'http://www\.(.+)')
    self.assertEqual(l.get_output_value('url'), [u'scrapy.org'])
|
||||
|
||||
|
||||
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
|
Loading…
x
Reference in New Issue
Block a user