mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-21 07:52:49 +00:00
some improvements to XPathSelector and friends
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%4028
This commit is contained in:
parent
9cf83faf1c
commit
6cc91df9ed
@ -5,8 +5,8 @@ import unittest
|
||||
import libxml2
|
||||
|
||||
from scrapy.http import Response
|
||||
from scrapy.xpath.selector import XPathSelector
|
||||
from scrapy.xpath.constructors import xmlDoc_from_xml
|
||||
from scrapy.xpath.selector import XPathSelector, XmlXPathSelector, HtmlXPathSelector
|
||||
#from scrapy.xpath.constructors import xmlDoc_from_xml, xmlDoc_from_html
|
||||
from scrapy.xpath.iterator import XMLNodeIterator
|
||||
|
||||
class XPathTestCase(unittest.TestCase):
|
||||
@ -23,7 +23,7 @@ class XPathTestCase(unittest.TestCase):
|
||||
"""Simple selector tests"""
|
||||
body = "<p><input name='a'value='1'/><input name='b'value='2'/></p>"
|
||||
response = Response(domain="example.com", url="http://example.com", body=body)
|
||||
xpath = XPathSelector(response)
|
||||
xpath = HtmlXPathSelector(response)
|
||||
|
||||
xl = xpath.x('//input')
|
||||
self.assertEqual(2, len(xl))
|
||||
@ -59,7 +59,7 @@ class XPathTestCase(unittest.TestCase):
|
||||
</body>"""
|
||||
|
||||
response = Response(domain="example.com", url="http://example.com", body=body)
|
||||
x = XPathSelector(response)
|
||||
x = HtmlXPathSelector(response)
|
||||
|
||||
divtwo = x.x('//div[@class="two"]')
|
||||
self.assertEqual(divtwo.x("//li").extract(),
|
||||
@ -84,7 +84,7 @@ class XPathTestCase(unittest.TestCase):
|
||||
|
||||
"""
|
||||
response = Response(domain="example.com", url="http://example.com", body=body)
|
||||
x = XPathSelector(response)
|
||||
x = HtmlXPathSelector(response)
|
||||
|
||||
name_re = re.compile("Name: (\w+)")
|
||||
self.assertEqual(x.x("//ul/li").re(name_re),
|
||||
@ -92,6 +92,20 @@ class XPathTestCase(unittest.TestCase):
|
||||
self.assertEqual(x.x("//ul/li").re("Age: (\d+)"),
|
||||
["10", "20"])
|
||||
|
||||
def test_selector_over_text(self):
|
||||
hxs = HtmlXPathSelector(text='<root>lala</root>')
|
||||
self.assertEqual(hxs.extract(),
|
||||
u'<html><body><root>lala</root></body></html>')
|
||||
|
||||
xxs = XmlXPathSelector(text='<root>lala</root>')
|
||||
self.assertEqual(xxs.extract(),
|
||||
u'<root>lala</root>')
|
||||
|
||||
xxs = XmlXPathSelector(text='<root>lala</root>')
|
||||
self.assertEqual(xxs.x('.').extract(),
|
||||
[u'<root>lala</root>'])
|
||||
|
||||
|
||||
def test_selector_namespaces_simple(self):
|
||||
body = """
|
||||
<test xmlns:somens="http://scrapy.org">
|
||||
@ -101,7 +115,7 @@ class XPathTestCase(unittest.TestCase):
|
||||
"""
|
||||
|
||||
response = Response(domain="example.com", url="http://example.com", body=body)
|
||||
x = XPathSelector(response, constructor=xmlDoc_from_xml)
|
||||
x = XmlXPathSelector(response)
|
||||
|
||||
x.register_namespace("somens", "http://scrapy.org")
|
||||
self.assertEqual(x.x("//somens:a").extract(),
|
||||
@ -119,7 +133,7 @@ class XPathTestCase(unittest.TestCase):
|
||||
</BrowseNode>
|
||||
"""
|
||||
response = Response(domain="example.com", url="http://example.com", body=body)
|
||||
x = XPathSelector(response, constructor=xmlDoc_from_xml)
|
||||
x = XmlXPathSelector(response)
|
||||
|
||||
x.register_namespace("xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05")
|
||||
x.register_namespace("p", "http://www.scrapy.org/product")
|
||||
@ -146,10 +160,19 @@ class XPathTestCase(unittest.TestCase):
|
||||
|
||||
headers = {'Content-Type': ['text/html; charset=utf-8']}
|
||||
response = Response(domain="example.com", url="http://example.com", headers=headers, body=html_utf8)
|
||||
x = XPathSelector(response)
|
||||
x = HtmlXPathSelector(response)
|
||||
self.assertEquals(x.x("//span[@id='blank']/text()").extract(),
|
||||
[u'\xa3'])
|
||||
|
||||
def test_null_bytes(self):
|
||||
hxs = HtmlXPathSelector(text='<root>la\x00la</root>')
|
||||
self.assertEqual(hxs.extract(),
|
||||
u'<html><body><root>lala</root></body></html>')
|
||||
|
||||
xxs = XmlXPathSelector(text='<root>la\x00la</root>')
|
||||
self.assertEqual(xxs.extract(),
|
||||
u'<root>lala</root>')
|
||||
|
||||
def test_iterator(self):
|
||||
body = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
<products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="someschmea.xsd">
|
||||
|
@ -24,5 +24,8 @@ def xmlDoc_from_html(response):
|
||||
|
||||
def xmlDoc_from_xml(response):
|
||||
"""Return libxml2 doc for XMLs"""
|
||||
return libxml2.readDoc(response.body.to_string('utf-8'), response.url, 'utf-8', xml_parser_options)
|
||||
|
||||
try:
|
||||
lxdoc = libxml2.readDoc(response.body.to_string('utf-8'), response.url, 'utf-8', xml_parser_options)
|
||||
except TypeError: # libxml2 doesn't parse text with null bytes
|
||||
lxdoc = libxml2.readDoc(response.body.to_string('utf-8').replace("\x00", ""), response.url, 'utf-8', xml_parser_options)
|
||||
return lxdoc
|
||||
|
@ -4,8 +4,8 @@ from cStringIO import StringIO
|
||||
|
||||
import libxml2
|
||||
|
||||
from scrapy.xpath.constructors import xml_parser_options, xmlDoc_from_xml
|
||||
from scrapy.xpath.selector import XPathSelector
|
||||
from scrapy.xpath.constructors import xml_parser_options
|
||||
from scrapy.xpath.selector import XmlXPathSelector
|
||||
|
||||
class XMLNodeIterator(object):
|
||||
"""XMLNodeIterator provides a way to iterate over all nodes of the same
|
||||
@ -63,7 +63,7 @@ class XMLNodeSAXParser():
|
||||
if name == self.requested_nodename:
|
||||
self.inside_requested_node = False
|
||||
string = ''.join([self.xml_declaration, self.buffer.getvalue()])
|
||||
selector = XPathSelector(text=string, constructor=xmlDoc_from_xml).x('/' + self.requested_nodename)[0]
|
||||
selector = XmlXPathSelector(text=string).x('/' + self.requested_nodename)[0]
|
||||
self.selectors.append(selector)
|
||||
|
||||
def characters(self, data):
|
||||
|
@ -2,22 +2,20 @@ import libxml2
|
||||
|
||||
from scrapy.http import Response
|
||||
from scrapy.xpath.extension import Libxml2Document
|
||||
from scrapy.xpath.constructors import xmlDoc_from_html
|
||||
from scrapy.xpath.constructors import xmlDoc_from_html, xmlDoc_from_xml
|
||||
from scrapy.utils.python import flatten
|
||||
from scrapy.utils.misc import extract_regex
|
||||
|
||||
class XPathSelector(object):
|
||||
"""Provides an easy way for selecting document parts using XPaths and
|
||||
regexes; it also supports nested queries.
|
||||
|
||||
Usage example (untested code):
|
||||
|
||||
x = XPathSelector(response)
|
||||
i = ScrapedItem()
|
||||
i.assign("name", x.x("//h2/text()"))
|
||||
i.assign("features", x.x("//div[@class='features']").x("./span/text()"))
|
||||
"""
|
||||
"""The XPathSelector class provides a convenient way for selecting document
|
||||
parts using XPaths and regexes, with support for nested queries.
|
||||
|
||||
Although this is not an abstract class, you usually instantiate one of its
|
||||
children:
|
||||
|
||||
- XmlXPathSelector (for XML content)
|
||||
- HtmlXPathSelector (for HTML content)
|
||||
"""
|
||||
|
||||
def __init__(self, response=None, text=None, node=None, parent=None, expr=None, constructor=xmlDoc_from_html):
|
||||
if parent:
|
||||
@ -36,6 +34,8 @@ class XPathSelector(object):
|
||||
self.expr = expr
|
||||
|
||||
def x(self, xpath):
|
||||
"""Perform the given XPath query on the current XPathSelector and
|
||||
return a XPathSelectorList of the result"""
|
||||
if hasattr(self.xmlNode, 'xpathEval'):
|
||||
self.doc.xpathContext.setContextNode(self.xmlNode)
|
||||
xpath_result = self.doc.xpathContext.xpathEval(xpath)
|
||||
@ -47,13 +47,20 @@ class XPathSelector(object):
|
||||
return XPathSelectorList([])
|
||||
|
||||
def re(self, regex):
|
||||
"""Return a list of unicode strings by applying the regex over all
|
||||
current XPath selections, and flattening the results"""
|
||||
return extract_regex(regex, self.extract(), 'utf-8')
|
||||
|
||||
def extract(self, **kwargs):
|
||||
def extract(self):
|
||||
"""Return a unicode string of the content referenced by the XPathSelector"""
|
||||
if isinstance(self.xmlNode, basestring):
|
||||
text = unicode(self.xmlNode, 'utf-8', errors='ignore')
|
||||
elif hasattr(self.xmlNode, 'xpathEval'):
|
||||
if isinstance(self.xmlNode, libxml2.xmlAttr):
|
||||
elif hasattr(self.xmlNode, 'serialize'):
|
||||
if isinstance(self.xmlNode, libxml2.xmlDoc):
|
||||
data = self.xmlNode.getRootElement().serialize('utf-8')
|
||||
text = unicode(data, 'utf-8', errors='ignore') if data else u''
|
||||
elif isinstance(self.xmlNode, libxml2.xmlAttr):
|
||||
# serialization doesn't work sometimes for xmlAttr types
|
||||
text = unicode(self.xmlNode.content, errors='ignore')
|
||||
else:
|
||||
data = self.xmlNode.serialize('utf-8')
|
||||
@ -66,6 +73,7 @@ class XPathSelector(object):
|
||||
return text
|
||||
|
||||
def register_namespace(self, prefix, uri):
|
||||
"""Register namespace so that it can be used in XPath queries"""
|
||||
self.doc.xpathContext.xpathRegisterNs(prefix, uri)
|
||||
|
||||
def __str__(self):
|
||||
@ -75,13 +83,29 @@ class XPathSelector(object):
|
||||
|
||||
|
||||
class XPathSelectorList(list):
|
||||
|
||||
def extract(self, **kwargs):
|
||||
return [x.extract(**kwargs) if isinstance(x, XPathSelector) else x for x in self]
|
||||
"""List of XPathSelector objects"""
|
||||
|
||||
def x(self, xpath):
|
||||
"""Perform the given XPath query on each XPathSelector of the list and
|
||||
return a new (flattened) XPathSelectorList of the results"""
|
||||
return XPathSelectorList(flatten([x.x(xpath) for x in self]))
|
||||
|
||||
def re(self, regex):
|
||||
"""Perform the re() method on each XPathSelector of the list, and
|
||||
return the result as a flattened list of unicode strings"""
|
||||
return flatten([x.re(regex) for x in self])
|
||||
|
||||
def extract(self):
|
||||
"""Return a list of unicode strings with the content referenced by each
|
||||
XPathSelector of the list"""
|
||||
return [x.extract() if isinstance(x, XPathSelector) else x for x in self]
|
||||
|
||||
class XmlXPathSelector(XPathSelector):
|
||||
"""XPathSelector for XML content"""
|
||||
def __init__(self, response=None, text=None):
|
||||
XPathSelector.__init__(self, response=response, text=text, constructor=xmlDoc_from_xml)
|
||||
|
||||
class HtmlXPathSelector(XPathSelector):
|
||||
"""XPathSelector for HTML content"""
|
||||
def __init__(self, response=None, text=None):
|
||||
XPathSelector.__init__(self, response=response, text=text, constructor=xmlDoc_from_html)
|
||||
|
Loading…
x
Reference in New Issue
Block a user