1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-21 07:52:49 +00:00

some improvements to XPathSelector and friends

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%4028
This commit is contained in:
Pablo Hoffman 2008-06-29 05:20:31 +00:00
parent 9cf83faf1c
commit 6cc91df9ed
4 changed files with 80 additions and 30 deletions

View File

@ -5,8 +5,8 @@ import unittest
import libxml2
from scrapy.http import Response
from scrapy.xpath.selector import XPathSelector
from scrapy.xpath.constructors import xmlDoc_from_xml
from scrapy.xpath.selector import XPathSelector, XmlXPathSelector, HtmlXPathSelector
#from scrapy.xpath.constructors import xmlDoc_from_xml, xmlDoc_from_html
from scrapy.xpath.iterator import XMLNodeIterator
class XPathTestCase(unittest.TestCase):
@ -23,7 +23,7 @@ class XPathTestCase(unittest.TestCase):
"""Simple selector tests"""
body = "<p><input name='a'value='1'/><input name='b'value='2'/></p>"
response = Response(domain="example.com", url="http://example.com", body=body)
xpath = XPathSelector(response)
xpath = HtmlXPathSelector(response)
xl = xpath.x('//input')
self.assertEqual(2, len(xl))
@ -59,7 +59,7 @@ class XPathTestCase(unittest.TestCase):
</body>"""
response = Response(domain="example.com", url="http://example.com", body=body)
x = XPathSelector(response)
x = HtmlXPathSelector(response)
divtwo = x.x('//div[@class="two"]')
self.assertEqual(divtwo.x("//li").extract(),
@ -84,7 +84,7 @@ class XPathTestCase(unittest.TestCase):
"""
response = Response(domain="example.com", url="http://example.com", body=body)
x = XPathSelector(response)
x = HtmlXPathSelector(response)
name_re = re.compile("Name: (\w+)")
self.assertEqual(x.x("//ul/li").re(name_re),
@ -92,6 +92,20 @@ class XPathTestCase(unittest.TestCase):
self.assertEqual(x.x("//ul/li").re("Age: (\d+)"),
["10", "20"])
def test_selector_over_text(self):
    """Selectors built from raw text (no Response) extract as expected."""
    markup = '<root>lala</root>'

    # The HTML selector's output wraps the fragment in <html><body>.
    html_sel = HtmlXPathSelector(text=markup)
    html_out = html_sel.extract()
    self.assertEqual(html_out,
                     u'<html><body><root>lala</root></body></html>')

    # The XML selector returns the document unchanged.
    xml_sel = XmlXPathSelector(text=markup)
    xml_out = xml_sel.extract()
    self.assertEqual(xml_out,
                     u'<root>lala</root>')

    # Querying '.' on a fresh XML selector yields a one-element list.
    xml_sel_for_query = XmlXPathSelector(text=markup)
    queried = xml_sel_for_query.x('.').extract()
    self.assertEqual(queried,
                     [u'<root>lala</root>'])
def test_selector_namespaces_simple(self):
body = """
<test xmlns:somens="http://scrapy.org">
@ -101,7 +115,7 @@ class XPathTestCase(unittest.TestCase):
"""
response = Response(domain="example.com", url="http://example.com", body=body)
x = XPathSelector(response, constructor=xmlDoc_from_xml)
x = XmlXPathSelector(response)
x.register_namespace("somens", "http://scrapy.org")
self.assertEqual(x.x("//somens:a").extract(),
@ -119,7 +133,7 @@ class XPathTestCase(unittest.TestCase):
</BrowseNode>
"""
response = Response(domain="example.com", url="http://example.com", body=body)
x = XPathSelector(response, constructor=xmlDoc_from_xml)
x = XmlXPathSelector(response)
x.register_namespace("xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05")
x.register_namespace("p", "http://www.scrapy.org/product")
@ -146,10 +160,19 @@ class XPathTestCase(unittest.TestCase):
headers = {'Content-Type': ['text/html; charset=utf-8']}
response = Response(domain="example.com", url="http://example.com", headers=headers, body=html_utf8)
x = XPathSelector(response)
x = HtmlXPathSelector(response)
self.assertEquals(x.x("//span[@id='blank']/text()").extract(),
[u'\xa3'])
def test_null_bytes(self):
    """Null bytes in the input text do not appear in the extracted output."""
    dirty_markup = '<root>la\x00la</root>'

    # HTML selector: expected output has no null byte and is wrapped in
    # <html><body>.
    html_result = HtmlXPathSelector(text=dirty_markup).extract()
    self.assertEqual(html_result,
                     u'<html><body><root>lala</root></body></html>')

    # XML selector: expected output is the same document minus the null byte.
    xml_result = XmlXPathSelector(text=dirty_markup).extract()
    self.assertEqual(xml_result,
                     u'<root>lala</root>')
def test_iterator(self):
body = """<?xml version="1.0" encoding="UTF-8"?>
<products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="someschmea.xsd">

View File

@ -24,5 +24,8 @@ def xmlDoc_from_html(response):
def xmlDoc_from_xml(response):
    """Return libxml2 doc for XMLs.

    The response body is encoded as UTF-8 and passed to ``libxml2.readDoc``
    along with the response URL and ``xml_parser_options``.

    If ``readDoc`` raises ``TypeError`` (libxml2 doesn't parse text with null
    bytes), the null bytes are stripped from the body and the parse is
    attempted once more; an error from that second attempt propagates.
    """
    # Encode once so the retry below works on the same bytes.
    utf8body = response.body.to_string('utf-8')
    try:
        lxdoc = libxml2.readDoc(utf8body, response.url, 'utf-8', xml_parser_options)
    except TypeError:  # libxml2 doesn't parse text with null bytes
        lxdoc = libxml2.readDoc(utf8body.replace("\x00", ""), response.url, 'utf-8', xml_parser_options)
    return lxdoc

View File

@ -4,8 +4,8 @@ from cStringIO import StringIO
import libxml2
from scrapy.xpath.constructors import xml_parser_options, xmlDoc_from_xml
from scrapy.xpath.selector import XPathSelector
from scrapy.xpath.constructors import xml_parser_options
from scrapy.xpath.selector import XmlXPathSelector
class XMLNodeIterator(object):
"""XMLNodeIterator provides a way to iterate over all nodes of the same
@ -63,7 +63,7 @@ class XMLNodeSAXParser():
if name == self.requested_nodename:
self.inside_requested_node = False
string = ''.join([self.xml_declaration, self.buffer.getvalue()])
selector = XPathSelector(text=string, constructor=xmlDoc_from_xml).x('/' + self.requested_nodename)[0]
selector = XmlXPathSelector(text=string).x('/' + self.requested_nodename)[0]
self.selectors.append(selector)
def characters(self, data):

View File

@ -2,22 +2,20 @@ import libxml2
from scrapy.http import Response
from scrapy.xpath.extension import Libxml2Document
from scrapy.xpath.constructors import xmlDoc_from_html
from scrapy.xpath.constructors import xmlDoc_from_html, xmlDoc_from_xml
from scrapy.utils.python import flatten
from scrapy.utils.misc import extract_regex
class XPathSelector(object):
"""Provides an easy way for selecting document parts using XPaths and
regexs, it also supports nested queries.
Usage example (untested code):
x = XPathSelector(response)
i = ScrapedItem()
i.assign("name", x.x("//h2/text()"))
i.assign("features", x.x("//div[@class='features']).x("./span/text()")
"""
"""The XPathSelector class provides a convenient way for selecting document
parts using XPaths and regexs, with support for nested queries.
Although this is not an abstract class, you usually instantiate one of its
children:
- XmlXPathSelector (for XML content)
- HtmlXPathSelector (for HTML content)
"""
def __init__(self, response=None, text=None, node=None, parent=None, expr=None, constructor=xmlDoc_from_html):
if parent:
@ -36,6 +34,8 @@ class XPathSelector(object):
self.expr = expr
def x(self, xpath):
"""Perform the given XPath query on the current XPathSelector and
return a XPathSelectorList of the result"""
if hasattr(self.xmlNode, 'xpathEval'):
self.doc.xpathContext.setContextNode(self.xmlNode)
xpath_result = self.doc.xpathContext.xpathEval(xpath)
@ -47,13 +47,20 @@ class XPathSelector(object):
return XPathSelectorList([])
def re(self, regex):
    """Apply ``regex`` to the content extracted from this selector.

    The extracted content is handed to ``extract_regex`` together with the
    'utf-8' encoding, and that function's result (a flattened list of
    unicode strings) is returned.
    """
    extracted = self.extract()
    return extract_regex(regex, extracted, 'utf-8')
def extract(self, **kwargs):
def extract(self):
"""Return a unicode string of the content referenced by the XPathSelector"""
if isinstance(self.xmlNode, basestring):
text = unicode(self.xmlNode, 'utf-8', errors='ignore')
elif hasattr(self.xmlNode, 'xpathEval'):
if isinstance(self.xmlNode, libxml2.xmlAttr):
elif hasattr(self.xmlNode, 'serialize'):
if isinstance(self.xmlNode, libxml2.xmlDoc):
data = self.xmlNode.getRootElement().serialize('utf-8')
text = unicode(data, 'utf-8', errors='ignore') if data else u''
elif isinstance(self.xmlNode, libxml2.xmlAttr):
# serialization doesn't work sometimes for xmlAttr types
text = unicode(self.xmlNode.content, errors='ignore')
else:
data = self.xmlNode.serialize('utf-8')
@ -66,6 +73,7 @@ class XPathSelector(object):
return text
def register_namespace(self, prefix, uri):
    """Bind ``prefix`` to the namespace ``uri`` for use in XPath queries.

    The binding is registered on the XPath context of this selector's
    document.
    """
    xpath_context = self.doc.xpathContext
    xpath_context.xpathRegisterNs(prefix, uri)
def __str__(self):
@ -75,13 +83,29 @@ class XPathSelector(object):
class XPathSelectorList(list):
    """List of XPathSelector objects"""

    def x(self, xpath):
        """Perform the given XPath query on each XPathSelector of the list and
        return a new (flattened) XPathSelectorList of the results"""
        return XPathSelectorList(flatten([sel.x(xpath) for sel in self]))

    def re(self, regex):
        """Perform the re() method on each XPathSelector of the list, and
        return the result as a flattened list of unicode strings"""
        return flatten([sel.re(regex) for sel in self])

    def extract(self):
        """Return a list of unicode strings with the content referenced by each
        XPathSelector of the list"""
        # Items that are not XPathSelector instances are passed through
        # unchanged rather than extracted.
        return [sel.extract() if isinstance(sel, XPathSelector) else sel for sel in self]
class XmlXPathSelector(XPathSelector):
    """XPathSelector for XML content"""

    def __init__(self, response=None, text=None):
        # Fix the document constructor to the XML one; everything else is
        # delegated to the base class.
        super(XmlXPathSelector, self).__init__(
            response=response,
            text=text,
            constructor=xmlDoc_from_xml,
        )
class HtmlXPathSelector(XPathSelector):
    """XPathSelector for HTML content"""

    def __init__(self, response=None, text=None):
        # Fix the document constructor to the HTML one; everything else is
        # delegated to the base class.
        super(HtmlXPathSelector, self).__init__(
            response=response,
            text=text,
            constructor=xmlDoc_from_html,
        )