some improvements to XPathSelector and friends

--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%4028
2025-02-21 07:52:49 +00:00 · 2008-06-29 05:20:31 +00:00 · 2008-06-29 05:20:31 +00:00 · 6cc91df9ed
commit 6cc91df9ed
parent 9cf83faf1c
4 changed files with 80 additions and 30 deletions
--- a/scrapy/trunk/scrapy/tests/test_xpath.py
+++ b/scrapy/trunk/scrapy/tests/test_xpath.py
@ -5,8 +5,8 @@ import unittest
 import libxml2

 from scrapy.http import Response
-from scrapy.xpath.selector import XPathSelector
-from scrapy.xpath.constructors import xmlDoc_from_xml
+from scrapy.xpath.selector import XPathSelector, XmlXPathSelector, HtmlXPathSelector
+#from scrapy.xpath.constructors import xmlDoc_from_xml, xmlDoc_from_html
 from scrapy.xpath.iterator import XMLNodeIterator

 class XPathTestCase(unittest.TestCase):
@ -23,7 +23,7 @@ class XPathTestCase(unittest.TestCase):
        """Simple selector tests"""
        body = "<p><input name='a'value='1'/><input name='b'value='2'/></p>"
        response = Response(domain="example.com", url="http://example.com", body=body)
-        xpath = XPathSelector(response)
+        xpath = HtmlXPathSelector(response)

        xl = xpath.x('//input')
        self.assertEqual(2, len(xl))
@ -59,7 +59,7 @@ class XPathTestCase(unittest.TestCase):
                  </body>"""

        response = Response(domain="example.com", url="http://example.com", body=body)
-        x = XPathSelector(response)
+        x = HtmlXPathSelector(response)

        divtwo = x.x('//div[@class="two"]')
        self.assertEqual(divtwo.x("//li").extract(),
@ -84,7 +84,7 @@ class XPathTestCase(unittest.TestCase):

               """
        response = Response(domain="example.com", url="http://example.com", body=body)
-        x = XPathSelector(response)
+        x = HtmlXPathSelector(response)

        name_re = re.compile("Name: (\w+)")
        self.assertEqual(x.x("//ul/li").re(name_re),
@ -92,6 +92,20 @@ class XPathTestCase(unittest.TestCase):
        self.assertEqual(x.x("//ul/li").re("Age: (\d+)"),
                         ["10", "20"])

+    def test_selector_over_text(self):
+        hxs = HtmlXPathSelector(text='<root>lala</root>')
+        self.assertEqual(hxs.extract(),
+                         u'<html><body><root>lala</root></body></html>')
+
+        xxs = XmlXPathSelector(text='<root>lala</root>')
+        self.assertEqual(xxs.extract(),
+                         u'<root>lala</root>')
+
+        xxs = XmlXPathSelector(text='<root>lala</root>')
+        self.assertEqual(xxs.x('.').extract(),
+                         [u'<root>lala</root>'])
+
+
    def test_selector_namespaces_simple(self):
        body = """
        <test xmlns:somens="http://scrapy.org">
@ -101,7 +115,7 @@ class XPathTestCase(unittest.TestCase):
        """

        response = Response(domain="example.com", url="http://example.com", body=body)
-        x = XPathSelector(response, constructor=xmlDoc_from_xml)
+        x = XmlXPathSelector(response)
        
        x.register_namespace("somens", "http://scrapy.org")
        self.assertEqual(x.x("//somens:a").extract(), 
@ -119,7 +133,7 @@ class XPathTestCase(unittest.TestCase):
 </BrowseNode>
        """
        response = Response(domain="example.com", url="http://example.com", body=body)
-        x = XPathSelector(response, constructor=xmlDoc_from_xml)
+        x = XmlXPathSelector(response)

        x.register_namespace("xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05")
        x.register_namespace("p", "http://www.scrapy.org/product")
@ -146,10 +160,19 @@ class XPathTestCase(unittest.TestCase):

        headers = {'Content-Type': ['text/html; charset=utf-8']}
        response = Response(domain="example.com", url="http://example.com", headers=headers, body=html_utf8)
-        x = XPathSelector(response)
+        x = HtmlXPathSelector(response)
        self.assertEquals(x.x("//span[@id='blank']/text()").extract(),
                          [u'\xa3'])

+    def test_null_bytes(self):
+        hxs = HtmlXPathSelector(text='<root>la\x00la</root>')
+        self.assertEqual(hxs.extract(),
+                         u'<html><body><root>lala</root></body></html>')
+
+        xxs = XmlXPathSelector(text='<root>la\x00la</root>')
+        self.assertEqual(xxs.extract(),
+                         u'<root>lala</root>')
+
    def test_iterator(self):
        body = """<?xml version="1.0" encoding="UTF-8"?>
 <products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="someschmea.xsd">
--- a/scrapy/trunk/scrapy/xpath/constructors.py
+++ b/scrapy/trunk/scrapy/xpath/constructors.py
@ -24,5 +24,8 @@ def xmlDoc_from_html(response):

 def xmlDoc_from_xml(response):
    """Return libxml2 doc for XMLs"""
-    return libxml2.readDoc(response.body.to_string('utf-8'), response.url, 'utf-8', xml_parser_options)
-
+    try:
+        lxdoc = libxml2.readDoc(response.body.to_string('utf-8'), response.url, 'utf-8', xml_parser_options)
+    except TypeError:  # libxml2 rejects text containing null bytes; strip them and retry
+        lxdoc = libxml2.readDoc(response.body.to_string('utf-8').replace("\x00", ""), response.url, 'utf-8', xml_parser_options)
+    return lxdoc
--- a/scrapy/trunk/scrapy/xpath/iterator.py
+++ b/scrapy/trunk/scrapy/xpath/iterator.py
@ -4,8 +4,8 @@ from cStringIO import StringIO

 import libxml2

-from scrapy.xpath.constructors import xml_parser_options, xmlDoc_from_xml
-from scrapy.xpath.selector import XPathSelector
+from scrapy.xpath.constructors import xml_parser_options
+from scrapy.xpath.selector import XmlXPathSelector

 class XMLNodeIterator(object):
    """XMLNodeIterator provides a way to iterate over all nodes of the same
@ -63,7 +63,7 @@ class XMLNodeSAXParser():
        if name == self.requested_nodename:
            self.inside_requested_node = False
            string = ''.join([self.xml_declaration, self.buffer.getvalue()])
-            selector = XPathSelector(text=string, constructor=xmlDoc_from_xml).x('/' + self.requested_nodename)[0]
+            selector = XmlXPathSelector(text=string).x('/' + self.requested_nodename)[0]
            self.selectors.append(selector)

    def characters(self, data):
--- a/scrapy/trunk/scrapy/xpath/selector.py
+++ b/scrapy/trunk/scrapy/xpath/selector.py
@ -2,22 +2,20 @@ import libxml2

 from scrapy.http import Response
 from scrapy.xpath.extension import Libxml2Document
-from scrapy.xpath.constructors import xmlDoc_from_html
+from scrapy.xpath.constructors import xmlDoc_from_html, xmlDoc_from_xml
 from scrapy.utils.python import flatten
 from scrapy.utils.misc import extract_regex

 class XPathSelector(object):
-    """Provides an easy way for selecting document parts using XPaths and
-    regexs, it also supports nested queries.
-    
-    Usage example (untested code):
-    
-    x = XPathSelector(response)
-    i = ScrapedItem()
-    i.assign("name", x.x("//h2/text()"))
-    i.assign("features", x.x("//div[@class='features']).x("./span/text()")
-    """
+    """The XPathSelector class provides a convenient way of selecting document
+    parts using XPaths and regexes, with support for nested queries.

+    Although this is not an abstract class, you usually instantiate one of its
+    subclasses:
+    
+    - XmlXPathSelector (for XML content)
+    - HtmlXPathSelector (for HTML content)
+    """

    def __init__(self, response=None, text=None, node=None, parent=None, expr=None, constructor=xmlDoc_from_html):
        if parent:
@ -36,6 +34,8 @@ class XPathSelector(object):
        self.expr = expr

    def x(self, xpath):
+        """Perform the given XPath query on the current XPathSelector and
+        return an XPathSelectorList with the results"""
        if hasattr(self.xmlNode, 'xpathEval'):
            self.doc.xpathContext.setContextNode(self.xmlNode)
            xpath_result = self.doc.xpathContext.xpathEval(xpath)
@ -47,13 +47,20 @@ class XPathSelector(object):
            return XPathSelectorList([])

    def re(self, regex):
+        """Return a list of unicode strings by applying the regex over all
+        current XPath selections, and flattening the results"""
        return extract_regex(regex, self.extract(), 'utf-8')

-    def extract(self, **kwargs): 
+    def extract(self):
+        """Return a unicode string of the content referenced by the XPathSelector"""
        if isinstance(self.xmlNode, basestring):
            text = unicode(self.xmlNode, 'utf-8', errors='ignore')
-        elif hasattr(self.xmlNode, 'xpathEval'):
-            if isinstance(self.xmlNode, libxml2.xmlAttr):
+        elif hasattr(self.xmlNode, 'serialize'):
+            if isinstance(self.xmlNode, libxml2.xmlDoc):
+                data = self.xmlNode.getRootElement().serialize('utf-8')
+                text = unicode(data, 'utf-8', errors='ignore') if data else u''
+            elif isinstance(self.xmlNode, libxml2.xmlAttr): 
+                # serialization sometimes fails for xmlAttr nodes, so use .content instead
                text = unicode(self.xmlNode.content, errors='ignore')
            else:
                data = self.xmlNode.serialize('utf-8')
@ -66,6 +73,7 @@ class XPathSelector(object):
        return text

    def register_namespace(self, prefix, uri):
+        """Register namespace so that it can be used in XPath queries"""
        self.doc.xpathContext.xpathRegisterNs(prefix, uri)

    def __str__(self):
@ -75,13 +83,29 @@ class XPathSelector(object):


 class XPathSelectorList(list):
-
-    def extract(self, **kwargs):
-        return [x.extract(**kwargs) if isinstance(x, XPathSelector) else x for x in self]
+    """List of XPathSelector objects"""

    def x(self, xpath):
+        """Perform the given XPath query on each XPathSelector of the list and
+        return a new (flattened) XPathSelectorList of the results"""
        return XPathSelectorList(flatten([x.x(xpath) for x in self]))

    def re(self, regex):
+        """Perform the re() method on each XPathSelector of the list, and
+        return the result as a flattened list of unicode strings"""
        return flatten([x.re(regex) for x in self])
    
+    def extract(self):
+        """Return a list of unicode strings with the content referenced by each
+        XPathSelector of the list"""
+        return [x.extract() if isinstance(x, XPathSelector) else x for x in self]
+
+class XmlXPathSelector(XPathSelector):
+    """XPathSelector for XML content"""
+    def __init__(self, response=None, text=None):
+        XPathSelector.__init__(self, response=response, text=text, constructor=xmlDoc_from_xml)
+
+class HtmlXPathSelector(XPathSelector):
+    """XPathSelector for HTML content"""
+    def __init__(self, response=None, text=None):
+        XPathSelector.__init__(self, response=response, text=text, constructor=xmlDoc_from_html)