
replaced XMLNodeIterator with xpathselector_iternodes

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%4034
This commit is contained in:
Pablo Hoffman 2008-06-30 17:20:56 +00:00
parent 3ee14952aa
commit 1f7f0d0994
5 changed files with 60 additions and 157 deletions
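
In effect, this commit swaps a class-based iterator for a generator function with the same loop shape. A minimal before/after sketch of a call site (a hedged illustration using names from this diff; `process` stands in for arbitrary user code and is not part of the change):

# Before this commit: class-based iterator
from scrapy.xpath.iterator import XMLNodeIterator
for x in XMLNodeIterator(response, 'product'):
    process(x)

# After this commit: generator function, same iteration contract
from scrapy.utils.xml import xpathselector_iternodes
for x in xpathselector_iternodes(response, 'product'):
    process(x)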

View File

@@ -0,0 +1,30 @@
import unittest

from scrapy.utils.xml import xpathselector_iternodes
from scrapy.http import Response


class UtilsXmlTestCase(unittest.TestCase):

    def test_iterator(self):
        body = """<?xml version="1.0" encoding="UTF-8"?>
        <products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="someschmea.xsd">
          <product id="001">
            <type>Type 1</type>
            <name>Name 1</name>
          </product>
          <product id="002">
            <type>Type 2</type>
            <name>Name 2</name>
          </product>
        </products>
        """

        response = Response(domain="example.com", url="http://example.com", body=body)
        attrs = []
        for x in xpathselector_iternodes(response, 'product'):
            attrs.append((x.x("@id").extract(), x.x("name/text()").extract(), x.x("./type/text()").extract()))

        self.assertEqual(attrs,
                         [(['001'], ['Name 1'], ['Type 1']), (['002'], ['Name 2'], ['Type 2'])])


if __name__ == "__main__":
    unittest.main()

View File

@@ -6,8 +6,6 @@ import libxml2
from scrapy.http import Response
from scrapy.xpath.selector import XPathSelector, XmlXPathSelector, HtmlXPathSelector
#from scrapy.xpath.constructors import xmlDoc_from_xml, xmlDoc_from_html
from scrapy.xpath.iterator import XMLNodeIterator
class XPathTestCase(unittest.TestCase):
@@ -173,26 +171,5 @@ class XPathTestCase(unittest.TestCase):
        self.assertEqual(xxs.extract(),
                         u'<root>lala</root>')

    def test_iterator(self):
        body = """<?xml version="1.0" encoding="UTF-8"?>
        <products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="someschmea.xsd">
          <product id="001">
            <type>Type 1</type>
            <name>Name 1</name>
          </product>
          <product id="002">
            <type>Type 2</type>
            <name>Name 2</name>
          </product>
        </products>
        """

        response = Response(domain="example.com", url="http://example.com", body=body)
        attrs = []
        for x in XMLNodeIterator(response, 'product'):
            attrs.append((x.x("@id").extract(), x.x("name/text()").extract(), x.x("./type/text()").extract()))

        self.assertEqual(attrs,
                         [(['001'], ['Name 1'], ['Type 1']), (['002'], ['Name 2'], ['Type 2'])])


if __name__ == "__main__":
    unittest.main()

View File

@ -0,0 +1,30 @@
import re
from scrapy.xpath import XmlXPathSelector
from scrapy.http import Response
def xpathselector_iternodes(obj, nodename):
"""Return a iterator of XPathSelector's over all nodes of a XML document,
given tha name of the node to iterate. Useful for parsing XML feeds.
obj can be:
- a Response object
- a unicode string
- a string encoded as utf-8
"""
assert isinstance(obj, (Response, basestring)), "obj must be Response or basestring, not %s" % type(obj).__name__
if isinstance(obj, Response):
text = obj.body.to_string()
enc = obj.body.get_real_encoding()
else:
text = obj
enc = 'utf-8'
r = re.compile(r"<%s[\s>].*?</%s>" % (nodename, nodename), re.DOTALL)
for match in r.finditer(text):
nodetext = match.group().decode(enc)
yield XmlXPathSelector(text=nodetext).x('/' + nodename)[0]
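
A quick usage sketch for the plain-string path (a hedged illustration, not part of this commit; it reuses the `body` document from the tests above and assumes this diff's 2008-era API):

body_utf8 = body.encode('utf-8')  # byte strings are decoded as utf-8
for node in xpathselector_iternodes(body_utf8, 'product'):
    print node.x("@id").extract()  # e.g. ['001'], then ['002']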

View File

@@ -7,9 +7,7 @@ spiders, for convenience.
* XPath - a simple class to represent an XPath expression
* XPathSelector - to extract data using XPaths (parses the entire response)
* XMLNodeIterator - to iterate over XML nodes without parsing the entire response in memory
"""
from scrapy.xpath.types import XPath
from scrapy.xpath.selector import XPathSelector, XmlXPathSelector, HtmlXPathSelector
from scrapy.xpath.iterator import XMLNodeIterator

View File

@@ -1,132 +0,0 @@
import re
from xml.sax.saxutils import escape
from cStringIO import StringIO

import libxml2

from scrapy.xpath.constructors import xml_parser_options
from scrapy.xpath.selector import XmlXPathSelector


class XMLNodeIterator(object):
    """XMLNodeIterator provides a way to iterate over all nodes of the same
    name (passed in the constructor) in an XML Response without parsing the
    entire response in memory. The iterator returns XPathSelector objects.

    Usage example:

    for x in XMLNodeIterator(response, "product"):
        i = ScrapedItem()
        i.assign("id", x.x("@id"))
        i.assign("name", x.x("./name/text()"))
    """
    def __init__(self, response, node, chunk_size=2048):
        self.response = response
        self.node = node
        self.chunk_size = chunk_size  # was hardcoded to 2048, ignoring the argument

    def __iter__(self):
        sax_parser = XMLNodeSAXParser(self.node, self.response)
        contents = self.response.body.to_string()
        ctxt = libxml2.createPushParser(sax_parser, '', 0, None)
        ctxt.ctxtUseOptions(xml_parser_options)
        # Feed the document to the push parser in chunks, yielding selectors
        # as soon as the SAX handler has assembled complete nodes
        for i in xrange(0, len(contents), self.chunk_size):
            chunk = contents[i:i + self.chunk_size]
            ctxt.parseChunk(chunk, len(chunk), 0)
            while sax_parser.selectors:
                yield sax_parser.selectors.pop(0)
        ctxt.parseChunk('', 0, 1)
class XMLNodeSAXParser():

    xmldeclr_re = re.compile(r'<\?xml.*?\?>')

    def __init__(self, requested_nodename, response):
        self.requested_nodename = requested_nodename
        self.inside_requested_node = False
        self.buffer = StringIO()
        self.xml_declaration = self._extract_xmldecl(response.body.to_string()[0:4096])
        self.selectors = []

    def startElement(self, name, attributes):
        if name == self.requested_nodename:
            self.inside_requested_node = True
            self.buffer.close()
            self.buffer = StringIO()

        attributes = attributes or {}
        attribute_strings = ["%s='%s'" % tuple(ka) for ka in attributes.items()]
        self.buffer.write('<' + ' '.join([name] + attribute_strings) + '>')

    def endElement(self, name):
        self.buffer.write('</%s>' % name)
        if name == self.requested_nodename:
            self.inside_requested_node = False
            string = ''.join([self.xml_declaration, self.buffer.getvalue()])
            selector = XmlXPathSelector(text=string).x('/' + self.requested_nodename)[0]
            self.selectors.append(selector)

    def characters(self, data):
        if self.inside_requested_node:
            self.buffer.write(escape(data))

    def cdataBlock(self, data):
        if self.inside_requested_node:
            self.buffer.write('<![CDATA[' + data + ']]>')

    def _extract_xmldecl(self, string):
        m = self.xmldeclr_re.search(string)
        return m.group() if m else ''
# TESTING #

from xml.parsers.expat import ParserCreate

class expat_XMLNodeIterator():

    def __init__(self, response, req_nodename, chunk_size=2048):
        self._response = response
        self._req_nodename = req_nodename
        self._chunk_size = chunk_size
        self._byte_offset_buffer = []
        self._parser = ParserCreate()
        self._parser.StartElementHandler = self._StartElementHandler
        self._parser.EndElementHandler = self._EndElementHandler

    def _StartElementHandler(self, name, attrs):
        if name == self._req_nodename and not self._inside_req_node:
            self._start_pos = self._parser.CurrentByteIndex
            self._inside_req_node = True

    def _EndElementHandler(self, name):
        if name == self._req_nodename and self._inside_req_node:
            self._byte_offset_buffer.append((self._start_pos, self._parser.CurrentByteIndex))
            self._inside_req_node = False

    def __iter__(self):
        response_body = self._response.body.to_string()
        self._inside_req_node = False
        for i in xrange(0, len(response_body), self._chunk_size):
            self._parser.Parse(response_body[i:i + self._chunk_size])
            while self._byte_offset_buffer:
                start, end = self._byte_offset_buffer.pop(0)
                yield response_body[start:end]
        self._parser.Parse('', 1)
# TESTING (pablo) #

# Yet another node iterator: this one is based entirely on regular
# expressions, which means it should be faster, but it needs some profiling
# to confirm that.

class re_XMLNodeIterator():

    def __init__(self, response, node):
        self.response = response
        self.node = node
        self.re = re.compile(r"<%s[\s>].*?</%s>" % (node, node), re.DOTALL)

    def __iter__(self):
        for match in self.re.finditer(self.response.body.to_string()):
            yield XmlXPathSelector(text=match.group()).x('/' + self.node)[0]
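
The comment above leaves the speed question open. A minimal profiling sketch (a hypothetical harness, not part of this commit; it assumes a `response` built as in the tests, and note that expat_XMLNodeIterator yields raw strings rather than selectors):

import time

for cls in (XMLNodeIterator, expat_XMLNodeIterator, re_XMLNodeIterator):
    start = time.time()
    for _ in xrange(100):
        list(cls(response, 'product'))  # exhaust the iterator to pay the full parsing cost
    print cls.__name__, time.time() - start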