
replaced XMLNodeIterator with xpathselector_iternodes

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%4034
This commit is contained in:
Pablo Hoffman 2008-06-30 17:20:56 +00:00
parent 3ee14952aa
commit 1f7f0d0994
5 changed files with 60 additions and 157 deletions
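
In effect, this commit swaps a class-based iterator for a generator function with the same loop shape. A minimal before/after sketch of a call site (a hedged illustration using names from this diff; `process` stands in for arbitrary user code and is not part of the change):

# Before this commit: class-based iterator
from scrapy.xpath.iterator import XMLNodeIterator
for x in XMLNodeIterator(response, 'product'):
    process(x)

# After this commit: generator function, same iteration contract
from scrapy.utils.xml import xpathselector_iternodes
for x in xpathselector_iternodes(response, 'product'):
    process(x)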

View File

@@ -0,0 +1,30 @@
import unittest

from scrapy.utils.xml import xpathselector_iternodes
from scrapy.http import Response


class UtilsXmlTestCase(unittest.TestCase):

    def test_iterator(self):
        body = """<?xml version="1.0" encoding="UTF-8"?>
        <products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="someschmea.xsd">
          <product id="001">
            <type>Type 1</type>
            <name>Name 1</name>
          </product>
          <product id="002">
            <type>Type 2</type>
            <name>Name 2</name>
          </product>
        </products>
        """

        response = Response(domain="example.com", url="http://example.com", body=body)
        attrs = []
        for x in xpathselector_iternodes(response, 'product'):
            attrs.append((x.x("@id").extract(), x.x("name/text()").extract(), x.x("./type/text()").extract()))

        self.assertEqual(attrs,
                         [(['001'], ['Name 1'], ['Type 1']), (['002'], ['Name 2'], ['Type 2'])])


if __name__ == "__main__":
    unittest.main()

View File

@@ -6,8 +6,6 @@ import libxml2
from scrapy.http import Response
from scrapy.xpath.selector import XPathSelector, XmlXPathSelector, HtmlXPathSelector
#from scrapy.xpath.constructors import xmlDoc_from_xml, xmlDoc_from_html
from scrapy.xpath.iterator import XMLNodeIterator
class XPathTestCase(unittest.TestCase):
@@ -173,26 +171,5 @@ class XPathTestCase(unittest.TestCase):
        self.assertEqual(xxs.extract(),
                         u'<root>lala</root>')

    def test_iterator(self):
        body = """<?xml version="1.0" encoding="UTF-8"?>
        <products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="someschmea.xsd">
          <product id="001">
            <type>Type 1</type>
            <name>Name 1</name>
          </product>
          <product id="002">
            <type>Type 2</type>
            <name>Name 2</name>
          </product>
        </products>
        """

        response = Response(domain="example.com", url="http://example.com", body=body)
        attrs = []
        for x in XMLNodeIterator(response, 'product'):
            attrs.append((x.x("@id").extract(), x.x("name/text()").extract(), x.x("./type/text()").extract()))

        self.assertEqual(attrs,
                         [(['001'], ['Name 1'], ['Type 1']), (['002'], ['Name 2'], ['Type 2'])])


if __name__ == "__main__":
    unittest.main()

View File

@ -0,0 +1,30 @@
import re
from scrapy.xpath import XmlXPathSelector
from scrapy.http import Response
def xpathselector_iternodes(obj, nodename):
"""Return a iterator of XPathSelector's over all nodes of a XML document,
given tha name of the node to iterate. Useful for parsing XML feeds.
obj can be:
- a Response object
- a unicode string
- a string encoded as utf-8
"""
assert isinstance(obj, (Response, basestring)), "obj must be Response or basestring, not %s" % type(obj).__name__
if isinstance(obj, Response):
text = obj.body.to_string()
enc = obj.body.get_real_encoding()
else:
text = obj
enc = 'utf-8'
r = re.compile(r"<%s[\s>].*?</%s>" % (nodename, nodename), re.DOTALL)
for match in r.finditer(text):
nodetext = match.group().decode(enc)
yield XmlXPathSelector(text=nodetext).x('/' + nodename)[0]
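
A quick usage sketch for the plain-string path (a hedged illustration, not part of this commit; it reuses the `body` document from the tests above and assumes this diff's 2008-era API):

body_utf8 = body.encode('utf-8')  # byte strings are decoded as utf-8
for node in xpathselector_iternodes(body_utf8, 'product'):
    print node.x("@id").extract()  # e.g. ['001'], then ['002']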

View File

@@ -7,9 +7,7 @@ spiders, for convenience.
* XPath - a simple class to represent an XPath expression
* XPathSelector - to extract data using XPaths (parses the entire response)
* XMLNodeIterator - to iterate over XML nodes without parsing the entire response in memory
"""
from scrapy.xpath.types import XPath
from scrapy.xpath.selector import XPathSelector, XmlXPathSelector, HtmlXPathSelector
from scrapy.xpath.iterator import XMLNodeIterator

View File

@@ -1,132 +0,0 @@
import re
from xml.sax.saxutils import escape
from cStringIO import StringIO

import libxml2

from scrapy.xpath.constructors import xml_parser_options
from scrapy.xpath.selector import XmlXPathSelector


class XMLNodeIterator(object):
    """XMLNodeIterator provides a way to iterate over all nodes of the same
    name (passed in the constructor) in an XML Response without parsing the
    entire response in memory. The iterator returns XPathSelector objects.

    Usage example:

    for x in XMLNodeIterator(response, "product"):
        i = ScrapedItem()
        i.assign("id", x.x("@id"))
        i.assign("name", x.x("./name/text()"))
    """
    def __init__(self, response, node, chunk_size=2048):
        self.response = response
        self.node = node
        self.chunk_size = chunk_size  # was hardcoded to 2048, ignoring the argument

    def __iter__(self):
        sax_parser = XMLNodeSAXParser(self.node, self.response)
        contents = self.response.body.to_string()
        ctxt = libxml2.createPushParser(sax_parser, '', 0, None)
        ctxt.ctxtUseOptions(xml_parser_options)
        # Feed the document to the push parser in chunks, yielding selectors
        # as soon as the SAX handler has assembled complete nodes
        for i in xrange(0, len(contents), self.chunk_size):
            chunk = contents[i:i + self.chunk_size]
            ctxt.parseChunk(chunk, len(chunk), 0)
            while sax_parser.selectors:
                yield sax_parser.selectors.pop(0)
        ctxt.parseChunk('', 0, 1)
class XMLNodeSAXParser():

    xmldeclr_re = re.compile(r'<\?xml.*?\?>')

    def __init__(self, requested_nodename, response):
        self.requested_nodename = requested_nodename
        self.inside_requested_node = False
        self.buffer = StringIO()
        self.xml_declaration = self._extract_xmldecl(response.body.to_string()[0:4096])
        self.selectors = []

    def startElement(self, name, attributes):
        if name == self.requested_nodename:
            self.inside_requested_node = True
            self.buffer.close()
            self.buffer = StringIO()

        attributes = attributes or {}
        attribute_strings = ["%s='%s'" % tuple(ka) for ka in attributes.items()]
        self.buffer.write('<' + ' '.join([name] + attribute_strings) + '>')

    def endElement(self, name):
        self.buffer.write('</%s>' % name)
        if name == self.requested_nodename:
            self.inside_requested_node = False
            string = ''.join([self.xml_declaration, self.buffer.getvalue()])
            selector = XmlXPathSelector(text=string).x('/' + self.requested_nodename)[0]
            self.selectors.append(selector)

    def characters(self, data):
        if self.inside_requested_node:
            self.buffer.write(escape(data))

    def cdataBlock(self, data):
        if self.inside_requested_node:
            self.buffer.write('<![CDATA[' + data + ']]>')

    def _extract_xmldecl(self, string):
        m = self.xmldeclr_re.search(string)
        return m.group() if m else ''
# TESTING #

from xml.parsers.expat import ParserCreate

class expat_XMLNodeIterator():

    def __init__(self, response, req_nodename, chunk_size=2048):
        self._response = response
        self._req_nodename = req_nodename
        self._chunk_size = chunk_size
        self._byte_offset_buffer = []
        self._parser = ParserCreate()
        self._parser.StartElementHandler = self._StartElementHandler
        self._parser.EndElementHandler = self._EndElementHandler

    def _StartElementHandler(self, name, attrs):
        if name == self._req_nodename and not self._inside_req_node:
            self._start_pos = self._parser.CurrentByteIndex
            self._inside_req_node = True

    def _EndElementHandler(self, name):
        if name == self._req_nodename and self._inside_req_node:
            self._byte_offset_buffer.append((self._start_pos, self._parser.CurrentByteIndex))
            self._inside_req_node = False

    def __iter__(self):
        response_body = self._response.body.to_string()
        self._inside_req_node = False
        for i in xrange(0, len(response_body), self._chunk_size):
            self._parser.Parse(response_body[i:i + self._chunk_size])
            while self._byte_offset_buffer:
                start, end = self._byte_offset_buffer.pop(0)
                yield response_body[start:end]
        self._parser.Parse('', 1)
# TESTING (pablo) #

# Yet another node iterator: this one is based entirely on regular
# expressions, which means it should be faster, but it needs some profiling
# to confirm that.

class re_XMLNodeIterator():

    def __init__(self, response, node):
        self.response = response
        self.node = node
        self.re = re.compile(r"<%s[\s>].*?</%s>" % (node, node), re.DOTALL)

    def __iter__(self):
        for match in self.re.finditer(self.response.body.to_string()):
            yield XmlXPathSelector(text=match.group()).x('/' + self.node)[0]
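
The comment above leaves the speed question open. A minimal profiling sketch (a hypothetical harness, not part of this commit; it assumes a `response` built as in the tests, and note that expat_XMLNodeIterator yields raw strings rather than selectors):

import time

for cls in (XMLNodeIterator, expat_XMLNodeIterator, re_XMLNodeIterator):
    start = time.time()
    for _ in xrange(100):
        list(cls(response, 'product'))  # exhaust the iterator to pay the full parsing cost
    print cls.__name__, time.time() - start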