1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 22:23:46 +00:00

Support unicode tags in xml iterators (fixes #1665)

This commit is contained in:
Paul Tremberth 2016-01-12 10:48:45 +01:00
parent 95e8ff8ba1
commit 6ddd814738
2 changed files with 55 additions and 6 deletions

View File

@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
def xmliter(obj, nodename):
"""Return a iterator of Selector's over all nodes of a XML document,
given tha name of the node to iterate. Useful for parsing XML feeds.
given the name of the node to iterate. Useful for parsing XML feeds.
obj can be:
- a Response object
@ -36,7 +36,7 @@ def xmliter(obj, nodename):
header_end = re_rsearch(HEADER_END_RE, text)
header_end = text[header_end[1]:].strip() if header_end else ''
r = re.compile(r"<{0}[\s>].*?</{0}>".format(nodename_patt), re.DOTALL)
r = re.compile(r'<%(np)s[\s>].*?</%(np)s>' % {'np': nodename_patt}, re.DOTALL)
for match in r.finditer(text):
nodetext = header_start + match.group() + header_end
yield Selector(text=nodetext, type='xml').xpath('//' + nodename)[0]
@ -49,7 +49,7 @@ def xmliter_lxml(obj, nodename, namespace=None, prefix='x'):
iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
selxpath = '//' + ('%s:%s' % (prefix, nodename) if namespace else nodename)
for _, node in iterable:
nodetext = etree.tostring(node)
nodetext = etree.tostring(node, encoding='unicode')
node.clear()
xs = Selector(text=nodetext, type='xml')
if namespace:
@ -94,7 +94,7 @@ def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
headers is an iterable that when provided offers the keys
for the returned dictionaries, if not the first row is used.
quotechar is the character used to enclose fields on the given obj.
"""
@ -125,7 +125,7 @@ def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
def _body_or_str(obj, unicode=True):
assert isinstance(obj, (Response, six.string_types)), \
assert isinstance(obj, (Response, six.string_types, bytes)), \
"obj must be Response or basestring, not %s" % type(obj).__name__
if isinstance(obj, Response):
if not unicode:

View File

@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
import os
from twisted.trial import unittest
@ -45,6 +46,54 @@ class XmliterTestCase(unittest.TestCase):
for e in self.xmliter(response, 'matchme...')]
self.assertEqual(nodenames, [['matchme...']])
def test_xmliter_unicode(self):
    """Check that ``self.xmliter`` handles non-ASCII tag names.

    The document uses Icelandic element names; the iterator is asked for
    every ``þingflokkur`` node, and for each node the ``id`` attribute and
    two nested text values are extracted and compared with the expected
    triples, in document order.
    """
    # example taken from https://github.com/scrapy/scrapy/issues/1665
    body = u"""<?xml version="1.0" encoding="UTF-8"?>
<þingflokkar>
<þingflokkur id="26">
<heiti />
<skammstafanir>
<stuttskammstöfun>-</stuttskammstöfun>
<löngskammstöfun />
</skammstafanir>
<tímabil>
<fyrstaþing>80</fyrstaþing>
</tímabil>
</þingflokkur>
<þingflokkur id="21">
<heiti>Alþýðubandalag</heiti>
<skammstafanir>
<stuttskammstöfun>Ab</stuttskammstöfun>
<löngskammstöfun>Alþb.</löngskammstöfun>
</skammstafanir>
<tímabil>
<fyrstaþing>76</fyrstaþing>
<síðastaþing>123</síðastaþing>
</tímabil>
</þingflokkur>
<þingflokkur id="27">
<heiti>Alþýðuflokkur</heiti>
<skammstafanir>
<stuttskammstöfun>A</stuttskammstöfun>
<löngskammstöfun>Alþfl.</löngskammstöfun>
</skammstafanir>
<tímabil>
<fyrstaþing>27</fyrstaþing>
<síðastaþing>120</síðastaþing>
</tímabil>
</þingflokkur>
</þingflokkar>"""
    # The literal is explicitly unicode and encoded here so the response
    # always receives UTF-8 bytes: a native "..." literal is bytes on
    # Python 2 but text on Python 3, and a text body without an explicit
    # encoding is not accepted by the response class.
    response = XmlResponse(url="http://example.com",
                           body=body.encode('utf-8'))
    attrs = [
        (x.xpath('@id').extract(),
         x.xpath(u'./skammstafanir/stuttskammstöfun/text()').extract(),
         x.xpath(u'./tímabil/fyrstaþing/text()').extract())
        for x in self.xmliter(response, u'þingflokkur')
    ]
    self.assertEqual(attrs,
                     [([u'26'], [u'-'], [u'80']),
                      ([u'21'], [u'Ab'], [u'76']),
                      ([u'27'], [u'A'], [u'27'])])
def test_xmliter_text(self):
body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
@ -206,7 +255,7 @@ class UtilsCsvTestCase(unittest.TestCase):
def test_csviter_quotechar(self):
body1 = get_testdata('feeds', 'feed-sample6.csv')
body2 = get_testdata('feeds', 'feed-sample6.csv').replace(",", '|')
response1 = TextResponse(url="http://example.com/", body=body1)
csv1 = csviter(response1, quotechar="'")