mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-24 22:23:46 +00:00
Support unicode tags in xml iterators (fixes #1665)
This commit is contained in:
parent
95e8ff8ba1
commit
6ddd814738
@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
def xmliter(obj, nodename):
|
||||
"""Return an iterator of Selectors over all nodes of an XML document,
|
||||
given tha name of the node to iterate. Useful for parsing XML feeds.
|
||||
given the name of the node to iterate. Useful for parsing XML feeds.
|
||||
|
||||
obj can be:
|
||||
- a Response object
|
||||
@ -36,7 +36,7 @@ def xmliter(obj, nodename):
|
||||
header_end = re_rsearch(HEADER_END_RE, text)
|
||||
header_end = text[header_end[1]:].strip() if header_end else ''
|
||||
|
||||
r = re.compile(r"<{0}[\s>].*?</{0}>".format(nodename_patt), re.DOTALL)
|
||||
r = re.compile(r'<%(np)s[\s>].*?</%(np)s>' % {'np': nodename_patt}, re.DOTALL)
|
||||
for match in r.finditer(text):
|
||||
nodetext = header_start + match.group() + header_end
|
||||
yield Selector(text=nodetext, type='xml').xpath('//' + nodename)[0]
|
||||
@ -49,7 +49,7 @@ def xmliter_lxml(obj, nodename, namespace=None, prefix='x'):
|
||||
iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
|
||||
selxpath = '//' + ('%s:%s' % (prefix, nodename) if namespace else nodename)
|
||||
for _, node in iterable:
|
||||
nodetext = etree.tostring(node)
|
||||
nodetext = etree.tostring(node, encoding='unicode')
|
||||
node.clear()
|
||||
xs = Selector(text=nodetext, type='xml')
|
||||
if namespace:
|
||||
@ -94,7 +94,7 @@ def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
|
||||
|
||||
headers is an iterable that when provided offers the keys
|
||||
for the returned dictionaries, if not the first row is used.
|
||||
|
||||
|
||||
quotechar is the character used to enclose fields on the given obj.
|
||||
"""
|
||||
|
||||
@ -125,7 +125,7 @@ def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
|
||||
|
||||
|
||||
def _body_or_str(obj, unicode=True):
|
||||
assert isinstance(obj, (Response, six.string_types)), \
|
||||
assert isinstance(obj, (Response, six.string_types, bytes)), \
|
||||
"obj must be Response or basestring, not %s" % type(obj).__name__
|
||||
if isinstance(obj, Response):
|
||||
if not unicode:
|
||||
|
@ -1,3 +1,4 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
from twisted.trial import unittest
|
||||
|
||||
@ -45,6 +46,54 @@ class XmliterTestCase(unittest.TestCase):
|
||||
for e in self.xmliter(response, 'matchme...')]
|
||||
self.assertEqual(nodenames, [['matchme...']])
|
||||
|
||||
def test_xmliter_unicode(self):
    """Check that xmliter handles non-ASCII tag names.

    Regression test for https://github.com/scrapy/scrapy/issues/1665; the
    sample document below is taken from that issue.
    """
    body = u"""<?xml version="1.0" encoding="UTF-8"?>
        <þingflokkar>
           <þingflokkur id="26">
              <heiti />
              <skammstafanir>
                 <stuttskammstöfun>-</stuttskammstöfun>
                 <löngskammstöfun />
              </skammstafanir>
              <tímabil>
                 <fyrstaþing>80</fyrstaþing>
              </tímabil>
           </þingflokkur>
           <þingflokkur id="21">
              <heiti>Alþýðubandalag</heiti>
              <skammstafanir>
                 <stuttskammstöfun>Ab</stuttskammstöfun>
                 <löngskammstöfun>Alþb.</löngskammstöfun>
              </skammstafanir>
              <tímabil>
                 <fyrstaþing>76</fyrstaþing>
                 <síðastaþing>123</síðastaþing>
              </tímabil>
           </þingflokkur>
           <þingflokkur id="27">
              <heiti>Alþýðuflokkur</heiti>
              <skammstafanir>
                 <stuttskammstöfun>A</stuttskammstöfun>
                 <löngskammstöfun>Alþfl.</löngskammstöfun>
              </skammstafanir>
              <tímabil>
                 <fyrstaþing>27</fyrstaþing>
                 <síðastaþing>120</síðastaþing>
              </tímabil>
           </þingflokkur>
        </þingflokkar>"""
    # Hand the response UTF-8 bytes explicitly. A bare (unprefixed) literal
    # is bytes on Python 2 but text on Python 3, and no encoding is passed
    # to XmlResponse here, so the original form depended on the interpreter.
    response = XmlResponse(url="http://example.com",
                           body=body.encode('utf-8'))

    # One (id, short abbreviation, first session) tuple per matched node.
    attrs = [
        (
            x.xpath('@id').extract(),
            x.xpath(u'./skammstafanir/stuttskammstöfun/text()').extract(),
            x.xpath(u'./tímabil/fyrstaþing/text()').extract(),
        )
        for x in self.xmliter(response, u'þingflokkur')
    ]

    self.assertEqual(attrs,
                     [([u'26'], [u'-'], [u'80']),
                      ([u'21'], [u'Ab'], [u'76']),
                      ([u'27'], [u'A'], [u'27'])])
|
||||
|
||||
def test_xmliter_text(self):
|
||||
body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
|
||||
@ -206,7 +255,7 @@ class UtilsCsvTestCase(unittest.TestCase):
|
||||
def test_csviter_quotechar(self):
|
||||
body1 = get_testdata('feeds', 'feed-sample6.csv')
|
||||
body2 = get_testdata('feeds', 'feed-sample6.csv').replace(",", '|')
|
||||
|
||||
|
||||
response1 = TextResponse(url="http://example.com/", body=body1)
|
||||
csv1 = csviter(response1, quotechar="'")
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user