Support Unicode tags in XML iterators (fixes #1665)

2025-02-24 22:23:46 +00:00 · 2016-01-12 10:48:45 +01:00 · 2016-01-12 10:48:45 +01:00 · 6ddd814738
commit 6ddd814738
parent 95e8ff8ba1
2 changed files with 55 additions and 6 deletions
--- a/scrapy/utils/iterators.py
+++ b/scrapy/utils/iterators.py
@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)

 def xmliter(obj, nodename):
    """Return a iterator of Selector's over all nodes of a XML document,
-       given tha name of the node to iterate. Useful for parsing XML feeds.
+       given the name of the node to iterate. Useful for parsing XML feeds.

    obj can be:
    - a Response object
@ -36,7 +36,7 @@ def xmliter(obj, nodename):
    header_end = re_rsearch(HEADER_END_RE, text)
    header_end = text[header_end[1]:].strip() if header_end else ''

-    r = re.compile(r"<{0}[\s>].*?</{0}>".format(nodename_patt), re.DOTALL)
+    r = re.compile(r'<%(np)s[\s>].*?</%(np)s>' % {'np': nodename_patt}, re.DOTALL)
    for match in r.finditer(text):
        nodetext = header_start + match.group() + header_end
        yield Selector(text=nodetext, type='xml').xpath('//' + nodename)[0]
@ -49,7 +49,7 @@ def xmliter_lxml(obj, nodename, namespace=None, prefix='x'):
    iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
    selxpath = '//' + ('%s:%s' % (prefix, nodename) if namespace else nodename)
    for _, node in iterable:
-        nodetext = etree.tostring(node)
+        nodetext = etree.tostring(node, encoding='unicode')
        node.clear()
        xs = Selector(text=nodetext, type='xml')
        if namespace:
@ -94,7 +94,7 @@ def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):

    headers is an iterable that when provided offers the keys
    for the returned dictionaries, if not the first row is used.
-    
+
    quotechar is the character used to enclosure fields on the given obj.
    """

@ -125,7 +125,7 @@ def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):


 def _body_or_str(obj, unicode=True):
-    assert isinstance(obj, (Response, six.string_types)), \
+    assert isinstance(obj, (Response, six.string_types, bytes)), \
        "obj must be Response or basestring, not %s" % type(obj).__name__
    if isinstance(obj, Response):
        if not unicode:
--- a/tests/test_utils_iterators.py
+++ b/tests/test_utils_iterators.py
@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 import os
 from twisted.trial import unittest

@ -45,6 +46,54 @@ class XmliterTestCase(unittest.TestCase):
                 for e in self.xmliter(response, 'matchme...')]
        self.assertEqual(nodenames, [['matchme...']])

+    def test_xmliter_unicode(self):
+        # example taken from https://github.com/scrapy/scrapy/issues/1665
+        body = """<?xml version="1.0" encoding="UTF-8"?>
+            <þingflokkar>
+               <þingflokkur id="26">
+                  <heiti />
+                  <skammstafanir>
+                     <stuttskammstöfun>-</stuttskammstöfun>
+                     <löngskammstöfun />
+                  </skammstafanir>
+                  <tímabil>
+                     <fyrstaþing>80</fyrstaþing>
+                  </tímabil>
+               </þingflokkur>
+               <þingflokkur id="21">
+                  <heiti>Alþýðubandalag</heiti>
+                  <skammstafanir>
+                     <stuttskammstöfun>Ab</stuttskammstöfun>
+                     <löngskammstöfun>Alþb.</löngskammstöfun>
+                  </skammstafanir>
+                  <tímabil>
+                     <fyrstaþing>76</fyrstaþing>
+                     <síðastaþing>123</síðastaþing>
+                  </tímabil>
+               </þingflokkur>
+               <þingflokkur id="27">
+                  <heiti>Alþýðuflokkur</heiti>
+                  <skammstafanir>
+                     <stuttskammstöfun>A</stuttskammstöfun>
+                     <löngskammstöfun>Alþfl.</löngskammstöfun>
+                  </skammstafanir>
+                  <tímabil>
+                     <fyrstaþing>27</fyrstaþing>
+                     <síðastaþing>120</síðastaþing>
+                  </tímabil>
+               </þingflokkur>
+            </þingflokkar>"""
+        response = XmlResponse(url="http://example.com", body=body)
+        attrs = []
+        for x in self.xmliter(response, u'þingflokkur'):
+            attrs.append((x.xpath('@id').extract(),
+                          x.xpath(u'./skammstafanir/stuttskammstöfun/text()').extract(),
+                          x.xpath(u'./tímabil/fyrstaþing/text()').extract()))
+
+        self.assertEqual(attrs,
+                         [([u'26'], [u'-'], [u'80']),
+                          ([u'21'], [u'Ab'], [u'76']),
+                          ([u'27'], [u'A'], [u'27'])])

    def test_xmliter_text(self):
        body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
@ -206,7 +255,7 @@ class UtilsCsvTestCase(unittest.TestCase):
    def test_csviter_quotechar(self):
        body1 = get_testdata('feeds', 'feed-sample6.csv')
        body2 = get_testdata('feeds', 'feed-sample6.csv').replace(",", '|')
-        
+
        response1 = TextResponse(url="http://example.com/", body=body1)
        csv1 = csviter(response1, quotechar="'")