Merge remote-tracking branch 'origin/master' into xmliter-unicode

2025-02-25 16:04:01 +00:00 · 2016-01-13 11:42:01 +01:00 · 2016-01-13 11:42:01 +01:00 · 2f2c2e8096
commit 2f2c2e8096
parent 6ddd814738 f01fd07642
3 changed files with 17 additions and 14 deletions
--- a/scrapy/utils/iterators.py
+++ b/scrapy/utils/iterators.py
@@ -1,12 +1,11 @@
 import re
 import csv
 import logging
-
 try:
    from cStringIO import StringIO as BytesIO
 except ImportError:
    from io import BytesIO
-
+from io import StringIO
 import six

 from scrapy.http import TextResponse, Response
@@ -65,7 +64,7 @@ class _StreamReader(object):
            self._text, self.encoding = obj.body, obj.encoding
        else:
            self._text, self.encoding = obj, 'utf-8'
-        self._is_unicode = isinstance(self._text, unicode)
+        self._is_unicode = isinstance(self._text, six.text_type)

    def read(self, n=65535):
        self.read = self._read_unicode if self._is_unicode else self._read_string
@@ -102,7 +101,11 @@ def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
    def _getrow(csv_r):
        return [to_unicode(field, encoding) for field in next(csv_r)]

-    lines = BytesIO(_body_or_str(obj, unicode=False))
+    # Python 3 csv reader input object needs to return strings
+    if six.PY3:
+        lines = StringIO(_body_or_str(obj, unicode=True))
+    else:
+        lines = BytesIO(_body_or_str(obj, unicode=False))

    kwargs = {}
    if delimiter: kwargs["delimiter"] = delimiter
--- a/tests/py3-ignores.txt
+++ b/tests/py3-ignores.txt
@@ -16,7 +16,6 @@ tests/test_pipeline_files.py
 tests/test_pipeline_images.py
 tests/test_proxy_connect.py
 tests/test_spidermiddleware_httperror.py
-tests/test_utils_iterators.py
 tests/test_utils_template.py
 tests/test_webclient.py

--- a/tests/test_utils_iterators.py
+++ b/tests/test_utils_iterators.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 import os
+import six
 from twisted.trial import unittest

 from scrapy.utils.iterators import csviter, xmliter, _body_or_str, xmliter_lxml
@@ -148,7 +149,7 @@ class XmliterTestCase(unittest.TestCase):
        body = b'<?xml version="1.0" encoding="ISO-8859-9"?>\n<xml>\n    <item>Some Turkish Characters \xd6\xc7\xde\xdd\xd0\xdc \xfc\xf0\xfd\xfe\xe7\xf6</item>\n</xml>\n\n'
        response = XmlResponse('http://www.example.com', body=body)
        self.assertEqual(
-            self.xmliter(response, 'item').next().extract(),
+            next(self.xmliter(response, 'item')).extract(),
            u'<item>Some Turkish Characters \xd6\xc7\u015e\u0130\u011e\xdc \xfc\u011f\u0131\u015f\xe7\xf6</item>'
        )

@@ -238,11 +239,11 @@ class UtilsCsvTestCase(unittest.TestCase):

        # explicit type check cuz' we no like stinkin' autocasting! yarrr
        for result_row in result:
-            self.assert_(all((isinstance(k, unicode) for k in result_row.keys())))
-            self.assert_(all((isinstance(v, unicode) for v in result_row.values())))
+            self.assert_(all((isinstance(k, six.text_type) for k in result_row.keys())))
+            self.assert_(all((isinstance(v, six.text_type) for v in result_row.values())))

    def test_csviter_delimiter(self):
-        body = get_testdata('feeds', 'feed-sample3.csv').replace(',', '\t')
+        body = get_testdata('feeds', 'feed-sample3.csv').replace(b',', b'\t')
        response = TextResponse(url="http://example.com/", body=body)
        csv = csviter(response, delimiter='\t')

@@ -254,7 +255,7 @@ class UtilsCsvTestCase(unittest.TestCase):

    def test_csviter_quotechar(self):
        body1 = get_testdata('feeds', 'feed-sample6.csv')
-        body2 = get_testdata('feeds', 'feed-sample6.csv').replace(",", '|')
+        body2 = get_testdata('feeds', 'feed-sample6.csv').replace(b',', b'|')

        response1 = TextResponse(url="http://example.com/", body=body1)
        csv1 = csviter(response1, quotechar="'")
@@ -286,7 +287,7 @@ class UtilsCsvTestCase(unittest.TestCase):
                          {u"'id'": u"4",   u"'name'": u"'empty'",   u"'value'": u""}])

    def test_csviter_delimiter_binary_response_assume_utf8_encoding(self):
-        body = get_testdata('feeds', 'feed-sample3.csv').replace(',', '\t')
+        body = get_testdata('feeds', 'feed-sample3.csv').replace(b',', b'\t')
        response = Response(url="http://example.com/", body=body)
        csv = csviter(response, delimiter='\t')

@@ -298,10 +299,10 @@ class UtilsCsvTestCase(unittest.TestCase):

    def test_csviter_headers(self):
        sample = get_testdata('feeds', 'feed-sample3.csv').splitlines()
-        headers, body = sample[0].split(','), '\n'.join(sample[1:])
+        headers, body = sample[0].split(b','), b'\n'.join(sample[1:])

        response = TextResponse(url="http://example.com/", body=body)
-        csv = csviter(response, headers=headers)
+        csv = csviter(response, headers=[h.decode('utf-8') for h in headers])

        self.assertEqual([row for row in csv],
                         [{u'id': u'1', u'name': u'alpha',   u'value': u'foobar'},
@@ -311,7 +312,7 @@ class UtilsCsvTestCase(unittest.TestCase):

    def test_csviter_falserow(self):
        body = get_testdata('feeds', 'feed-sample3.csv')
-        body = '\n'.join((body, 'a,b', 'a,b,c,d'))
+        body = b'\n'.join((body, b'a,b', b'a,b,c,d'))

        response = TextResponse(url="http://example.com/", body=body)
        csv = csviter(response)