1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-26 07:03:49 +00:00

added a proper fix for the canonicalize_url problem with unicode URLs that are already percent-encoded

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40437
This commit is contained in:
Pablo Hoffman 2008-11-26 22:10:06 +00:00
parent 8d08ab5f98
commit af842a3e6d
2 changed files with 19 additions and 4 deletions

View File

@ -82,9 +82,12 @@ class UrlUtilsTest(unittest.TestCase):
'product.html?id=200&foo=bar')
def test_canonicalize_url(self):
# no query arguments
# simplest case
self.assertEqual(canonicalize_url("http://www.example.com"),
"http://www.example.com")
"http://www.example.com")
# always return a str
assert isinstance(canonicalize_url(u"http://www.example.com"), str)
# typical usage
self.assertEqual(canonicalize_url("http://www.example.com/do?a=1&b=2&c=3"),
@ -96,7 +99,7 @@ class UrlUtilsTest(unittest.TestCase):
# sorting by argument values
self.assertEqual(canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"),
"http://www.example.com/do?a=50&b=2&b=5&c=3")
"http://www.example.com/do?a=50&b=2&b=5&c=3")
# using keep_blank_values
self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2"),
@ -148,6 +151,14 @@ class UrlUtilsTest(unittest.TestCase):
self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag", keep_fragments=True),
u"http://user:pass@www.example.com/do?a=1#frag")
# urllib.quote uses a mapping cache of encoded characters. when parsing
# an already percent-encoded url, it will fail if that url was not
# percent-encoded as utf-8, that's why canonicalize_url must always
# convert the urls to string. the following test asserts that
# functionality.
self.assertEqual(canonicalize_url(u'http://www.example.com/caf%E9-con-leche.htm'),
'http://www.example.com/caf%E9-con-leche.htm')
def test_check_valid_urlencode(self):
    """A raw backslash is not valid percent-encoding; its %5C form is."""
    raw = r'http://www.example.com/pictures\detail\CAN43664.jpg'
    encoded = 'http://www.example.com/pictures%5Cdetail%5CCAN43664.jpg'
    # The unescaped backslashes must be rejected...
    self.assertFalse(check_valid_urlencode(raw))
    # ...while the percent-encoded equivalent is accepted.
    self.assertTrue(check_valid_urlencode(encoded))

View File

@ -125,7 +125,7 @@ def add_or_replace_parameter(url, name, new_value, sep='&'):
return next_url
def canonicalize_url(url, keep_blank_values=False, keep_fragments=False):
"""Canonicalize url by applying the following procedures:
"""Canonicalize the given url by applying the following procedures:
- sort query arguments, first by key, then by value
- percent encode paths and query arguments. non-ASCII characters are
@ -135,9 +135,13 @@ def canonicalize_url(url, keep_blank_values=False, keep_fragments=False):
- remove query arguments with blank values (unless keep_blank_values is True)
- remove fragments (unless keep_fragments is True)
The url passed can be a str or unicode, while the url returned is always a
str.
For examples see the tests in scrapy.tests.test_utils_url
"""
url = url.encode('utf-8')
parts = list(urlparse.urlparse(url))
keyvals = cgi.parse_qsl(parts[4], keep_blank_values)
keyvals.sort()