diff --git a/scrapy/trunk/scrapy/tests/test_utils_url.py b/scrapy/trunk/scrapy/tests/test_utils_url.py
index 2b0fcb660..e68136a6a 100644
--- a/scrapy/trunk/scrapy/tests/test_utils_url.py
+++ b/scrapy/trunk/scrapy/tests/test_utils_url.py
@@ -82,9 +82,12 @@ class UrlUtilsTest(unittest.TestCase):
                          'product.html?id=200&foo=bar')
 
     def test_canonicalize_url(self):
-        # no query arguments
+        # simplest case
         self.assertEqual(canonicalize_url("http://www.example.com"),
-                                          "http://www.example.com")
+                                           "http://www.example.com")
+
+        # always return a str
+        assert isinstance(canonicalize_url(u"http://www.example.com"), str)
 
         # typical usage
         self.assertEqual(canonicalize_url("http://www.example.com/do?a=1&b=2&c=3"),
@@ -96,7 +99,7 @@ class UrlUtilsTest(unittest.TestCase):
 
         # sorting by argument values
         self.assertEqual(canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"),
-                                          "http://www.example.com/do?a=50&b=2&b=5&c=3")
+                                           "http://www.example.com/do?a=50&b=2&b=5&c=3")
 
         # using keep_blank_values
         self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2"),
@@ -148,6 +151,14 @@ class UrlUtilsTest(unittest.TestCase):
         self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag", keep_fragments=True),
                                           u"http://user:pass@www.example.com/do?a=1#frag")
 
+        # urllib.quote uses a mapping cache of encoded characters. when parsing
+        # an already percent-encoded url, it will fail if that url was not
+        # percent-encoded as utf-8; that's why canonicalize_url must always
+        # convert the urls to string. the following test asserts that
+        # functionality.
+        self.assertEqual(canonicalize_url(u'http://www.example.com/caf%E9-con-leche.htm'),
+                                          'http://www.example.com/caf%E9-con-leche.htm')
+
     def test_check_valid_urlencode(self):
         self.assertFalse(check_valid_urlencode(r'http://www.example.com/pictures\detail\CAN43664.jpg'))
         self.assertTrue(check_valid_urlencode('http://www.example.com/pictures%5Cdetail%5CCAN43664.jpg'))
diff --git a/scrapy/trunk/scrapy/utils/url.py b/scrapy/trunk/scrapy/utils/url.py
index 5f07283f7..a2b586d5c 100644
--- a/scrapy/trunk/scrapy/utils/url.py
+++ b/scrapy/trunk/scrapy/utils/url.py
@@ -125,7 +125,7 @@ def add_or_replace_parameter(url, name, new_value, sep='&'):
     return next_url
 
 def canonicalize_url(url, keep_blank_values=False, keep_fragments=False):
-    """Canonicalize url by applying the following procedures:
+    """Canonicalize the given url by applying the following procedures:
 
     - sort query arguments, first by key, then by value
     - percent encode paths and query arguments. non-ASCII characters are
@@ -135,9 +135,13 @@ def canonicalize_url(url, keep_blank_values=False, keep_fragments=False):
     - remove query arguments with blank values (unless keep_blank_values is True)
     - remove fragments (unless keep_fragments is True)
 
+    The url passed can be a str or unicode, while the url returned is always a
+    str.
+
     For examples see the tests in scrapy.tests.test_utils_url
     """
+    url = url.encode('utf-8')
    parts = list(urlparse.urlparse(url))
    keyvals = cgi.parse_qsl(parts[4], keep_blank_values)
    keyvals.sort()