1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-26 07:03:49 +00:00

added a proper fix for the canonicalize_url problem with unicode URLs that are already percent-encoded

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40437
This commit is contained in:
Pablo Hoffman 2008-11-26 22:10:06 +00:00
parent 8d08ab5f98
commit af842a3e6d
2 changed files with 19 additions and 4 deletions

View File

@ -82,9 +82,12 @@ class UrlUtilsTest(unittest.TestCase):
'product.html?id=200&foo=bar')
def test_canonicalize_url(self):
# no query arguments
# simplest case
self.assertEqual(canonicalize_url("http://www.example.com"),
"http://www.example.com")
"http://www.example.com")
# always return a str
assert isinstance(canonicalize_url(u"http://www.example.com"), str)
# typical usage
self.assertEqual(canonicalize_url("http://www.example.com/do?a=1&b=2&c=3"),
@ -96,7 +99,7 @@ class UrlUtilsTest(unittest.TestCase):
# sorting by argument values
self.assertEqual(canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"),
"http://www.example.com/do?a=50&b=2&b=5&c=3")
"http://www.example.com/do?a=50&b=2&b=5&c=3")
# using keep_blank_values
self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2"),
@ -148,6 +151,14 @@ class UrlUtilsTest(unittest.TestCase):
self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag", keep_fragments=True),
u"http://user:pass@www.example.com/do?a=1#frag")
# urllib.quote uses a mapping cache of encoded characters. when parsing
# an already percent-encoded url, it will fail if that url was not
# percent-encoded as utf-8, that's why canonicalize_url must always
# convert the urls to string. the following test asserts that
# functionality.
self.assertEqual(canonicalize_url(u'http://www.example.com/caf%E9-con-leche.htm'),
'http://www.example.com/caf%E9-con-leche.htm')
def test_check_valid_urlencode(self):
    """A raw backslash is not valid percent-encoding; its %5C form is."""
    raw = r'http://www.example.com/pictures\detail\CAN43664.jpg'
    encoded = 'http://www.example.com/pictures%5Cdetail%5CCAN43664.jpg'
    # The unescaped backslashes must be rejected...
    self.assertFalse(check_valid_urlencode(raw))
    # ...while the percent-encoded equivalent is accepted.
    self.assertTrue(check_valid_urlencode(encoded))

View File

@ -125,7 +125,7 @@ def add_or_replace_parameter(url, name, new_value, sep='&'):
return next_url
def canonicalize_url(url, keep_blank_values=False, keep_fragments=False):
"""Canonicalize url by applying the following procedures:
"""Canonicalize the given url by applying the following procedures:
- sort query arguments, first by key, then by value
- percent encode paths and query arguments. non-ASCII characters are
@ -135,9 +135,13 @@ def canonicalize_url(url, keep_blank_values=False, keep_fragments=False):
- remove query arguments with blank values (unless keep_blank_values is True)
- remove fragments (unless keep_fragments is True)
The url passed can be a str or unicode, while the url returned is always a
str.
For examples see the tests in scrapy.tests.test_utils_url
"""
url = url.encode('utf-8')
parts = list(urlparse.urlparse(url))
keyvals = cgi.parse_qsl(parts[4], keep_blank_values)
keyvals.sort()