mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-26 07:03:49 +00:00
Added a proper fix for the canonicalize_url problem with unicode URLs that are already percent-encoded
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40437
This commit is contained in:
parent
8d08ab5f98
commit
af842a3e6d
@ -82,9 +82,12 @@ class UrlUtilsTest(unittest.TestCase):
|
||||
'product.html?id=200&foo=bar')
|
||||
|
||||
def test_canonicalize_url(self):
|
||||
# no query arguments
|
||||
# simplest case
|
||||
self.assertEqual(canonicalize_url("http://www.example.com"),
|
||||
"http://www.example.com")
|
||||
"http://www.example.com")
|
||||
|
||||
# always return a str
|
||||
assert isinstance(canonicalize_url(u"http://www.example.com"), str)
|
||||
|
||||
# typical usage
|
||||
self.assertEqual(canonicalize_url("http://www.example.com/do?a=1&b=2&c=3"),
|
||||
@ -96,7 +99,7 @@ class UrlUtilsTest(unittest.TestCase):
|
||||
|
||||
# sorting by argument values
|
||||
self.assertEqual(canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"),
|
||||
"http://www.example.com/do?a=50&b=2&b=5&c=3")
|
||||
"http://www.example.com/do?a=50&b=2&b=5&c=3")
|
||||
|
||||
# using keep_blank_values
|
||||
self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2"),
|
||||
@ -148,6 +151,14 @@ class UrlUtilsTest(unittest.TestCase):
|
||||
self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag", keep_fragments=True),
|
||||
u"http://user:pass@www.example.com/do?a=1#frag")
|
||||
|
||||
# urllib.quote uses a mapping cache of encoded characters. When parsing
|
||||
# an already percent-encoded url, it will fail if that url was not
|
||||
# percent-encoded as utf-8. That is why canonicalize_url must always
|
||||
# convert the urls to str. The following test asserts that
|
||||
# functionality.
|
||||
self.assertEqual(canonicalize_url(u'http://www.example.com/caf%E9-con-leche.htm'),
|
||||
'http://www.example.com/caf%E9-con-leche.htm')
|
||||
|
||||
def test_check_valid_urlencode(self):
|
||||
self.assertFalse(check_valid_urlencode(r'http://www.example.com/pictures\detail\CAN43664.jpg'))
|
||||
self.assertTrue(check_valid_urlencode('http://www.example.com/pictures%5Cdetail%5CCAN43664.jpg'))
|
||||
|
@ -125,7 +125,7 @@ def add_or_replace_parameter(url, name, new_value, sep='&'):
|
||||
return next_url
|
||||
|
||||
def canonicalize_url(url, keep_blank_values=False, keep_fragments=False):
|
||||
"""Canonicalize url by applying the following procedures:
|
||||
"""Canonicalize the given url by applying the following procedures:
|
||||
|
||||
- sort query arguments, first by key, then by value
|
||||
- percent encode paths and query arguments. non-ASCII characters are
|
||||
@ -135,9 +135,13 @@ def canonicalize_url(url, keep_blank_values=False, keep_fragments=False):
|
||||
- remove query arguments with blank values (unless keep_blank_values is True)
|
||||
- remove fragments (unless keep_fragments is True)
|
||||
|
||||
The url passed can be a str or unicode, while the url returned is always a
|
||||
str.
|
||||
|
||||
For examples see the tests in scrapy.tests.test_utils_url
|
||||
"""
|
||||
|
||||
url = url.encode('utf-8')
|
||||
parts = list(urlparse.urlparse(url))
|
||||
keyvals = cgi.parse_qsl(parts[4], keep_blank_values)
|
||||
keyvals.sort()
|
||||
|
Loading…
x
Reference in New Issue
Block a user