1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-23 03:43:43 +00:00

Use safe_url_string in canonicalize_url to avoid converting safe characters into their percent-encoded representation, which leads to errors with many sites (RFC 3986). Closes #297

This commit is contained in:
Martin Olveyra 2010-12-08 16:28:38 -02:00
parent 6a1b69c93f
commit 02ccca01eb
2 changed files with 6 additions and 1 deletions

View File

@ -266,6 +266,11 @@ class UrlUtilsTest(unittest.TestCase):
self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag", keep_fragments=True),
u"http://user:pass@www.example.com/do?a=1#frag")
# don't convert safe characters to their percent-encoded representation
self.assertEqual(canonicalize_url(
"http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"),
"http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html")
# urllib.quote uses a mapping cache of encoded characters. when parsing
# an already percent-encoded url, it will fail if that url was not
# percent-encoded as utf-8, that's why canonicalize_url must always

View File

@ -152,7 +152,7 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \
keyvals = cgi.parse_qsl(query, keep_blank_values)
keyvals.sort()
query = urllib.urlencode(keyvals)
path = urllib.quote(urllib.unquote(path))
path = safe_url_string(urllib.unquote(path))
fragment = '' if not keep_fragments else fragment
return urlparse.urlunparse((scheme, netloc.lower(), path, params, query, fragment))