mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-23 13:24:20 +00:00
Use safe_url_string in canonicalize_url to avoid converting safe characters into their percent-encoded representation, which led to errors with many sites (see RFC 3986). Closes #297
This commit is contained in:
parent
6a1b69c93f
commit
02ccca01eb
@ -266,6 +266,11 @@ class UrlUtilsTest(unittest.TestCase):
|
||||
self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag", keep_fragments=True),
|
||||
u"http://user:pass@www.example.com/do?a=1#frag")
|
||||
|
||||
# don't convert safe characters to their percent-encoded representation
|
||||
self.assertEqual(canonicalize_url(
|
||||
"http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"),
|
||||
"http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html")
|
||||
|
||||
# urllib.quote uses a mapping cache of encoded characters. when parsing
|
||||
# an already percent-encoded url, it will fail if that url was not
|
||||
# percent-encoded as utf-8, that's why canonicalize_url must always
|
||||
|
@ -152,7 +152,7 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \
|
||||
keyvals = cgi.parse_qsl(query, keep_blank_values)
|
||||
keyvals.sort()
|
||||
query = urllib.urlencode(keyvals)
|
||||
path = urllib.quote(urllib.unquote(path))
|
||||
path = safe_url_string(urllib.unquote(path))
|
||||
fragment = '' if not keep_fragments else fragment
|
||||
return urlparse.urlunparse((scheme, netloc.lower(), path, params, query, fragment))
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user