From 02ccca01ebd4d407fd39ad1053395346fe669baa Mon Sep 17 00:00:00 2001 From: Martin Olveyra Date: Wed, 8 Dec 2010 16:28:38 -0200 Subject: [PATCH] use safe_url_string in canonicalize_url to avoid converting safe characters into their percent-encoded representation, which leads to errors with many sites (RFC 3986). closes #297 --- scrapy/tests/test_utils_url.py | 5 +++++ scrapy/utils/url.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/scrapy/tests/test_utils_url.py b/scrapy/tests/test_utils_url.py index ec9422cbd..6e7e8bb3f 100644 --- a/scrapy/tests/test_utils_url.py +++ b/scrapy/tests/test_utils_url.py @@ -266,6 +266,11 @@ class UrlUtilsTest(unittest.TestCase): self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag", keep_fragments=True), u"http://user:pass@www.example.com/do?a=1#frag") + # dont convert safe characters to percent encoding representation + self.assertEqual(canonicalize_url( + "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"), + "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html") + # urllib.quote uses a mapping cache of encoded characters. when parsing # an already percent-encoded url, it will fail if that url was not # percent-encoded as utf-8, that's why canonicalize_url must always diff --git a/scrapy/utils/url.py b/scrapy/utils/url.py index dfb7f7bb8..3dc77153f 100644 --- a/scrapy/utils/url.py +++ b/scrapy/utils/url.py @@ -152,7 +152,7 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \ keyvals = cgi.parse_qsl(query, keep_blank_values) keyvals.sort() query = urllib.urlencode(keyvals) - path = urllib.quote(urllib.unquote(path)) + path = safe_url_string(urllib.unquote(path)) fragment = '' if not keep_fragments else fragment return urlparse.urlunparse((scheme, netloc.lower(), path, params, query, fragment))