Use safe_url_string in canonicalize_url to avoid converting safe characters into their percent-encoded representation, which led to errors with many sites (RFC 3986). closes #297

2025-02-23 13:24:20 +00:00 · 2010-12-08 16:28:38 -02:00 · 2010-12-08 16:28:38 -02:00 · 02ccca01eb
commit 02ccca01eb
parent 6a1b69c93f
2 changed files with 6 additions and 1 deletions
--- a/scrapy/tests/test_utils_url.py
+++ b/scrapy/tests/test_utils_url.py
@@ -266,6 +266,11 @@ class UrlUtilsTest(unittest.TestCase):
        self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag", keep_fragments=True),
                                          u"http://user:pass@www.example.com/do?a=1#frag")

+        # don't convert safe characters to their percent-encoded representation
+        self.assertEqual(canonicalize_url(
+            "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"),
+            "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html")
+
        # urllib.quote uses a mapping cache of encoded characters. when parsing
        # an already percent-encoded url, it will fail if that url was not
        # percent-encoded as utf-8, that's why canonicalize_url must always
--- a/scrapy/utils/url.py
+++ b/scrapy/utils/url.py
@@ -152,7 +152,7 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \
    keyvals = cgi.parse_qsl(query, keep_blank_values)
    keyvals.sort()
    query = urllib.urlencode(keyvals)
-    path = urllib.quote(urllib.unquote(path))
+    path = safe_url_string(urllib.unquote(path))
    fragment = '' if not keep_fragments else fragment
    return urlparse.urlunparse((scheme, netloc.lower(), path, params, query, fragment))