issue GH #1550 - rewritten add_http_if_no_scheme()

2025-02-28 18:38:43 +00:00 · 2015-11-03 14:32:30 +03:00 · 2015-11-03 14:32:30 +03:00 · dd45b31fe4
commit dd45b31fe4
parent bc9db65358
1 changed file with 7 additions and 13 deletions
--- a/scrapy/utils/url.py
+++ b/scrapy/utils/url.py
@@ -6,7 +6,7 @@ Some of the functions that used to be imported from this module have been moved
 to the w3lib.url module. Always import those from there instead.
 """
 import posixpath
-from urlparse import urlsplit, urlunsplit
+import re
 from six.moves.urllib.parse import (ParseResult, urlunparse, urldefrag,
                                    urlparse, parse_qsl, urlencode,
                                    unquote)
@@ -115,16 +115,10 @@ def escape_ajax(url):
 def add_http_if_no_scheme(url):
    """Add http as the default scheme if it is missing from the url."""
-    parts = urlsplit(url)
+    match = re.match(r"^\w+://", url, flags=re.I)
-    scheme = parts.scheme or "http"
+    parts = urlparse(url)
-    if parts.netloc:
+    if not match:
-        netloc = parts.netloc
+        scheme = "http:" if parts.netloc else "http://"
-        path = parts.path
+        url = scheme + url
-    else:
-        path_parts = url.split("/", 1)
-        netloc = path_parts[0]
-        path = path_parts[1] if len(path_parts) > 1 else "/"
-    return urlunsplit((
+    return url
-        scheme, netloc, path, parts.query, parts.fragment
-    ))