Issue GH #1550: fix bugs in scrapy.utils.url.add_http_if_no_scheme(). When given a URI whose scheme is present but is not 'http', the function returned a bad result.

2025-02-28 18:58:34 +00:00 · 2015-11-02 16:06:21 +03:00 · 2015-11-02 16:06:21 +03:00 · a41c64bfb9
commit a41c64bfb9
parent 98a2e77a75
1 changed files with 14 additions and 7 deletions
--- a/scrapy/utils/url.py
+++ b/scrapy/utils/url.py
@ -6,6 +6,7 @@ Some of the functions that used to be imported from this module have been moved
 to the w3lib.url module. Always import those from there instead.
 """
 import posixpath
 from urlparse import urlsplit, urlunsplit
 from six.moves.urllib.parse import (ParseResult, urlunparse, urldefrag,
                                    urlparse, parse_qsl, urlencode,
                                    unquote)
@ -114,10 +115,16 @@ def escape_ajax(url):
 def add_http_if_no_scheme(url):
     """Add http as the default scheme if it is missing from the url.

     Three input shapes are handled:

     * ``scheme://...`` (any scheme, e.g. ``https`` or ``ftp``) is returned
       unchanged;
     * a protocol-relative ``//host/path`` gets ``http:`` prepended;
     * anything else (``host``, ``host:port/path``, ``user:pass@host``) gets
       ``http://`` prepended.

     Only the ``scheme://`` form counts as "has a scheme"; a string such as
     ``mailto:someone`` is therefore treated as schemeless.

     :param url: the URL string to normalise.
     :return: ``url`` itself when it already carries a scheme, otherwise
         ``url`` with an ``http`` prefix added in front of it.
     """
     import re

     # Scheme syntax follows RFC 3986: a letter, then letters, digits,
     # "+", "." or "-". Requiring the "://" separator keeps "host:port" and
     # "user:pass@host" from being mistaken for "scheme:path".
     if re.match(r"[a-z][a-z0-9+.-]*://", url, flags=re.I):
         return url

     # Prefix the original string instead of rebuilding it from parsed
     # parts, so the path, query and fragment are passed through verbatim
     # (no duplicated query string, no trailing slash added to a bare host).
     prefix = "http:" if urlparse(url).netloc else "http://"
     return prefix + url