1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-28 18:58:34 +00:00

Issue GH #1550 - fixed bugs in scrapy.utils.url.add_http_if_no_scheme(): when given a URI whose scheme is present but is not 'http', the function returned an incorrect result.

This commit is contained in:
Leonid Amirov 2015-11-02 16:06:21 +03:00
parent 98a2e77a75
commit a41c64bfb9

View File

@@ -6,6 +6,7 @@ Some of the functions that used to be imported from this module have been moved
 to the w3lib.url module. Always import those from there instead.
 """
 import posixpath
+from urlparse import urlsplit, urlunsplit
 from six.moves.urllib.parse import (ParseResult, urlunparse, urldefrag,
                                     urlparse, parse_qsl, urlencode,
                                     unquote)
@@ -114,10 +115,16 @@ def escape_ajax(url):
 def add_http_if_no_scheme(url):
     """Add http as the default scheme if it is missing from the url."""
-    if url.startswith('//'):
-        url = 'http:' + url
-        return url
-    parser = parse_url(url)
-    if not parser.scheme or not parser.netloc:
-        url = 'http://' + url
-    return url
+    parts = urlsplit(url)
+    scheme = parts.scheme or "http"
+    if parts.netloc:
+        netloc = parts.netloc
+        path = parts.path
+    else:
+        path_parts = url.split("/", 1)
+        netloc = path_parts[0]
+        path = path_parts[1] if len(path_parts) > 1 else "/"
+    return urlunsplit((
+        scheme, netloc, path, parts.query, parts.fragment
+    ))