1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-28 18:38:43 +00:00

issue GH #1550 - rewritten add_http_if_no_scheme()

This commit is contained in:
Leonid Amirov 2015-11-03 14:32:30 +03:00
parent bc9db65358
commit dd45b31fe4

View File

@ -6,7 +6,7 @@ Some of the functions that used to be imported from this module have been moved
to the w3lib.url module. Always import those from there instead. to the w3lib.url module. Always import those from there instead.
""" """
import posixpath import posixpath
from urlparse import urlsplit, urlunsplit import re
from six.moves.urllib.parse import (ParseResult, urlunparse, urldefrag, from six.moves.urllib.parse import (ParseResult, urlunparse, urldefrag,
urlparse, parse_qsl, urlencode, urlparse, parse_qsl, urlencode,
unquote) unquote)
@ -115,16 +115,10 @@ def escape_ajax(url):
def add_http_if_no_scheme(url):
    """Add http as the default scheme if it is missing from the url.

    Three shapes of input are handled:

    * ``scheme://...``  -- returned unchanged, whatever the scheme is.
    * ``//host/path``   -- protocol-relative; only ``http:`` is prepended.
    * anything else     -- treated as ``host[/path]``; ``http://`` is
      prepended.

    :param url: the url (or scheme-less host/path string) to normalise
    :return: ``url`` itself when it already starts with a scheme,
        otherwise ``url`` prefixed with ``http:`` or ``http://``
    """
    # A scheme may contain "+", "-" and "." after its first character
    # (e.g. "svn+ssh", "coap+tcp"), so a plain ``\w+`` is too narrow and
    # would turn "svn+ssh://host" into "http://svn+ssh://host".
    # ``\w`` is already case-insensitive, so no re.I flag is needed.
    if re.match(r"^\w[\w+.-]*://", url):
        return url

    # Only parse when a prefix is actually needed.  A non-empty netloc here
    # means the url started with "//", which must not be doubled.
    prefix = "http:" if urlparse(url).netloc else "http://"
    return prefix + url