mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-28 18:58:34 +00:00
Issue GH #1550: fixed bugs in scrapy.utils.url.add_http_if_no_scheme(): when given a URI whose scheme is present but is not 'http', the function returned a bad result
This commit is contained in:
parent
98a2e77a75
commit
a41c64bfb9
@ -6,6 +6,7 @@ Some of the functions that used to be imported from this module have been moved
|
|||||||
to the w3lib.url module. Always import those from there instead.
|
to the w3lib.url module. Always import those from there instead.
|
||||||
"""
|
"""
|
||||||
import posixpath
|
import posixpath
|
||||||
|
from urlparse import urlsplit, urlunsplit
|
||||||
from six.moves.urllib.parse import (ParseResult, urlunparse, urldefrag,
|
from six.moves.urllib.parse import (ParseResult, urlunparse, urldefrag,
|
||||||
urlparse, parse_qsl, urlencode,
|
urlparse, parse_qsl, urlencode,
|
||||||
unquote)
|
unquote)
|
||||||
@ -114,10 +115,16 @@ def escape_ajax(url):
|
|||||||
|
|
||||||
def add_http_if_no_scheme(url):
|
def add_http_if_no_scheme(url):
|
||||||
"""Add http as the default scheme if it is missing from the url."""
|
"""Add http as the default scheme if it is missing from the url."""
|
||||||
if url.startswith('//'):
|
parts = urlsplit(url)
|
||||||
url = 'http:' + url
|
scheme = parts.scheme or "http"
|
||||||
return url
|
if parts.netloc:
|
||||||
parser = parse_url(url)
|
netloc = parts.netloc
|
||||||
if not parser.scheme or not parser.netloc:
|
path = parts.path
|
||||||
url = 'http://' + url
|
else:
|
||||||
return url
|
path_parts = url.split("/", 1)
|
||||||
|
netloc = path_parts[0]
|
||||||
|
path = path_parts[1] if len(path_parts) > 1 else "/"
|
||||||
|
|
||||||
|
return urlunsplit((
|
||||||
|
scheme, netloc, path, parts.query, parts.fragment
|
||||||
|
))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user