Mirror of https://github.com/scrapy/scrapy.git
Merge pull request #2168 from advarisk/w3lib-canonicalize-url

[MRG+1] Use w3lib.url.canonicalize_url() from w3lib 1.15.0

Commit 241bd00e76
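In short: canonicalize_url() and its helpers move out of scrapy.utils.url and are consumed from w3lib 1.15.0 instead; the minimum w3lib version is bumped accordingly and the duplicated tests are dropped. For reference, the upstreamed function behaves as below (input and expected output taken from the tests removed further down; a minimal sketch, assuming w3lib >= 1.15.0 is installed):

    from w3lib.url import canonicalize_url

    # Query arguments are sorted (by key, then by value), so URLs that
    # differ only in argument order canonicalize to the same string.
    print(canonicalize_url("http://www.example.com/do?c=1&b=2&a=3"))
    # http://www.example.com/do?a=3&b=2&c=1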
@@ -103,7 +103,7 @@ LxmlLinkExtractor
    :type attrs: list

    :param canonicalize: canonicalize each extracted url (using
-       scrapy.utils.url.canonicalize_url). Defaults to ``True``.
+       w3lib.url.canonicalize_url). Defaults to ``True``.
    :type canonicalize: boolean

    :param unique: whether duplicate filtering should be applied to extracted
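The canonicalize flag documented above is used like this; a hedged sketch (LxmlLinkExtractor and extract_links() come from the module patched in this diff, the sample HTML is invented for illustration):

    from scrapy.http import HtmlResponse
    from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor

    response = HtmlResponse(
        url="http://www.example.com",
        body=b'<a href="http://www.example.com/do?c=1&b=2&a=3">link</a>',
        encoding='utf-8',
    )
    # canonicalize=True (the default) passes every extracted URL through
    # w3lib.url.canonicalize_url(), e.g. sorting its query arguments.
    links = LxmlLinkExtractor(canonicalize=True).extract_links(response)
    print([link.url for link in links])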
@@ -2,7 +2,7 @@ Twisted>=10.0.0
 lxml
 pyOpenSSL
 cssselect>=0.9
-w3lib>=1.14.2
+w3lib>=1.15.0
 queuelib
 six>=1.5.2
 PyDispatcher>=2.0.5
@@ -9,10 +9,11 @@ import re

 from six.moves.urllib.parse import urlparse
 from parsel.csstranslator import HTMLTranslator
+from w3lib.url import canonicalize_url

 from scrapy.utils.misc import arg_to_iter
 from scrapy.utils.url import (
-    canonicalize_url, url_is_from_any_domain, url_has_any_extension,
+    url_is_from_any_domain, url_has_any_extension,
 )


@@ -11,7 +11,7 @@ from six.moves.urllib.parse import urlunparse
 from w3lib.http import basic_auth_header
 from scrapy.utils.python import to_bytes, to_native_str

-from scrapy.utils.url import canonicalize_url
+from w3lib.url import canonicalize_url
 from scrapy.utils.httpobj import urlparse_cached


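Judging by the neighboring imports (urlparse_cached, basic_auth_header), this hunk belongs to Scrapy's request-fingerprinting utilities; only the import source changes, not the call sites. As an illustration of why fingerprinting wants a canonicalized URL, here is a simplified sketch (not Scrapy's actual algorithm; the function name is made up):

    import hashlib

    from w3lib.url import canonicalize_url

    def simplified_fingerprint(method, url, body=b''):
        # Canonicalizing first makes URLs that differ only in query-argument
        # order or percent-encoding case hash to the same fingerprint.
        fp = hashlib.sha1()
        fp.update(method.encode('ascii'))
        fp.update(canonicalize_url(url).encode('utf-8'))
        fp.update(body)
        return fp.hexdigest()

    assert (simplified_fingerprint('GET', 'http://www.example.com/do?a=1&b=2') ==
            simplified_fingerprint('GET', 'http://www.example.com/do?b=2&a=1'))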
@@ -7,18 +7,13 @@ to the w3lib.url module. Always import those from there instead.
 """
 import posixpath
 import re
 import six
-from six.moves.urllib.parse import (ParseResult, urlunparse, urldefrag,
-                                    urlparse, parse_qsl, urlencode,
-                                    quote, unquote)
-if not six.PY2:
-    from urllib.parse import unquote_to_bytes
+from six.moves.urllib.parse import (ParseResult, urldefrag, urlparse)

 # scrapy.utils.url was moved to w3lib.url and import * ensures this
 # move doesn't break old code
 from w3lib.url import *
-from w3lib.url import _safe_chars
-from scrapy.utils.python import to_bytes, to_native_str, to_unicode
+from w3lib.url import _safe_chars, _unquotepath
+from scrapy.utils.python import to_unicode


 def url_is_from_any_domain(url, domains):
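Note that `from w3lib.url import *` stays: code that still imports these names from scrapy.utils.url keeps working, because the star import re-exports w3lib's public names. A quick check one could run (assuming w3lib >= 1.15.0):

    from scrapy.utils.url import canonicalize_url as scrapy_canonicalize_url
    from w3lib.url import canonicalize_url as w3lib_canonicalize_url

    # Both names resolve to the same function object after the move.
    assert scrapy_canonicalize_url is w3lib_canonicalize_url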
@@ -40,121 +35,6 @@ def url_has_any_extension(url, extensions):
     return posixpath.splitext(parse_url(url).path)[1].lower() in extensions


-def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
-    # IDNA encoding can fail for too long labels (>63 characters)
-    # or missing labels (e.g. http://.example.com)
-    try:
-        netloc = parts.netloc.encode('idna')
-    except UnicodeError:
-        netloc = parts.netloc
-
-    return (
-        to_native_str(parts.scheme),
-        to_native_str(netloc),
-
-        # default encoding for path component SHOULD be UTF-8
-        quote(to_bytes(parts.path, path_encoding), _safe_chars),
-        quote(to_bytes(parts.params, path_encoding), _safe_chars),
-
-        # encoding of query and fragment follows page encoding
-        # or form-charset (if known and passed)
-        quote(to_bytes(parts.query, encoding), _safe_chars),
-        quote(to_bytes(parts.fragment, encoding), _safe_chars)
-    )
-
-
-def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
-                     encoding=None):
-    """Canonicalize the given url by applying the following procedures:
-
-    - sort query arguments, first by key, then by value
-    - percent-encode paths; non-ASCII characters are percent-encoded
-      using UTF-8 (RFC 3986)
-    - percent-encode query arguments; non-ASCII characters are percent-encoded
-      using the passed `encoding` (UTF-8 by default)
-    - normalize all spaces (in query arguments) to '+' (plus symbol)
-    - normalize percent-encoding case (%2f -> %2F)
-    - remove query arguments with blank values (unless `keep_blank_values` is True)
-    - remove fragments (unless `keep_fragments` is True)
-
-    The url passed can be bytes or unicode, while the url returned is
-    always a native str (bytes in Python 2, unicode in Python 3).
-
-    For examples see the tests in tests/test_utils_url.py
-    """
-    # If the supplied `encoding` is not compatible with all characters in `url`,
-    # fall back to UTF-8 as a safety net.
-    # UTF-8 can handle all Unicode characters,
-    # so we should be covered regarding URL normalization,
-    # if not for the exact URL expected by the remote website.
-    try:
-        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
-            parse_url(url), encoding=encoding)
-    except UnicodeEncodeError as e:
-        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
-            parse_url(url), encoding='utf8')
-
-    # 1. decode query-string as UTF-8 (or keep raw bytes),
-    #    sort values,
-    #    and percent-encode them back
-    if six.PY2:
-        keyvals = parse_qsl(query, keep_blank_values)
-    else:
-        # Python 3's urllib.parse.parse_qsl does not work as wanted
-        # for percent-encoded characters that do not match the passed
-        # encoding: they get lost.
-        #
-        # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
-        # (i.e. with 'REPLACEMENT CHARACTER' (U+FFFD),
-        # instead of the \xa3 you get with Python 2's parse_qsl)
-        #
-        # what we want here is to keep raw bytes, and percent-encode them
-        # back, so as to preserve whatever encoding was originally used.
-        #
-        # See https://tools.ietf.org/html/rfc3987#section-6.4:
-        #
-        #   For example, it is possible to have a URI reference of
-        #   "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
-        #   document name is encoded in iso-8859-1 based on server settings, but
-        #   where the fragment identifier is encoded in UTF-8 according to
-        #   [XPointer]. The IRI corresponding to the above URI would be (in XML
-        #   notation)
-        #   "http://www.example.org/r%E9sum%E9.xml#résumé".
-        #   Similar considerations apply to query parts. The functionality of
-        #   IRIs (namely, to be able to include non-ASCII characters) can only be
-        #   used if the query part is encoded in UTF-8.
-        keyvals = parse_qsl_to_bytes(query, keep_blank_values)
-    keyvals.sort()
-    query = urlencode(keyvals)
-
-    # 2. decode percent-encoded sequences in path as UTF-8 (or keep raw bytes)
-    #    and percent-encode path again (this normalizes to upper-case %XX)
-    uqp = _unquotepath(path)
-    path = quote(uqp, _safe_chars) or '/'
-
-    fragment = '' if not keep_fragments else fragment
-
-    # every part should be safe already
-    return urlunparse((scheme, netloc.lower(), path, params, query, fragment))
-
-
-def _unquotepath(path):
-    for reserved in ('2f', '2F', '3f', '3F'):
-        path = path.replace('%' + reserved, '%25' + reserved.upper())
-
-    if six.PY2:
-        # in Python 2, '%a3' becomes '\xa3', which is what we want
-        return unquote(path)
-    else:
-        # in Python 3,
-        # the standard lib's unquote() does not work for non-UTF-8
-        # percent-escaped characters: they get lost.
-        # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
-        #
-        # unquote_to_bytes() returns raw bytes instead
-        return unquote_to_bytes(path)
-
-
 def parse_url(url, encoding=None):
     """Return urlparsed url from the given argument (which could be an already
     parsed url)
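The removed docstring lists the canonicalization steps; a few REPL-style examples of those steps, now served by the w3lib implementation (inputs and expected outputs taken from the tests removed below):

    from w3lib.url import canonicalize_url

    # sort query arguments, first by key, then by value
    canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50")
    # -> 'http://www.example.com/do?a=50&b=2&b=5&c=3'

    # normalize spaces to '+' and upper-case percent-encodings in query arguments
    canonicalize_url("http://www.example.com/do?q=a%20space&k=b%a3")
    # -> 'http://www.example.com/do?k=b%A3&q=a+space'

    # remove fragments unless keep_fragments=True
    canonicalize_url("http://www.example.com/do?a=1#frag")
    # -> 'http://www.example.com/do?a=1'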
@@ -164,59 +44,6 @@ def parse_url(url, encoding=None):
     return urlparse(to_unicode(url, encoding))


-if not six.PY2:
-    from urllib.parse import _coerce_args, unquote_to_bytes
-
-    def parse_qsl_to_bytes(qs, keep_blank_values=False, strict_parsing=False):
-        """Parse a query given as a string argument.
-
-        Data are returned as a list of name, value pairs as bytes.
-
-        Arguments:
-
-        qs: percent-encoded query string to be parsed
-
-        keep_blank_values: flag indicating whether blank values in
-            percent-encoded queries should be treated as blank strings. A
-            true value indicates that blanks should be retained as blank
-            strings. The default false value indicates that blank values
-            are to be ignored and treated as if they were not included.
-
-        strict_parsing: flag indicating what to do with parsing errors. If
-            false (the default), errors are silently ignored. If true,
-            errors raise a ValueError exception.
-
-        """
-        # This code is the same as Python 3's parse_qsl()
-        # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
-        # except for the unquote(s, encoding, errors) calls replaced
-        # with unquote_to_bytes(s)
-        qs, _coerce_result = _coerce_args(qs)
-        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
-        r = []
-        for name_value in pairs:
-            if not name_value and not strict_parsing:
-                continue
-            nv = name_value.split('=', 1)
-            if len(nv) != 2:
-                if strict_parsing:
-                    raise ValueError("bad query field: %r" % (name_value,))
-                # Handle case of a control-name with no equal sign
-                if keep_blank_values:
-                    nv.append('')
-                else:
-                    continue
-            if len(nv[1]) or keep_blank_values:
-                name = nv[0].replace('+', ' ')
-                name = unquote_to_bytes(name)
-                name = _coerce_result(name)
-                value = nv[1].replace('+', ' ')
-                value = unquote_to_bytes(value)
-                value = _coerce_result(value)
-                r.append((name, value))
-        return r
-
-
 def escape_ajax(url):
     """
     Return the crawlable url according to:
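parse_qsl_to_bytes() above exists because Python 3's parse_qsl() decodes percent-escapes with a fixed text encoding, replacing bytes that are invalid in it with U+FFFD, exactly as the comments in canonicalize_url() describe. A small standard-library illustration of the difference:

    from urllib.parse import parse_qsl, unquote_to_bytes

    # parse_qsl() decodes '%a3' as UTF-8 and loses the original byte:
    print(parse_qsl('q=b%a3'))       # [('q', 'b\ufffd')]

    # unquote_to_bytes() keeps the raw byte, so it can be percent-encoded
    # again without guessing which encoding produced it:
    print(unquote_to_bytes('b%a3'))  # b'b\xa3'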
setup.py
@@ -42,7 +42,7 @@ setup(
     ],
     install_requires=[
         'Twisted>=10.0.0',
-        'w3lib>=1.14.2',
+        'w3lib>=1.15.0',
         'queuelib',
         'lxml',
         'pyOpenSSL',
@@ -6,8 +6,7 @@ from six.moves.urllib.parse import urlparse

 from scrapy.spiders import Spider
 from scrapy.utils.url import (url_is_from_any_domain, url_is_from_spider,
-                              canonicalize_url, add_http_if_no_scheme,
-                              guess_scheme, parse_url)
+                              add_http_if_no_scheme, guess_scheme, parse_url)

 __doctests__ = ['scrapy.utils.url']

@@ -77,209 +76,6 @@ class UrlUtilsTest(unittest.TestCase):
         self.assertFalse(url_is_from_spider('http://www.example.us/some/page.html', MySpider))


-class CanonicalizeUrlTest(unittest.TestCase):
-
-    def test_canonicalize_url(self):
-        # simplest case
-        self.assertEqual(canonicalize_url("http://www.example.com/"),
-                         "http://www.example.com/")
-
-    def test_return_str(self):
-        assert isinstance(canonicalize_url(u"http://www.example.com"), str)
-        assert isinstance(canonicalize_url(b"http://www.example.com"), str)
-
-    def test_append_missing_path(self):
-        self.assertEqual(canonicalize_url("http://www.example.com"),
-                         "http://www.example.com/")
-
-    def test_typical_usage(self):
-        self.assertEqual(canonicalize_url("http://www.example.com/do?a=1&b=2&c=3"),
-                         "http://www.example.com/do?a=1&b=2&c=3")
-        self.assertEqual(canonicalize_url("http://www.example.com/do?c=1&b=2&a=3"),
-                         "http://www.example.com/do?a=3&b=2&c=1")
-        self.assertEqual(canonicalize_url("http://www.example.com/do?&a=1"),
-                         "http://www.example.com/do?a=1")
-
-    def test_sorting(self):
-        self.assertEqual(canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"),
-                         "http://www.example.com/do?a=50&b=2&b=5&c=3")
-
-    def test_keep_blank_values(self):
-        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2", keep_blank_values=False),
-                         "http://www.example.com/do?a=2")
-        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2"),
-                         "http://www.example.com/do?a=2&b=")
-        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&c&a=2", keep_blank_values=False),
-                         "http://www.example.com/do?a=2")
-        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&c&a=2"),
-                         "http://www.example.com/do?a=2&b=&c=")
-
-        self.assertEqual(canonicalize_url(u'http://www.example.com/do?1750,4'),
-                         'http://www.example.com/do?1750%2C4=')
-
-    def test_spaces(self):
-        self.assertEqual(canonicalize_url("http://www.example.com/do?q=a space&a=1"),
-                         "http://www.example.com/do?a=1&q=a+space")
-        self.assertEqual(canonicalize_url("http://www.example.com/do?q=a+space&a=1"),
-                         "http://www.example.com/do?a=1&q=a+space")
-        self.assertEqual(canonicalize_url("http://www.example.com/do?q=a%20space&a=1"),
-                         "http://www.example.com/do?a=1&q=a+space")
-
-    def test_canonicalize_url_unicode_path(self):
-        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé"),
-                         "http://www.example.com/r%C3%A9sum%C3%A9")
-
-    def test_canonicalize_url_unicode_query_string(self):
-        # default encoding for path and query is UTF-8
-        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?q=résumé"),
-                         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
-
-        # passed encoding will affect query string
-        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?q=résumé", encoding='latin1'),
-                         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%E9sum%E9")
-
-        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?country=Россия", encoding='cp1251'),
-                         "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%EE%F1%F1%E8%FF")
-
-    def test_canonicalize_url_unicode_query_string_wrong_encoding(self):
-        # trying to encode with the wrong encoding
-        # falls back to UTF-8
-        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?currency=€", encoding='latin1'),
-                         "http://www.example.com/r%C3%A9sum%C3%A9?currency=%E2%82%AC")
-
-        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?country=Россия", encoding='latin1'),
-                         "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F")
-
-    def test_normalize_percent_encoding_in_paths(self):
-        self.assertEqual(canonicalize_url("http://www.example.com/r%c3%a9sum%c3%a9"),
-                         "http://www.example.com/r%C3%A9sum%C3%A9")
-
-        # non-UTF-8 encoded sequences: they should be kept untouched, only upper-cased
-        # 'latin1'-encoded sequence in path
-        self.assertEqual(canonicalize_url("http://www.example.com/a%a3do"),
-                         "http://www.example.com/a%A3do")
-
-        # 'latin1'-encoded path, UTF-8 encoded query string
-        self.assertEqual(canonicalize_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9"),
-                         "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
-
-        # 'latin1'-encoded path and query string
-        self.assertEqual(canonicalize_url("http://www.example.com/a%a3do?q=r%e9sum%e9"),
-                         "http://www.example.com/a%A3do?q=r%E9sum%E9")
-
-    def test_normalize_percent_encoding_in_query_arguments(self):
-        self.assertEqual(canonicalize_url("http://www.example.com/do?k=b%a3"),
-                         "http://www.example.com/do?k=b%A3")
-
-        self.assertEqual(canonicalize_url("http://www.example.com/do?k=r%c3%a9sum%c3%a9"),
-                         "http://www.example.com/do?k=r%C3%A9sum%C3%A9")
-
-    def test_non_ascii_percent_encoding_in_paths(self):
-        self.assertEqual(canonicalize_url("http://www.example.com/a do?a=1"),
-                         "http://www.example.com/a%20do?a=1"),
-        self.assertEqual(canonicalize_url("http://www.example.com/a %20do?a=1"),
-                         "http://www.example.com/a%20%20do?a=1"),
-        self.assertEqual(canonicalize_url(u"http://www.example.com/a do£.html?a=1"),
-                         "http://www.example.com/a%20do%C2%A3.html?a=1")
-        self.assertEqual(canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"),
-                         "http://www.example.com/a%20do%C2%A3.html?a=1")
-
-    def test_non_ascii_percent_encoding_in_query_arguments(self):
-        self.assertEqual(canonicalize_url(u"http://www.example.com/do?price=£500&a=5&z=3"),
-                         u"http://www.example.com/do?a=5&price=%C2%A3500&z=3")
-        self.assertEqual(canonicalize_url(b"http://www.example.com/do?price=\xc2\xa3500&a=5&z=3"),
-                         "http://www.example.com/do?a=5&price=%C2%A3500&z=3")
-        self.assertEqual(canonicalize_url(b"http://www.example.com/do?price(\xc2\xa3)=500&a=1"),
-                         "http://www.example.com/do?a=1&price%28%C2%A3%29=500")
-
-    def test_urls_with_auth_and_ports(self):
-        self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com:81/do?now=1"),
-                         u"http://user:pass@www.example.com:81/do?now=1")
-
-    def test_remove_fragments(self):
-        self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag"),
-                         u"http://user:pass@www.example.com/do?a=1")
-        self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag", keep_fragments=True),
-                         u"http://user:pass@www.example.com/do?a=1#frag")
-
-    def test_dont_convert_safe_characters(self):
-        # don't convert safe characters to their percent-encoded representation
-        self.assertEqual(canonicalize_url(
-            "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"),
-            "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html")
-
-    def test_safe_characters_unicode(self):
-        # urllib.quote uses a mapping cache of encoded characters. When parsing
-        # an already percent-encoded url, it will fail if that url was not
-        # percent-encoded as UTF-8; that's why canonicalize_url must always
-        # convert the urls to string. The following test asserts that
-        # functionality.
-        self.assertEqual(canonicalize_url(u'http://www.example.com/caf%E9-con-leche.htm'),
-                         'http://www.example.com/caf%E9-con-leche.htm')
-
-    def test_domains_are_case_insensitive(self):
-        self.assertEqual(canonicalize_url("http://www.EXAMPLE.com/"),
-                         "http://www.example.com/")
-
-    def test_canonicalize_idns(self):
-        self.assertEqual(canonicalize_url(u'http://www.bücher.de?q=bücher'),
-                         'http://www.xn--bcher-kva.de/?q=b%C3%BCcher')
-        # Japanese (+ reordering query parameters)
-        self.assertEqual(canonicalize_url(u'http://はじめよう.みんな/?query=サ&maxResults=5'),
-                         'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?maxResults=5&query=%E3%82%B5')
-
-    def test_quoted_slash_and_question_sign(self):
-        self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"),
-                         "http://foo.com/AC%2FDC+rocks%3F/?yeah=1")
-        self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
-                         "http://foo.com/AC%2FDC/")
-
-    def test_canonicalize_urlparsed(self):
-        # canonicalize_url() can be passed an already urlparse'd URL
-        self.assertEqual(canonicalize_url(urlparse(u"http://www.example.com/résumé?q=résumé")),
-                         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
-        self.assertEqual(canonicalize_url(urlparse('http://www.example.com/caf%e9-con-leche.htm')),
-                         'http://www.example.com/caf%E9-con-leche.htm')
-        self.assertEqual(canonicalize_url(urlparse("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
-                         "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
-
-    def test_canonicalize_parse_url(self):
-        # parse_url() wraps urlparse and is used in link extractors
-        self.assertEqual(canonicalize_url(parse_url(u"http://www.example.com/résumé?q=résumé")),
-                         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
-        self.assertEqual(canonicalize_url(parse_url('http://www.example.com/caf%e9-con-leche.htm')),
-                         'http://www.example.com/caf%E9-con-leche.htm')
-        self.assertEqual(canonicalize_url(parse_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
-                         "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
-
-    def test_canonicalize_url_idempotence(self):
-        for url, enc in [(u'http://www.bücher.de/résumé?q=résumé', 'utf8'),
-                         (u'http://www.example.com/résumé?q=résumé', 'latin1'),
-                         (u'http://www.example.com/résumé?country=Россия', 'cp1251'),
-                         (u'http://はじめよう.みんな/?query=サ&maxResults=5', 'iso2022jp')]:
-            canonicalized = canonicalize_url(url, encoding=enc)
-
-            # if we canonicalize again, we get the same result
-            self.assertEqual(canonicalize_url(canonicalized, encoding=enc), canonicalized)
-
-            # without encoding, an already canonicalized URL is canonicalized identically
-            self.assertEqual(canonicalize_url(canonicalized), canonicalized)
-
-    def test_canonicalize_url_idna_exceptions(self):
-        # missing DNS label
-        self.assertEqual(
-            canonicalize_url(u"http://.example.com/résumé?q=résumé"),
-            "http://.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
-
-        # DNS label too long
-        self.assertEqual(
-            canonicalize_url(
-                u"http://www.{label}.com/résumé?q=résumé".format(
-                    label=u"example"*11)),
-            "http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format(
-                label=u"example"*11))
-
-
 class AddHttpIfNoScheme(unittest.TestCase):

     def test_add_scheme(self):
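The removed idempotence test captures a property worth remembering: canonicalizing an already-canonical URL is a no-op, with or without the original encoding hint. A compact sketch using one of that test's URLs:

    from w3lib.url import canonicalize_url

    url = canonicalize_url(u'http://www.bücher.de/résumé?q=résumé', encoding='utf8')
    assert canonicalize_url(url, encoding='utf8') == url
    assert canonicalize_url(url) == url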