mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 00:23:52 +00:00

Merge pull request #2168 from advarisk/w3lib-canonicalize-url

[MRG+1] Use w3lib.url.canonicalize_url() from w3lib 1.15.0
Authored by Mikhail Korobov on 2016-08-16 20:59:17 +06:00, committed by GitHub.
commit 241bd00e76
7 changed files with 10 additions and 386 deletions
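
In short, Scrapy drops its own copy of canonicalize_url() and re-uses the implementation that was upstreamed into w3lib 1.15.0. A minimal sketch of the import change (output shown assumes w3lib>=1.15.0 is installed; behaviour is unchanged, only the function's home moves):

    # Before this PR, Scrapy code imported its own copy:
    #   from scrapy.utils.url import canonicalize_url
    # After this PR, the canonical home is w3lib:
    from w3lib.url import canonicalize_url

    # Same normalization either way: sorted query arguments, lower-cased
    # host, and a '/' appended to an empty path.
    print(canonicalize_url("http://www.EXAMPLE.com?b=2&a=1"))
    # http://www.example.com/?a=1&b=2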

View File

@@ -103,7 +103,7 @@ LxmlLinkExtractor
    :type attrs: list

    :param canonicalize: canonicalize each extracted url (using
-       scrapy.utils.url.canonicalize_url). Defaults to ``True``.
+       w3lib.url.canonicalize_url). Defaults to ``True``.
    :type canonicalize: boolean

    :param unique: whether duplicate filtering should be applied to extracted
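
The docs change above only re-points the reference; the extractor behaves the same. A hedged usage sketch of the documented parameter (the allow pattern is invented for illustration):

    from scrapy.linkextractors import LxmlLinkExtractor

    # canonicalize=True (the default) canonicalizes every extracted URL,
    # after this change via w3lib.url.canonicalize_url()
    extractor = LxmlLinkExtractor(allow=r'/items/', canonicalize=True)
    # links = extractor.extract_links(response)  # response: a scrapy Response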

View File

@@ -2,7 +2,7 @@ Twisted>=10.0.0
 lxml
 pyOpenSSL
 cssselect>=0.9
-w3lib>=1.14.2
+w3lib>=1.15.0
 queuelib
 six>=1.5.2
 PyDispatcher>=2.0.5

View File

@@ -9,10 +9,11 @@ import re

 from six.moves.urllib.parse import urlparse
 from parsel.csstranslator import HTMLTranslator
+from w3lib.url import canonicalize_url

 from scrapy.utils.misc import arg_to_iter
 from scrapy.utils.url import (
-    canonicalize_url, url_is_from_any_domain, url_has_any_extension,
+    url_is_from_any_domain, url_has_any_extension,
 )
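
Note that only canonicalize_url moves to w3lib here; the domain and extension helpers stay in scrapy.utils.url. A quick sketch of those unchanged helpers (example URLs invented):

    from scrapy.utils.url import url_is_from_any_domain, url_has_any_extension

    print(url_is_from_any_domain("http://www.example.com/a.html", ["example.com"]))
    # True
    print(url_has_any_extension("http://example.com/report.pdf", [".pdf"]))
    # True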

View File

@@ -11,7 +11,7 @@ from six.moves.urllib.parse import urlunparse

 from w3lib.http import basic_auth_header
 from scrapy.utils.python import to_bytes, to_native_str
-from scrapy.utils.url import canonicalize_url
+from w3lib.url import canonicalize_url
 from scrapy.utils.httpobj import urlparse_cached
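
The importing module here (apparently Scrapy's request-fingerprinting utilities, judging by the surrounding imports) only cares that equivalent URLs normalize to one canonical string, and that property is unchanged by the swap. A small sketch (assuming w3lib>=1.15.0):

    from w3lib.url import canonicalize_url

    # Anything keyed on the canonical URL (fingerprints, dupe filters)
    # is unaffected by the move: equivalent URLs still collapse together.
    assert (canonicalize_url("http://www.example.com/do?a=1&b=2")
            == canonicalize_url("http://www.example.com/do?b=2&a=1"))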

View File

@@ -7,18 +7,13 @@ to the w3lib.url module. Always import those from there instead.
 """
 import posixpath
 import re
-import six
-from six.moves.urllib.parse import (ParseResult, urlunparse, urldefrag,
-                                    urlparse, parse_qsl, urlencode,
-                                    quote, unquote)
-if not six.PY2:
-    from urllib.parse import unquote_to_bytes
+from six.moves.urllib.parse import (ParseResult, urldefrag, urlparse)

 # scrapy.utils.url was moved to w3lib.url and import * ensures this
 # move doesn't break old code
 from w3lib.url import *
-from w3lib.url import _safe_chars
-from scrapy.utils.python import to_bytes, to_native_str, to_unicode
+from w3lib.url import _safe_chars, _unquotepath
+from scrapy.utils.python import to_unicode


 def url_is_from_any_domain(url, domains):
@@ -40,121 +35,6 @@ def url_has_any_extension(url, extensions):
     return posixpath.splitext(parse_url(url).path)[1].lower() in extensions

-def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
-    # IDNA encoding can fail for too long labels (>63 characters)
-    # or missing labels (e.g. http://.example.com)
-    try:
-        netloc = parts.netloc.encode('idna')
-    except UnicodeError:
-        netloc = parts.netloc
-
-    return (
-        to_native_str(parts.scheme),
-        to_native_str(netloc),
-        # default encoding for path component SHOULD be UTF-8
-        quote(to_bytes(parts.path, path_encoding), _safe_chars),
-        quote(to_bytes(parts.params, path_encoding), _safe_chars),
-        # encoding of query and fragment follows page encoding
-        # or form-charset (if known and passed)
-        quote(to_bytes(parts.query, encoding), _safe_chars),
-        quote(to_bytes(parts.fragment, encoding), _safe_chars)
-    )
-
-
-def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
-                     encoding=None):
-    """Canonicalize the given url by applying the following procedures:
-
-    - sort query arguments, first by key, then by value
-    - percent encode paths ; non-ASCII characters are percent-encoded
-      using UTF-8 (RFC-3986)
-    - percent encode query arguments ; non-ASCII characters are percent-encoded
-      using passed `encoding` (UTF-8 by default)
-    - normalize all spaces (in query arguments) '+' (plus symbol)
-    - normalize percent encodings case (%2f -> %2F)
-    - remove query arguments with blank values (unless `keep_blank_values` is True)
-    - remove fragments (unless `keep_fragments` is True)
-
-    The url passed can be bytes or unicode, while the url returned is
-    always a native str (bytes in Python 2, unicode in Python 3).
-
-    For examples see the tests in tests/test_utils_url.py
-    """
-    # If supplied `encoding` is not compatible with all characters in `url`,
-    # fallback to UTF-8 as safety net.
-    # UTF-8 can handle all Unicode characters,
-    # so we should be covered regarding URL normalization,
-    # if not for proper URL expected by remote website.
-    try:
-        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
-            parse_url(url), encoding=encoding)
-    except UnicodeEncodeError as e:
-        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
-            parse_url(url), encoding='utf8')
-
-    # 1. decode query-string as UTF-8 (or keep raw bytes),
-    #    sort values,
-    #    and percent-encode them back
-    if six.PY2:
-        keyvals = parse_qsl(query, keep_blank_values)
-    else:
-        # Python3's urllib.parse.parse_qsl does not work as wanted
-        # for percent-encoded characters that do not match passed encoding,
-        # they get lost.
-        #
-        # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
-        # (ie. with 'REPLACEMENT CHARACTER' (U+FFFD),
-        #      instead of \xa3 that you get with Python2's parse_qsl)
-        #
-        # what we want here is to keep raw bytes, and percent encode them
-        # so as to preserve whatever encoding was originally used.
-        #
-        # See https://tools.ietf.org/html/rfc3987#section-6.4:
-        #
-        #   For example, it is possible to have a URI reference of
-        #   "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
-        #   document name is encoded in iso-8859-1 based on server settings, but
-        #   where the fragment identifier is encoded in UTF-8 according to
-        #   [XPointer]. The IRI corresponding to the above URI would be (in XML
-        #   notation)
-        #   "http://www.example.org/r%E9sum%E9.xml#résumé".
-        #   Similar considerations apply to query parts. The functionality of
-        #   IRIs (namely, to be able to include non-ASCII characters) can only be
-        #   used if the query part is encoded in UTF-8.
-        keyvals = parse_qsl_to_bytes(query, keep_blank_values)
-    keyvals.sort()
-    query = urlencode(keyvals)
-
-    # 2. decode percent-encoded sequences in path as UTF-8 (or keep raw bytes)
-    #    and percent-encode path again (this normalizes to upper-case %XX)
-    uqp = _unquotepath(path)
-    path = quote(uqp, _safe_chars) or '/'
-
-    fragment = '' if not keep_fragments else fragment
-
-    # every part should be safe already
-    return urlunparse((scheme, netloc.lower(), path, params, query, fragment))
-
-
-def _unquotepath(path):
-    for reserved in ('2f', '2F', '3f', '3F'):
-        path = path.replace('%' + reserved, '%25' + reserved.upper())
-
-    if six.PY2:
-        # in Python 2, '%a3' becomes '\xa3', which is what we want
-        return unquote(path)
-    else:
-        # in Python 3,
-        # standard lib's unquote() does not work for non-UTF-8
-        # percent-escaped characters, they get lost.
-        # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
-        #
-        # unquote_to_bytes() returns raw bytes instead
-        return unquote_to_bytes(path)

 def parse_url(url, encoding=None):
     """Return urlparsed url from the given argument (which could be an already
     parsed url)
@@ -164,59 +44,6 @@ def parse_url(url, encoding=None):
     return urlparse(to_unicode(url, encoding))

-if not six.PY2:
-    from urllib.parse import _coerce_args, unquote_to_bytes
-
-    def parse_qsl_to_bytes(qs, keep_blank_values=False, strict_parsing=False):
-        """Parse a query given as a string argument.
-
-        Data are returned as a list of name, value pairs as bytes.
-
-        Arguments:
-
-        qs: percent-encoded query string to be parsed
-
-        keep_blank_values: flag indicating whether blank values in
-            percent-encoded queries should be treated as blank strings. A
-            true value indicates that blanks should be retained as blank
-            strings. The default false value indicates that blank values
-            are to be ignored and treated as if they were not included.
-
-        strict_parsing: flag indicating what to do with parsing errors. If
-            false (the default), errors are silently ignored. If true,
-            errors raise a ValueError exception.
-        """
-        # This code is the same as Python3's parse_qsl()
-        # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
-        # except for the unquote(s, encoding, errors) calls replaced
-        # with unquote_to_bytes(s)
-        qs, _coerce_result = _coerce_args(qs)
-        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
-        r = []
-        for name_value in pairs:
-            if not name_value and not strict_parsing:
-                continue
-            nv = name_value.split('=', 1)
-            if len(nv) != 2:
-                if strict_parsing:
-                    raise ValueError("bad query field: %r" % (name_value,))
-                # Handle case of a control-name with no equal sign
-                if keep_blank_values:
-                    nv.append('')
-                else:
-                    continue
-            if len(nv[1]) or keep_blank_values:
-                name = nv[0].replace('+', ' ')
-                name = unquote_to_bytes(name)
-                name = _coerce_result(name)
-                value = nv[1].replace('+', ' ')
-                value = unquote_to_bytes(value)
-                value = _coerce_result(value)
-                r.append((name, value))
-        return r
-
-
 def escape_ajax(url):
     """
     Return the crawleable url according to:
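
The longest comment in the removed code above explains the real subtlety: on Python 3, urllib's parse_qsl() decodes percent-escapes as UTF-8 and silently replaces undecodable bytes with U+FFFD, destroying non-UTF-8 query bytes before they can be re-encoded. A standalone sketch of that pitfall, using only the Python 3 standard library:

    from urllib.parse import parse_qsl, unquote_to_bytes

    # parse_qsl() decodes '%a3' with errors='replace' and loses the byte:
    print(parse_qsl('q=b%a3'))       # [('q', 'b\ufffd')]

    # unquote_to_bytes() keeps the raw byte, which can be re-quoted as '%A3':
    print(unquote_to_bytes('b%a3'))  # b'b\xa3'

This byte-preserving approach is exactly what the removed parse_qsl_to_bytes() provided, and it is what w3lib's canonicalize_url() needs to keep the behaviour tested below.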

View File

@@ -42,7 +42,7 @@ setup(
     ],
     install_requires=[
         'Twisted>=10.0.0',
-        'w3lib>=1.14.2',
+        'w3lib>=1.15.0',
         'queuelib',
         'lxml',
         'pyOpenSSL',

View File

@@ -6,8 +6,7 @@ from six.moves.urllib.parse import urlparse

 from scrapy.spiders import Spider
 from scrapy.utils.url import (url_is_from_any_domain, url_is_from_spider,
-                              canonicalize_url, add_http_if_no_scheme,
-                              guess_scheme, parse_url)
+                              add_http_if_no_scheme, guess_scheme, parse_url)

 __doctests__ = ['scrapy.utils.url']
@@ -77,209 +76,6 @@ class UrlUtilsTest(unittest.TestCase):
         self.assertFalse(url_is_from_spider('http://www.example.us/some/page.html', MySpider))

-class CanonicalizeUrlTest(unittest.TestCase):
-
-    def test_canonicalize_url(self):
-        # simplest case
-        self.assertEqual(canonicalize_url("http://www.example.com/"),
-                         "http://www.example.com/")
-
-    def test_return_str(self):
-        assert isinstance(canonicalize_url(u"http://www.example.com"), str)
-        assert isinstance(canonicalize_url(b"http://www.example.com"), str)
-
-    def test_append_missing_path(self):
-        self.assertEqual(canonicalize_url("http://www.example.com"),
-                         "http://www.example.com/")
-
-    def test_typical_usage(self):
-        self.assertEqual(canonicalize_url("http://www.example.com/do?a=1&b=2&c=3"),
-                         "http://www.example.com/do?a=1&b=2&c=3")
-        self.assertEqual(canonicalize_url("http://www.example.com/do?c=1&b=2&a=3"),
-                         "http://www.example.com/do?a=3&b=2&c=1")
-        self.assertEqual(canonicalize_url("http://www.example.com/do?&a=1"),
-                         "http://www.example.com/do?a=1")
-
-    def test_sorting(self):
-        self.assertEqual(canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"),
-                         "http://www.example.com/do?a=50&b=2&b=5&c=3")
-
-    def test_keep_blank_values(self):
-        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2", keep_blank_values=False),
-                         "http://www.example.com/do?a=2")
-        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2"),
-                         "http://www.example.com/do?a=2&b=")
-        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&c&a=2", keep_blank_values=False),
-                         "http://www.example.com/do?a=2")
-        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&c&a=2"),
-                         "http://www.example.com/do?a=2&b=&c=")
-        self.assertEqual(canonicalize_url(u'http://www.example.com/do?1750,4'),
-                         'http://www.example.com/do?1750%2C4=')
-
-    def test_spaces(self):
-        self.assertEqual(canonicalize_url("http://www.example.com/do?q=a space&a=1"),
-                         "http://www.example.com/do?a=1&q=a+space")
-        self.assertEqual(canonicalize_url("http://www.example.com/do?q=a+space&a=1"),
-                         "http://www.example.com/do?a=1&q=a+space")
-        self.assertEqual(canonicalize_url("http://www.example.com/do?q=a%20space&a=1"),
-                         "http://www.example.com/do?a=1&q=a+space")
-
-    def test_canonicalize_url_unicode_path(self):
-        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé"),
-                         "http://www.example.com/r%C3%A9sum%C3%A9")
-
-    def test_canonicalize_url_unicode_query_string(self):
-        # default encoding for path and query is UTF-8
-        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?q=résumé"),
-                         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
-
-        # passed encoding will affect query string
-        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?q=résumé", encoding='latin1'),
-                         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%E9sum%E9")
-        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?country=Россия", encoding='cp1251'),
-                         "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%EE%F1%F1%E8%FF")
-
-    def test_canonicalize_url_unicode_query_string_wrong_encoding(self):
-        # trying to encode with wrong encoding
-        # fallback to UTF-8
-        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?currency=€", encoding='latin1'),
-                         "http://www.example.com/r%C3%A9sum%C3%A9?currency=%E2%82%AC")
-        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?country=Россия", encoding='latin1'),
-                         "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F")
-
-    def test_normalize_percent_encoding_in_paths(self):
-        self.assertEqual(canonicalize_url("http://www.example.com/r%c3%a9sum%c3%a9"),
-                         "http://www.example.com/r%C3%A9sum%C3%A9")
-
-        # non-UTF8 encoded sequences: they should be kept untouched, only upper-cased
-        # 'latin1'-encoded sequence in path
-        self.assertEqual(canonicalize_url("http://www.example.com/a%a3do"),
-                         "http://www.example.com/a%A3do")
-        # 'latin1'-encoded path, UTF-8 encoded query string
-        self.assertEqual(canonicalize_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9"),
-                         "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
-        # 'latin1'-encoded path and query string
-        self.assertEqual(canonicalize_url("http://www.example.com/a%a3do?q=r%e9sum%e9"),
-                         "http://www.example.com/a%A3do?q=r%E9sum%E9")
-
-    def test_normalize_percent_encoding_in_query_arguments(self):
-        self.assertEqual(canonicalize_url("http://www.example.com/do?k=b%a3"),
-                         "http://www.example.com/do?k=b%A3")
-        self.assertEqual(canonicalize_url("http://www.example.com/do?k=r%c3%a9sum%c3%a9"),
-                         "http://www.example.com/do?k=r%C3%A9sum%C3%A9")
-
-    def test_non_ascii_percent_encoding_in_paths(self):
-        self.assertEqual(canonicalize_url("http://www.example.com/a do?a=1"),
-                         "http://www.example.com/a%20do?a=1"),
-        self.assertEqual(canonicalize_url("http://www.example.com/a %20do?a=1"),
-                         "http://www.example.com/a%20%20do?a=1"),
-        self.assertEqual(canonicalize_url(u"http://www.example.com/a do£.html?a=1"),
-                         "http://www.example.com/a%20do%C2%A3.html?a=1")
-        self.assertEqual(canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"),
-                         "http://www.example.com/a%20do%C2%A3.html?a=1")
-
-    def test_non_ascii_percent_encoding_in_query_arguments(self):
-        self.assertEqual(canonicalize_url(u"http://www.example.com/do?price=£500&a=5&z=3"),
-                         u"http://www.example.com/do?a=5&price=%C2%A3500&z=3")
-        self.assertEqual(canonicalize_url(b"http://www.example.com/do?price=\xc2\xa3500&a=5&z=3"),
-                         "http://www.example.com/do?a=5&price=%C2%A3500&z=3")
-        self.assertEqual(canonicalize_url(b"http://www.example.com/do?price(\xc2\xa3)=500&a=1"),
-                         "http://www.example.com/do?a=1&price%28%C2%A3%29=500")
-
-    def test_urls_with_auth_and_ports(self):
-        self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com:81/do?now=1"),
-                         u"http://user:pass@www.example.com:81/do?now=1")
-
-    def test_remove_fragments(self):
-        self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag"),
-                         u"http://user:pass@www.example.com/do?a=1")
-        self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag", keep_fragments=True),
-                         u"http://user:pass@www.example.com/do?a=1#frag")
-
-    def test_dont_convert_safe_characters(self):
-        # dont convert safe characters to percent encoding representation
-        self.assertEqual(canonicalize_url(
-            "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"),
-            "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html")
-
-    def test_safe_characters_unicode(self):
-        # urllib.quote uses a mapping cache of encoded characters. when parsing
-        # an already percent-encoded url, it will fail if that url was not
-        # percent-encoded as utf-8, that's why canonicalize_url must always
-        # convert the urls to string. the following test asserts that
-        # functionality.
-        self.assertEqual(canonicalize_url(u'http://www.example.com/caf%E9-con-leche.htm'),
-                         'http://www.example.com/caf%E9-con-leche.htm')
-
-    def test_domains_are_case_insensitive(self):
-        self.assertEqual(canonicalize_url("http://www.EXAMPLE.com/"),
-                         "http://www.example.com/")
-
-    def test_canonicalize_idns(self):
-        self.assertEqual(canonicalize_url(u'http://www.bücher.de?q=bücher'),
-                         'http://www.xn--bcher-kva.de/?q=b%C3%BCcher')
-        # Japanese (+ reordering query parameters)
-        self.assertEqual(canonicalize_url(u'http://はじめよう.みんな/?query=サ&maxResults=5'),
-                         'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?maxResults=5&query=%E3%82%B5')
-
-    def test_quoted_slash_and_question_sign(self):
-        self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"),
-                         "http://foo.com/AC%2FDC+rocks%3F/?yeah=1")
-        self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
-                         "http://foo.com/AC%2FDC/")
-
-    def test_canonicalize_urlparsed(self):
-        # canonicalize_url() can be passed an already urlparse'd URL
-        self.assertEqual(canonicalize_url(urlparse(u"http://www.example.com/résumé?q=résumé")),
-                         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
-        self.assertEqual(canonicalize_url(urlparse('http://www.example.com/caf%e9-con-leche.htm')),
-                         'http://www.example.com/caf%E9-con-leche.htm')
-        self.assertEqual(canonicalize_url(urlparse("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
-                         "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
-
-    def test_canonicalize_parse_url(self):
-        # parse_url() wraps urlparse and is used in link extractors
-        self.assertEqual(canonicalize_url(parse_url(u"http://www.example.com/résumé?q=résumé")),
-                         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
-        self.assertEqual(canonicalize_url(parse_url('http://www.example.com/caf%e9-con-leche.htm')),
-                         'http://www.example.com/caf%E9-con-leche.htm')
-        self.assertEqual(canonicalize_url(parse_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
-                         "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
-
-    def test_canonicalize_url_idempotence(self):
-        for url, enc in [(u'http://www.bücher.de/résumé?q=résumé', 'utf8'),
-                         (u'http://www.example.com/résumé?q=résumé', 'latin1'),
-                         (u'http://www.example.com/résumé?country=Россия', 'cp1251'),
-                         (u'http://はじめよう.みんな/?query=サ&maxResults=5', 'iso2022jp')]:
-            canonicalized = canonicalize_url(url, encoding=enc)
-
-            # if we canonicalize again, we get the same result
-            self.assertEqual(canonicalize_url(canonicalized, encoding=enc), canonicalized)
-
-            # without encoding, already canonicalized URL is canonicalized identically
-            self.assertEqual(canonicalize_url(canonicalized), canonicalized)
-
-    def test_canonicalize_url_idna_exceptions(self):
-        # missing DNS label
-        self.assertEqual(
-            canonicalize_url(u"http://.example.com/résumé?q=résumé"),
-            "http://.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
-
-        # DNS label too long
-        self.assertEqual(
-            canonicalize_url(
-                u"http://www.{label}.com/résumé?q=résumé".format(
-                    label=u"example"*11)),
-            "http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format(
-                label=u"example"*11))
-
-
 class AddHttpIfNoScheme(unittest.TestCase):

     def test_add_scheme(self):
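
All of the removed CanonicalizeUrlTest cases describe behaviour the PR assumes w3lib 1.15.0 now guarantees, which is presumably why they are dropped here rather than rewritten. A few of the removed assertions, replayed against w3lib as a sanity sketch:

    from w3lib.url import canonicalize_url

    assert canonicalize_url("http://www.example.com") == "http://www.example.com/"
    assert (canonicalize_url("http://www.example.com/do?c=1&b=2&a=3")
            == "http://www.example.com/do?a=3&b=2&c=1")
    assert (canonicalize_url(u"http://www.example.com/résumé")
            == "http://www.example.com/r%C3%A9sum%C3%A9")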