1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-26 05:23:47 +00:00
scrapy/tests/test_utils_url.py
2016-01-28 13:51:50 +01:00

358 lines
19 KiB
Python

# -*- coding: utf-8 -*-
import unittest
import six
from scrapy.spiders import Spider
from scrapy.utils.url import (url_is_from_any_domain, url_is_from_spider,
canonicalize_url, add_http_if_no_scheme,
guess_scheme)
# Dotted names of modules whose doctests accompany this suite.
# NOTE(review): presumably read by the project's test runner to collect
# doctests from the listed modules — confirm against the test configuration.
__doctests__ = ['scrapy.utils.url']
class UrlUtilsTest(unittest.TestCase):
    """Checks for ``url_is_from_any_domain`` and ``url_is_from_spider``."""

    def test_url_is_from_any_domain(self):
        """Domains match exactly or as a parent domain, ignoring case."""
        # With and without the ``www.`` prefix: the registered domain
        # matches, but a mere suffix of it ('art.co.uk') must not.
        for page in ('http://www.wheele-bin-art.co.uk/get/product/123',
                     'http://wheele-bin-art.co.uk/get/product/123'):
            self.assertTrue(url_is_from_any_domain(page, ['wheele-bin-art.co.uk']))
            self.assertFalse(url_is_from_any_domain(page, ['art.co.uk']))

        # Case differences in either the URL or the domain are ignored.
        mixed_case_page = 'http://www.Wheele-Bin-Art.co.uk/get/product/123'
        for domain in ('wheele-bin-art.CO.UK', 'WHEELE-BIN-ART.CO.UK'):
            self.assertTrue(url_is_from_any_domain(mixed_case_page, [domain]))

        # The port is part of what gets compared.
        ip_page = 'http://192.169.0.15:8080/mypage.html'
        self.assertTrue(url_is_from_any_domain(ip_page, ['192.169.0.15:8080']))
        self.assertFalse(url_is_from_any_domain(ip_page, ['192.169.0.15']))

        # A ``javascript:`` pseudo-URL never belongs to a domain, even when
        # its text happens to end with the domain name.
        js_link = 'javascript:%20document.orderform_2581_1190810811.mode.value=%27add%27;%20javascript:%20document.orderform_2581_1190810811.submit%28%29'
        self.assertFalse(url_is_from_any_domain(js_link, ['testdomain.com']))
        self.assertFalse(url_is_from_any_domain(js_link + '.testdomain.com', ['testdomain.com']))

    def test_url_is_from_spider(self):
        """A spider instance's name is treated as its domain."""
        spider = Spider(name='example.com')
        for accepted in ('http://www.example.com/some/page.html',
                         'http://sub.example.com/some/page.html'):
            self.assertTrue(url_is_from_spider(accepted, spider))
        for rejected in ('http://www.example.org/some/page.html',
                         'http://www.example.net/some/page.html'):
            self.assertFalse(url_is_from_spider(rejected, spider))

    def test_url_is_from_spider_class_attributes(self):
        """Same as above, but passing the spider class instead of an instance."""
        class MySpider(Spider):
            name = 'example.com'

        for accepted in ('http://www.example.com/some/page.html',
                         'http://sub.example.com/some/page.html'):
            self.assertTrue(url_is_from_spider(accepted, MySpider))
        for rejected in ('http://www.example.org/some/page.html',
                         'http://www.example.net/some/page.html'):
            self.assertFalse(url_is_from_spider(rejected, MySpider))

    def test_url_is_from_spider_with_allowed_domains(self):
        """``allowed_domains`` extends the name; list, set and tuple all work."""
        spider = Spider(name='example.com', allowed_domains=['example.org', 'example.net'])
        for accepted in ('http://www.example.com/some/page.html',
                         'http://sub.example.com/some/page.html',
                         'http://example.com/some/page.html',
                         'http://www.example.org/some/page.html',
                         'http://www.example.net/some/page.html'):
            self.assertTrue(url_is_from_spider(accepted, spider))
        self.assertFalse(url_is_from_spider('http://www.example.us/some/page.html', spider))

        # allowed_domains given as a set
        spider = Spider(name='example.com', allowed_domains={'example.com', 'example.net'})
        self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))

        # allowed_domains given as a tuple
        spider = Spider(name='example.com', allowed_domains=('example.com', 'example.net'))
        self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))

    def test_url_is_from_spider_with_allowed_domains_class_attributes(self):
        """``allowed_domains`` declared on the spider class is honoured too."""
        class MySpider(Spider):
            name = 'example.com'
            allowed_domains = ('example.org', 'example.net')

        for accepted in ('http://www.example.com/some/page.html',
                         'http://sub.example.com/some/page.html',
                         'http://example.com/some/page.html',
                         'http://www.example.org/some/page.html',
                         'http://www.example.net/some/page.html'):
            self.assertTrue(url_is_from_spider(accepted, MySpider))
        self.assertFalse(url_is_from_spider('http://www.example.us/some/page.html', MySpider))
class CanonicalizeUrlTest(unittest.TestCase):
    """Expected outputs of ``canonicalize_url`` for a range of inputs."""

    def test_canonicalize_url(self):
        # simplest case
        self.assertEqual(canonicalize_url("http://www.example.com/"),
                         "http://www.example.com/")

    def test_return_str(self):
        # Text and bytes input must both come back as the native ``str``.
        # assertIsInstance (rather than a bare ``assert``) keeps the check
        # active even when Python runs with -O.
        self.assertIsInstance(canonicalize_url(u"http://www.example.com"), str)
        self.assertIsInstance(canonicalize_url(b"http://www.example.com"), str)

    def test_append_missing_path(self):
        self.assertEqual(canonicalize_url("http://www.example.com"),
                         "http://www.example.com/")

    def test_typical_usage(self):
        self.assertEqual(canonicalize_url("http://www.example.com/do?a=1&b=2&c=3"),
                         "http://www.example.com/do?a=1&b=2&c=3")
        self.assertEqual(canonicalize_url("http://www.example.com/do?c=1&b=2&a=3"),
                         "http://www.example.com/do?a=3&b=2&c=1")
        self.assertEqual(canonicalize_url("http://www.example.com/do?&a=1"),
                         "http://www.example.com/do?a=1")

    def test_sorting(self):
        # Arguments are ordered by key, then by value for repeated keys.
        self.assertEqual(canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"),
                         "http://www.example.com/do?a=50&b=2&b=5&c=3")

    def test_keep_blank_values(self):
        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2", keep_blank_values=False),
                         "http://www.example.com/do?a=2")
        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2"),
                         "http://www.example.com/do?a=2&b=")
        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&c&a=2", keep_blank_values=False),
                         "http://www.example.com/do?a=2")
        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&c&a=2"),
                         "http://www.example.com/do?a=2&b=&c=")

        self.assertEqual(canonicalize_url(u'http://www.example.com/do?1750,4'),
                         'http://www.example.com/do?1750%2C4=')

    def test_spaces(self):
        # A literal space, '+' and '%20' in a query value all become '+'.
        self.assertEqual(canonicalize_url("http://www.example.com/do?q=a space&a=1"),
                         "http://www.example.com/do?a=1&q=a+space")
        self.assertEqual(canonicalize_url("http://www.example.com/do?q=a+space&a=1"),
                         "http://www.example.com/do?a=1&q=a+space")
        self.assertEqual(canonicalize_url("http://www.example.com/do?q=a%20space&a=1"),
                         "http://www.example.com/do?a=1&q=a+space")

    @unittest.skipUnless(six.PY2, "TODO")
    def test_normalize_percent_encoding_in_paths(self):
        self.assertEqual(canonicalize_url("http://www.example.com/a%a3do"),
                         "http://www.example.com/a%A3do")

    @unittest.skipUnless(six.PY2, "TODO")
    def test_normalize_percent_encoding_in_query_arguments(self):
        self.assertEqual(canonicalize_url("http://www.example.com/do?k=b%a3"),
                         "http://www.example.com/do?k=b%A3")

    def test_non_ascii_percent_encoding_in_paths(self):
        self.assertEqual(canonicalize_url("http://www.example.com/a do?a=1"),
                         "http://www.example.com/a%20do?a=1")
        self.assertEqual(canonicalize_url("http://www.example.com/a %20do?a=1"),
                         "http://www.example.com/a%20%20do?a=1")
        self.assertEqual(canonicalize_url(u"http://www.example.com/a do£.html?a=1"),
                         "http://www.example.com/a%20do%C2%A3.html?a=1")
        self.assertEqual(canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"),
                         "http://www.example.com/a%20do%C2%A3.html?a=1")

    def test_non_ascii_percent_encoding_in_query_arguments(self):
        self.assertEqual(canonicalize_url(u"http://www.example.com/do?price=\xa3500&a=5&z=3"),
                         u"http://www.example.com/do?a=5&price=%C2%A3500&z=3")
        self.assertEqual(canonicalize_url(b"http://www.example.com/do?price=\xc2\xa3500&a=5&z=3"),
                         "http://www.example.com/do?a=5&price=%C2%A3500&z=3")
        self.assertEqual(canonicalize_url(b"http://www.example.com/do?price(\xc2\xa3)=500&a=1"),
                         "http://www.example.com/do?a=1&price%28%C2%A3%29=500")

    def test_urls_with_auth_and_ports(self):
        self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com:81/do?now=1"),
                         u"http://user:pass@www.example.com:81/do?now=1")

    def test_remove_fragments(self):
        # Fragments are dropped unless keep_fragments=True is passed.
        self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag"),
                         u"http://user:pass@www.example.com/do?a=1")
        self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag", keep_fragments=True),
                         u"http://user:pass@www.example.com/do?a=1#frag")

    def test_dont_convert_safe_characters(self):
        # dont convert safe characters to percent encoding representation
        self.assertEqual(canonicalize_url(
            "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"),
            "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html")

    @unittest.skipUnless(six.PY2, "TODO")
    def test_safe_characters_unicode(self):
        # urllib.quote uses a mapping cache of encoded characters. when parsing
        # an already percent-encoded url, it will fail if that url was not
        # percent-encoded as utf-8, that's why canonicalize_url must always
        # convert the urls to string. the following test asserts that
        # functionality.
        self.assertEqual(canonicalize_url(u'http://www.example.com/caf%E9-con-leche.htm'),
                         'http://www.example.com/caf%E9-con-leche.htm')

    def test_domains_are_case_insensitive(self):
        self.assertEqual(canonicalize_url("http://www.EXAMPLE.com/"),
                         "http://www.example.com/")

    def test_quoted_slash_and_question_sign(self):
        # Percent-encoded '/' and '?' in the path stay encoded (hex digits
        # upper-cased) instead of being decoded into path separators.
        self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"),
                         "http://foo.com/AC%2FDC+rocks%3F/?yeah=1")
        self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
                         "http://foo.com/AC%2FDC/")
class AddHttpIfNoScheme(unittest.TestCase):
    """Expected outputs of ``add_http_if_no_scheme``.

    Three input families are covered: scheme-less URLs (``http://`` is
    prepended), URLs that already carry a scheme (returned unchanged) and
    protocol-relative ``//host`` URLs (``http:`` is prepended).
    """

    def _check(self, url, expected):
        # Shared assertion: the function output for ``url`` equals ``expected``.
        self.assertEqual(add_http_if_no_scheme(url), expected)

    # --- no scheme given -------------------------------------------------

    def test_add_scheme(self):
        self._check('www.example.com', 'http://www.example.com')

    def test_without_subdomain(self):
        self._check('example.com', 'http://example.com')

    def test_path(self):
        self._check('www.example.com/some/page.html',
                    'http://www.example.com/some/page.html')

    def test_port(self):
        self._check('www.example.com:80', 'http://www.example.com:80')

    def test_fragment(self):
        self._check('www.example.com/some/page#frag',
                    'http://www.example.com/some/page#frag')

    def test_query(self):
        self._check('www.example.com/do?a=1&b=2&c=3',
                    'http://www.example.com/do?a=1&b=2&c=3')

    def test_username_password(self):
        self._check('username:password@www.example.com',
                    'http://username:password@www.example.com')

    def test_complete_url(self):
        self._check('username:password@www.example.com:80/some/page/do?a=1&b=2&c=3#frag',
                    'http://username:password@www.example.com:80/some/page/do?a=1&b=2&c=3#frag')

    # --- http:// already present -----------------------------------------

    def test_preserve_http(self):
        self._check('http://www.example.com', 'http://www.example.com')

    def test_preserve_http_without_subdomain(self):
        self._check('http://example.com', 'http://example.com')

    def test_preserve_http_path(self):
        self._check('http://www.example.com/some/page.html',
                    'http://www.example.com/some/page.html')

    def test_preserve_http_port(self):
        self._check('http://www.example.com:80', 'http://www.example.com:80')

    def test_preserve_http_fragment(self):
        self._check('http://www.example.com/some/page#frag',
                    'http://www.example.com/some/page#frag')

    def test_preserve_http_query(self):
        self._check('http://www.example.com/do?a=1&b=2&c=3',
                    'http://www.example.com/do?a=1&b=2&c=3')

    def test_preserve_http_username_password(self):
        self._check('http://username:password@www.example.com',
                    'http://username:password@www.example.com')

    def test_preserve_http_complete_url(self):
        self._check('http://username:password@www.example.com:80/some/page/do?a=1&b=2&c=3#frag',
                    'http://username:password@www.example.com:80/some/page/do?a=1&b=2&c=3#frag')

    # --- protocol-relative (//host) --------------------------------------

    def test_protocol_relative(self):
        self._check('//www.example.com', 'http://www.example.com')

    def test_protocol_relative_without_subdomain(self):
        self._check('//example.com', 'http://example.com')

    def test_protocol_relative_path(self):
        self._check('//www.example.com/some/page.html',
                    'http://www.example.com/some/page.html')

    def test_protocol_relative_port(self):
        self._check('//www.example.com:80', 'http://www.example.com:80')

    def test_protocol_relative_fragment(self):
        self._check('//www.example.com/some/page#frag',
                    'http://www.example.com/some/page#frag')

    def test_protocol_relative_query(self):
        self._check('//www.example.com/do?a=1&b=2&c=3',
                    'http://www.example.com/do?a=1&b=2&c=3')

    def test_protocol_relative_username_password(self):
        self._check('//username:password@www.example.com',
                    'http://username:password@www.example.com')

    def test_protocol_relative_complete_url(self):
        self._check('//username:password@www.example.com:80/some/page/do?a=1&b=2&c=3#frag',
                    'http://username:password@www.example.com:80/some/page/do?a=1&b=2&c=3#frag')

    # --- other schemes are left alone ------------------------------------

    def test_preserve_https(self):
        self._check('https://www.example.com', 'https://www.example.com')

    def test_preserve_ftp(self):
        self._check('ftp://www.example.com', 'ftp://www.example.com')
class GuessSchemeTest(unittest.TestCase):
    """Container for ``guess_scheme`` tests.

    The class body is intentionally empty: its ``test_uri_*`` methods are
    generated and attached with ``setattr`` by the loops later in this module.
    """
def create_guess_scheme_t(args):
    """Build a test method checking the scheme ``guess_scheme`` picks.

    ``args`` is indexed lazily when the generated test runs: ``args[0]`` is
    the input handed to ``guess_scheme`` and ``args[1]`` is the prefix the
    returned URL must start with.

    Returns the unbound test function; the caller is expected to give it a
    unique ``__name__`` and attach it to a ``TestCase`` class.
    """
    def do_expected(self):
        candidate = args[0]
        url = guess_scheme(candidate)
        wanted_prefix = args[1]
        failure = 'Wrong scheme guessed: for `%s` got `%s`, expected `%s...`' % (
            candidate, url, wanted_prefix)
        assert url.startswith(wanted_prefix), failure
    return do_expected
def create_skipped_scheme_t(args):
    """Build a placeholder test method that is always reported as skipped.

    ``args`` is a ``(input, expected_prefix, reason)`` triple. Only the
    reason (``args[2]``) is used: the generated test raises
    ``unittest.SkipTest`` with it and never calls ``guess_scheme``. The first
    two items record what the case should check once it is supported, at
    which point it can be generated with ``create_guess_scheme_t`` instead.

    Returns the unbound test function; the caller is expected to give it a
    unique ``__name__`` and attach it to a ``TestCase`` class.
    """
    def do_expected(self):
        # Unconditional skip. The statements that used to follow this raise
        # could never execute, so they were removed.
        raise unittest.SkipTest(args[2])
    return do_expected
# Generate one GuessSchemeTest method per (input, expected scheme prefix)
# pair; methods are numbered from 1 so each gets a unique, ordered name.
for k, args in enumerate((
    # inputs that look like filesystem paths
    ('/index', 'file://'),
    ('/index.html', 'file://'),
    ('./index.html', 'file://'),
    ('../index.html', 'file://'),
    ('../../index.html', 'file://'),
    ('./data/index.html', 'file://'),
    ('.hidden/data/index.html', 'file://'),
    ('/home/user/www/index.html', 'file://'),
    ('//home/user/www/index.html', 'file://'),
    ('file:///home/user/www/index.html', 'file://'),

    # inputs that look like hostnames / web URLs
    ('index.html', 'http://'),
    ('example.com', 'http://'),
    ('www.example.com', 'http://'),
    ('www.example.com/index.html', 'http://'),
    ('http://example.com', 'http://'),
    ('http://example.com/index.html', 'http://'),
    ('localhost', 'http://'),
    ('localhost/index.html', 'http://'),

    # some corner cases (default to http://)
    ('/', 'http://'),
    ('.../test', 'http://'),
), start=1):
    t_method = create_guess_scheme_t(args)
    t_method.__name__ = 'test_uri_{0:03d}'.format(k)
    setattr(GuessSchemeTest, t_method.__name__, t_method)
# TODO: the following tests do not pass with current implementation
# Each entry is (input, expected scheme prefix, skip reason); a skipped
# placeholder test is attached to GuessSchemeTest for every entry.
for k, args in enumerate([
    # Raw string literal: in a plain literal the sequences \a, \t and \f are
    # escape codes (BEL, TAB, FF) and \p is an invalid escape, so the value
    # would not be the Windows path it is meant to be.
    (r'C:\absolute\path\to\a\file.html', 'file://',
     'Windows filepath are not supported for scrapy shell'),
], start=1):
    t_method = create_skipped_scheme_t(args)
    t_method.__name__ = 'test_uri_skipped_%03d' % k
    setattr(GuessSchemeTest, t_method.__name__, t_method)
# Run the suite when this file is executed directly as a script.
if '__main__' == __name__:
    unittest.main()