1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-06 11:00:46 +00:00
scrapy/tests/test_utils_url.py
2025-01-27 11:07:09 +01:00

633 lines
23 KiB
Python

import unittest
import warnings
import pytest
from scrapy.linkextractors import IGNORED_EXTENSIONS
from scrapy.spiders import Spider
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.url import ( # type: ignore[attr-defined]
_is_filesystem_path,
_public_w3lib_objects,
add_http_if_no_scheme,
guess_scheme,
strip_url,
url_has_any_extension,
url_is_from_any_domain,
url_is_from_spider,
)
# Dotted names of modules whose doctests are associated with this test module.
# NOTE(review): nothing in this file reads ``__doctests__``; presumably the
# project's test tooling consumes it - confirm before renaming or removing.
__doctests__ = ["scrapy.utils.url"]
class UrlUtilsTest(unittest.TestCase):
    """Checks for the domain-, spider- and extension-matching URL helpers."""

    def _check_spider_urls(self, spider, accepted=(), rejected=()):
        """Assert ``url_is_from_spider`` results for *spider*.

        Every URL in *accepted* must give a truthy result and every URL in
        *rejected* a falsy one; accepted URLs are checked first.
        """
        for url in accepted:
            self.assertTrue(url_is_from_spider(url, spider))
        for url in rejected:
            self.assertFalse(url_is_from_spider(url, spider))

    def test_url_is_from_any_domain(self):
        javascript_url = (
            "javascript:%20document.orderform_2581_1190810811.mode.value=%27add%27;%20"
            "javascript:%20document.orderform_2581_1190810811.submit%28%29"
        )
        # (url, candidate domains, expected result), listed in check order.
        expectations = (
            (
                "http://www.wheele-bin-art.co.uk/get/product/123",
                ["wheele-bin-art.co.uk"],
                True,
            ),
            (
                "http://www.wheele-bin-art.co.uk/get/product/123",
                ["art.co.uk"],
                False,
            ),
            (
                "http://wheele-bin-art.co.uk/get/product/123",
                ["wheele-bin-art.co.uk"],
                True,
            ),
            (
                "http://wheele-bin-art.co.uk/get/product/123",
                ["art.co.uk"],
                False,
            ),
            # Matching is expected to ignore letter case on both sides.
            (
                "http://www.Wheele-Bin-Art.co.uk/get/product/123",
                ["wheele-bin-art.CO.UK"],
                True,
            ),
            (
                "http://www.Wheele-Bin-Art.co.uk/get/product/123",
                ["WHEELE-BIN-ART.CO.UK"],
                True,
            ),
            # An explicit port is part of what gets compared.
            (
                "http://192.169.0.15:8080/mypage.html",
                ["192.169.0.15:8080"],
                True,
            ),
            (
                "http://192.169.0.15:8080/mypage.html",
                ["192.169.0.15"],
                False,
            ),
            # javascript: pseudo-URLs never belong to a domain, even when
            # the text happens to end with the domain name.
            (javascript_url, ["testdomain.com"], False),
            (javascript_url + ".testdomain.com", ["testdomain.com"], False),
        )
        for url, domains, expected in expectations:
            verify = self.assertTrue if expected else self.assertFalse
            verify(url_is_from_any_domain(url, domains))

    def test_url_is_from_spider(self):
        self._check_spider_urls(
            Spider(name="example.com"),
            accepted=[
                "http://www.example.com/some/page.html",
                "http://sub.example.com/some/page.html",
            ],
            rejected=[
                "http://www.example.org/some/page.html",
                "http://www.example.net/some/page.html",
            ],
        )

    def test_url_is_from_spider_class_attributes(self):
        class MySpider(Spider):
            name = "example.com"

        # The spider class itself (not an instance) is passed on purpose.
        self._check_spider_urls(
            MySpider,
            accepted=[
                "http://www.example.com/some/page.html",
                "http://sub.example.com/some/page.html",
            ],
            rejected=[
                "http://www.example.org/some/page.html",
                "http://www.example.net/some/page.html",
            ],
        )

    def test_url_is_from_spider_with_allowed_domains(self):
        self._check_spider_urls(
            Spider(
                name="example.com", allowed_domains=["example.org", "example.net"]
            ),
            accepted=[
                "http://www.example.com/some/page.html",
                "http://sub.example.com/some/page.html",
                "http://example.com/some/page.html",
                "http://www.example.org/some/page.html",
                "http://www.example.net/some/page.html",
            ],
            rejected=[
                "http://www.example.us/some/page.html",
            ],
        )

        # ``allowed_domains`` given as a set and then as a tuple.
        for domains in (
            {"example.com", "example.net"},
            ("example.com", "example.net"),
        ):
            self._check_spider_urls(
                Spider(name="example.com", allowed_domains=domains),
                accepted=["http://www.example.com/some/page.html"],
            )

    def test_url_is_from_spider_with_allowed_domains_class_attributes(self):
        class MySpider(Spider):
            name = "example.com"
            allowed_domains = ("example.org", "example.net")

        self._check_spider_urls(
            MySpider,
            accepted=[
                "http://www.example.com/some/page.html",
                "http://sub.example.com/some/page.html",
                "http://example.com/some/page.html",
                "http://www.example.org/some/page.html",
                "http://www.example.net/some/page.html",
            ],
            rejected=[
                "http://www.example.us/some/page.html",
            ],
        )

    def test_url_has_any_extension(self):
        # Build the dotted form (".pdf", ".doc", ...) of every ignored extension.
        deny_extensions = set()
        for extension in arg_to_iter(IGNORED_EXTENSIONS):
            deny_extensions.add("." + extension)

        # (url, expected result), listed in check order.
        expectations = (
            ("http://www.example.com/archive.tar.gz", True),
            ("http://www.example.com/page.doc", True),
            ("http://www.example.com/page.pdf", True),
            ("http://www.example.com/page.htm", False),
            ("http://www.example.com/", False),
            ("http://www.example.com/page.doc.html", False),
        )
        for url, expected in expectations:
            verify = self.assertTrue if expected else self.assertFalse
            verify(url_has_any_extension(url, deny_extensions))
class AddHttpIfNoScheme(unittest.TestCase):
    """Checks for ``add_http_if_no_scheme``.

    Three input families are covered: scheme-less URLs, URLs that already
    carry a scheme, and protocol-relative (``//host``) URLs.
    """

    def _assert_result(self, url, expected):
        """Assert that ``add_http_if_no_scheme(url)`` equals *expected*."""
        self.assertEqual(add_http_if_no_scheme(url), expected)

    def _assert_untouched(self, url):
        """Assert that ``add_http_if_no_scheme`` returns *url* unchanged."""
        self._assert_result(url, url)

    # --- scheme-less input: "http://" is expected to be prepended ---

    def test_add_scheme(self):
        self._assert_result("www.example.com", "http://www.example.com")

    def test_without_subdomain(self):
        self._assert_result("example.com", "http://example.com")

    def test_path(self):
        self._assert_result(
            "www.example.com/some/page.html",
            "http://www.example.com/some/page.html",
        )

    def test_port(self):
        self._assert_result("www.example.com:80", "http://www.example.com:80")

    def test_fragment(self):
        self._assert_result(
            "www.example.com/some/page#frag",
            "http://www.example.com/some/page#frag",
        )

    def test_query(self):
        self._assert_result(
            "www.example.com/do?a=1&b=2&c=3",
            "http://www.example.com/do?a=1&b=2&c=3",
        )

    def test_username_password(self):
        self._assert_result(
            "username:password@www.example.com",
            "http://username:password@www.example.com",
        )

    def test_complete_url(self):
        self._assert_result(
            "username:password@www.example.com:80/some/page/do?a=1&b=2&c=3#frag",
            "http://username:password@www.example.com:80/some/page/do?a=1&b=2&c=3#frag",
        )

    # --- input that already has "http://": expected back unchanged ---

    def test_preserve_http(self):
        self._assert_untouched("http://www.example.com")

    def test_preserve_http_without_subdomain(self):
        self._assert_untouched("http://example.com")

    def test_preserve_http_path(self):
        self._assert_untouched("http://www.example.com/some/page.html")

    def test_preserve_http_port(self):
        self._assert_untouched("http://www.example.com:80")

    def test_preserve_http_fragment(self):
        self._assert_untouched("http://www.example.com/some/page#frag")

    def test_preserve_http_query(self):
        self._assert_untouched("http://www.example.com/do?a=1&b=2&c=3")

    def test_preserve_http_username_password(self):
        self._assert_untouched("http://username:password@www.example.com")

    def test_preserve_http_complete_url(self):
        self._assert_untouched(
            "http://username:password@www.example.com:80/some/page/do?a=1&b=2&c=3#frag"
        )

    # --- protocol-relative input: "http:" is expected in front of "//" ---

    def test_protocol_relative(self):
        self._assert_result("//www.example.com", "http://www.example.com")

    def test_protocol_relative_without_subdomain(self):
        self._assert_result("//example.com", "http://example.com")

    def test_protocol_relative_path(self):
        self._assert_result(
            "//www.example.com/some/page.html",
            "http://www.example.com/some/page.html",
        )

    def test_protocol_relative_port(self):
        self._assert_result("//www.example.com:80", "http://www.example.com:80")

    def test_protocol_relative_fragment(self):
        self._assert_result(
            "//www.example.com/some/page#frag",
            "http://www.example.com/some/page#frag",
        )

    def test_protocol_relative_query(self):
        self._assert_result(
            "//www.example.com/do?a=1&b=2&c=3",
            "http://www.example.com/do?a=1&b=2&c=3",
        )

    def test_protocol_relative_username_password(self):
        self._assert_result(
            "//username:password@www.example.com",
            "http://username:password@www.example.com",
        )

    def test_protocol_relative_complete_url(self):
        self._assert_result(
            "//username:password@www.example.com:80/some/page/do?a=1&b=2&c=3#frag",
            "http://username:password@www.example.com:80/some/page/do?a=1&b=2&c=3#frag",
        )

    # --- other schemes: expected back unchanged ---

    def test_preserve_https(self):
        self._assert_untouched("https://www.example.com")

    def test_preserve_ftp(self):
        self._assert_untouched("ftp://www.example.com")
class GuessSchemeTest(unittest.TestCase):
    """Container for ``guess_scheme`` tests.

    The class body is intentionally empty: its test methods are generated
    and attached with ``setattr`` by module-level code after the definition.
    """
def create_guess_scheme_t(args):
    """Build a test method that checks the scheme chosen by ``guess_scheme``.

    ``args[0]`` is the text handed to ``guess_scheme`` and ``args[1]`` is the
    prefix the resulting URL must start with. Both are looked up when the
    returned method runs, not when it is created.
    """

    def do_expected(self):
        candidate = args[0]
        url = guess_scheme(candidate)
        wanted_prefix = args[1]
        assert url.startswith(wanted_prefix), (
            f"Wrong scheme guessed: for `{candidate}` got `{url}`, expected `{wanted_prefix}...`"
        )

    return do_expected
def create_skipped_scheme_t(args):
    """Build a test method that always skips itself.

    The returned method raises ``unittest.SkipTest`` with ``args[2]`` as the
    reason; the other elements of ``args`` are not used.
    """

    def do_expected(self):
        reason = args[2]
        raise unittest.SkipTest(reason)

    return do_expected
# Attach one generated test per (input, expected URL prefix) pair to
# GuessSchemeTest. Methods are numbered from 1: test_uri_001, test_uri_002, ...
for k, args in enumerate(
    (
        ("/index", "file://"),
        ("/index.html", "file://"),
        ("./index.html", "file://"),
        ("../index.html", "file://"),
        ("../../index.html", "file://"),
        ("./data/index.html", "file://"),
        (".hidden/data/index.html", "file://"),
        ("/home/user/www/index.html", "file://"),
        ("//home/user/www/index.html", "file://"),
        ("file:///home/user/www/index.html", "file://"),
        ("index.html", "http://"),
        ("example.com", "http://"),
        ("www.example.com", "http://"),
        ("www.example.com/index.html", "http://"),
        ("http://example.com", "http://"),
        ("http://example.com/index.html", "http://"),
        ("localhost", "http://"),
        ("localhost/index.html", "http://"),
        # some corner cases (default to http://)
        ("/", "http://"),
        (".../test", "http://"),
    ),
    1,
):
    t_method = create_guess_scheme_t(args)
    # Give the closure its final name first, then register it under that name.
    t_method.__name__ = f"test_uri_{k:03}"
    setattr(GuessSchemeTest, t_method.__name__, t_method)
# TODO: the following tests do not pass with current implementation
# Each entry is (input, expected URL prefix, skip reason); the generated
# methods only raise SkipTest with that reason. Numbering starts at 1.
for k, skip_args in enumerate(
    (
        (
            r"C:\absolute\path\to\a\file.html",
            "file://",
            "Windows filepath are not supported for scrapy shell",
        ),
    ),
    1,
):
    t_method = create_skipped_scheme_t(skip_args)
    # Give the closure its final name first, then register it under that name.
    t_method.__name__ = f"test_uri_skipped_{k:03}"
    setattr(GuessSchemeTest, t_method.__name__, t_method)
class StripUrl(unittest.TestCase):
    """Checks for ``strip_url`` and its keyword options.

    Table-driven tests run every case inside ``self.subTest(...)`` so that a
    failure names the offending input and, on runners that support subtests,
    the remaining cases are still executed.
    """

    def _assert_stripped(self, cases, **strip_kwargs):
        """Assert ``strip_url(url, **strip_kwargs) == expected`` for each case.

        *cases* is an iterable of ``(url, expected)`` pairs; each pair is
        checked in its own subtest, labelled with the input URL.
        """
        for url, expected in cases:
            with self.subTest(url=url):
                self.assertEqual(strip_url(url, **strip_kwargs), expected)

    def test_noop(self):
        self.assertEqual(
            strip_url("http://www.example.com/index.html"),
            "http://www.example.com/index.html",
        )

    def test_noop_query_string(self):
        self.assertEqual(
            strip_url("http://www.example.com/index.html?somekey=somevalue"),
            "http://www.example.com/index.html?somekey=somevalue",
        )

    def test_fragments(self):
        self.assertEqual(
            strip_url(
                "http://www.example.com/index.html?somekey=somevalue#section",
                strip_fragment=False,
            ),
            "http://www.example.com/index.html?somekey=somevalue#section",
        )

    def test_path(self):
        # ``origin_only`` differs per case here, so the shared helper (which
        # applies one set of options to every case) is not used.
        for input_url, origin, output_url in [
            ("http://www.example.com/", False, "http://www.example.com/"),
            ("http://www.example.com", False, "http://www.example.com"),
            ("http://www.example.com", True, "http://www.example.com/"),
        ]:
            with self.subTest(url=input_url, origin_only=origin):
                self.assertEqual(
                    strip_url(input_url, origin_only=origin), output_url
                )

    def test_credentials(self):
        self._assert_stripped(
            [
                (
                    "http://username@www.example.com/index.html?somekey=somevalue#section",
                    "http://www.example.com/index.html?somekey=somevalue",
                ),
                (
                    "https://username:@www.example.com/index.html?somekey=somevalue#section",
                    "https://www.example.com/index.html?somekey=somevalue",
                ),
                (
                    "ftp://username:password@www.example.com/index.html?somekey=somevalue#section",
                    "ftp://www.example.com/index.html?somekey=somevalue",
                ),
            ],
            strip_credentials=True,
        )

    def test_credentials_encoded_delims(self):
        self._assert_stripped(
            [
                # user: "username@"
                # password: none
                (
                    "http://username%40@www.example.com/index.html?somekey=somevalue#section",
                    "http://www.example.com/index.html?somekey=somevalue",
                ),
                # user: "username:pass"
                # password: ""
                (
                    "https://username%3Apass:@www.example.com/index.html?somekey=somevalue#section",
                    "https://www.example.com/index.html?somekey=somevalue",
                ),
                # user: "me"
                # password: "user@domain.com"
                (
                    "ftp://me:user%40domain.com@www.example.com/index.html?somekey=somevalue#section",
                    "ftp://www.example.com/index.html?somekey=somevalue",
                ),
            ],
            strip_credentials=True,
        )

    def test_default_ports_creds_off(self):
        # No keyword options: strip_url() is called with its defaults only.
        self._assert_stripped(
            [
                (
                    "http://username:password@www.example.com:80/index.html?somekey=somevalue#section",
                    "http://www.example.com/index.html?somekey=somevalue",
                ),
                (
                    "http://username:password@www.example.com:8080/index.html#section",
                    "http://www.example.com:8080/index.html",
                ),
                (
                    "http://username:password@www.example.com:443/index.html?somekey=somevalue&someotherkey=sov#section",
                    "http://www.example.com:443/index.html?somekey=somevalue&someotherkey=sov",
                ),
                (
                    "https://username:password@www.example.com:443/index.html",
                    "https://www.example.com/index.html",
                ),
                (
                    "https://username:password@www.example.com:442/index.html",
                    "https://www.example.com:442/index.html",
                ),
                (
                    "https://username:password@www.example.com:80/index.html",
                    "https://www.example.com:80/index.html",
                ),
                (
                    "ftp://username:password@www.example.com:21/file.txt",
                    "ftp://www.example.com/file.txt",
                ),
                (
                    "ftp://username:password@www.example.com:221/file.txt",
                    "ftp://www.example.com:221/file.txt",
                ),
            ]
        )

    def test_default_ports(self):
        self._assert_stripped(
            [
                (
                    "http://username:password@www.example.com:80/index.html",
                    "http://username:password@www.example.com/index.html",
                ),
                (
                    "http://username:password@www.example.com:8080/index.html",
                    "http://username:password@www.example.com:8080/index.html",
                ),
                (
                    "http://username:password@www.example.com:443/index.html",
                    "http://username:password@www.example.com:443/index.html",
                ),
                (
                    "https://username:password@www.example.com:443/index.html",
                    "https://username:password@www.example.com/index.html",
                ),
                (
                    "https://username:password@www.example.com:442/index.html",
                    "https://username:password@www.example.com:442/index.html",
                ),
                (
                    "https://username:password@www.example.com:80/index.html",
                    "https://username:password@www.example.com:80/index.html",
                ),
                (
                    "ftp://username:password@www.example.com:21/file.txt",
                    "ftp://username:password@www.example.com/file.txt",
                ),
                (
                    "ftp://username:password@www.example.com:221/file.txt",
                    "ftp://username:password@www.example.com:221/file.txt",
                ),
            ],
            strip_default_port=True,
            strip_credentials=False,
        )

    def test_default_ports_keep(self):
        self._assert_stripped(
            [
                (
                    "http://username:password@www.example.com:80/index.html?somekey=somevalue&someotherkey=sov#section",
                    "http://username:password@www.example.com:80/index.html?somekey=somevalue&someotherkey=sov",
                ),
                (
                    "http://username:password@www.example.com:8080/index.html?somekey=somevalue&someotherkey=sov#section",
                    "http://username:password@www.example.com:8080/index.html?somekey=somevalue&someotherkey=sov",
                ),
                (
                    "http://username:password@www.example.com:443/index.html",
                    "http://username:password@www.example.com:443/index.html",
                ),
                (
                    "https://username:password@www.example.com:443/index.html",
                    "https://username:password@www.example.com:443/index.html",
                ),
                (
                    "https://username:password@www.example.com:442/index.html",
                    "https://username:password@www.example.com:442/index.html",
                ),
                (
                    "https://username:password@www.example.com:80/index.html",
                    "https://username:password@www.example.com:80/index.html",
                ),
                (
                    "ftp://username:password@www.example.com:21/file.txt",
                    "ftp://username:password@www.example.com:21/file.txt",
                ),
                (
                    "ftp://username:password@www.example.com:221/file.txt",
                    "ftp://username:password@www.example.com:221/file.txt",
                ),
            ],
            strip_default_port=False,
            strip_credentials=False,
        )

    def test_origin_only(self):
        self._assert_stripped(
            [
                (
                    "http://username:password@www.example.com/index.html",
                    "http://www.example.com/",
                ),
                (
                    "http://username:password@www.example.com:80/foo/bar?query=value#somefrag",
                    "http://www.example.com/",
                ),
                (
                    "http://username:password@www.example.com:8008/foo/bar?query=value#somefrag",
                    "http://www.example.com:8008/",
                ),
                (
                    "https://username:password@www.example.com:443/index.html",
                    "https://www.example.com/",
                ),
            ],
            origin_only=True,
        )
class IsPathTestCase(unittest.TestCase):
    """Checks for the private ``_is_filesystem_path`` helper."""

    def test_path(self):
        # https://en.wikipedia.org/wiki/Path_(computing)#Representations_of_paths_by_operating_system_and_shell
        # Unix-like OS, Microsoft Windows / cmd.exe
        filesystem_paths = (
            "/home/user/docs/Letter.txt",
            "./inthisdir",
            "../../greatgrandparent",
            "~/.rcinfo",
            r"C:\user\docs\Letter.txt",
            "/user/docs/Letter.txt",
            r"C:\Letter.txt",
            r"\\Server01\user\docs\Letter.txt",
            r"\\?\UNC\Server01\user\docs\Letter.txt",
            r"\\?\C:\user\docs\Letter.txt",
            r"C:\user\docs\somefile.ext:alternate_stream_name",
        )
        not_filesystem_paths = (r"https://example.com",)

        # Paths first, then non-paths, so the checks run in a fixed order.
        cases = [(value, True) for value in filesystem_paths]
        cases.extend((value, False) for value in not_filesystem_paths)

        for candidate, expected in cases:
            # The input doubles as the failure message to identify the case.
            self.assertEqual(_is_filesystem_path(candidate), expected, candidate)
@pytest.mark.parametrize(
    "obj_name",
    [
        "_unquotepath",
        "_safe_chars",
        "parse_url",
        *_public_w3lib_objects,
    ],
)
def test_deprecated_imports_from_w3lib(obj_name):
    """Accessing ``scrapy.utils.url.<obj_name>`` must emit a deprecation warning.

    The first recorded warning is expected to carry a message pointing at the
    ``w3lib.url`` object of the same name; ``_safe_chars`` is described as an
    "attribute" in that message and every other name as a "function".
    """
    from importlib import import_module

    obj_type = "attribute" if obj_name == "_safe_chars" else "function"
    message = f"The scrapy.utils.url.{obj_name} {obj_type} is deprecated, use w3lib.url.{obj_name} instead."

    with warnings.catch_warnings(record=True) as warns:
        # Record every warning regardless of the filters active in the
        # surrounding test run; otherwise an ambient "ignore"/"once" filter
        # could leave ``warns`` empty and fail the test with an IndexError.
        warnings.simplefilter("always")
        getattr(import_module("scrapy.utils.url"), obj_name)

    assert warns, f"no warning recorded when accessing scrapy.utils.url.{obj_name}"
    assert message in warns[0].message.args