1
0
# Mirror of https://github.com/scrapy/scrapy.git (synced 2025-03-14 06:48:31 +00:00)
# scrapy/tests/test_utils_request.py
# 2022-07-27 09:41:12 +05:00
#
# 690 lines, 25 KiB, Python

import unittest
import warnings
from hashlib import sha1
from typing import Dict, Mapping, Optional, Tuple, Union
from weakref import WeakKeyDictionary
import pytest
from w3lib.url import canonicalize_url
from scrapy.http import Request
from scrapy.utils.deprecate import ScrapyDeprecationWarning
from scrapy.utils.python import to_bytes
from scrapy.utils.request import (
_deprecated_fingerprint_cache,
_fingerprint_cache,
_request_fingerprint_as_bytes,
fingerprint,
request_authenticate,
request_fingerprint,
request_httprepr,
)
from scrapy.utils.test import get_crawler
class UtilsRequestTest(unittest.TestCase):
    """Tests for the ``request_authenticate`` and ``request_httprepr`` helpers."""

    def test_request_authenticate(self):
        """The helper must set the expected ``Authorization`` header value."""
        request = Request("http://www.example.com")
        request_authenticate(request, 'someuser', 'somepass')
        expected_header = b'Basic c29tZXVzZXI6c29tZXBhc3M='
        self.assertEqual(request.headers['Authorization'], expected_header)

    def test_request_httprepr(self):
        """Check the raw representation of three requests.

        Covers a bare GET, a GET with path and query string, and a POST that
        carries a header and a body.
        """
        bare_get = Request("http://www.example.com")
        self.assertEqual(
            request_httprepr(bare_get),
            b'GET / HTTP/1.1\r\nHost: www.example.com\r\n\r\n',
        )

        get_with_query = Request("http://www.example.com/some/page.html?arg=1")
        self.assertEqual(
            request_httprepr(get_with_query),
            b'GET /some/page.html?arg=1 HTTP/1.1\r\nHost: www.example.com\r\n\r\n',
        )

        post_with_body = Request(
            "http://www.example.com",
            method='POST',
            headers={"Content-type": b"text/html"},
            body=b"Some body",
        )
        expected_post = b'POST / HTTP/1.1\r\nHost: www.example.com\r\nContent-Type: text/html\r\n\r\nSome body'
        self.assertEqual(request_httprepr(post_with_body), expected_post)

    def test_request_httprepr_for_non_http_request(self):
        """Non-HTTP URLs must be accepted without raising."""
        # the representation is not important but it must not fail.
        for url in ("file:///tmp/foo.txt", "ftp://localhost/tmp/foo.txt"):
            request_httprepr(Request(url))
class FingerprintTest(unittest.TestCase):
    """Behavioural tests for the ``fingerprint`` function.

    Subclasses re-run the same suite against other fingerprint callables by
    overriding ``function``, ``cache`` and ``known_hashes``.
    """

    maxDiff = None

    # Callable under test. Wrapped in ``staticmethod`` so that looking it up
    # through ``self`` does not bind the test instance as first argument.
    function: staticmethod = staticmethod(fingerprint)

    # Cache that the callable under test is expected to fill (see
    # ``test_caching``).
    cache: Union[
        "WeakKeyDictionary[Request, Dict[Tuple[Optional[Tuple[bytes, ...]], bool], bytes]]",
        "WeakKeyDictionary[Request, Dict[Tuple[Optional[Tuple[bytes, ...]], bool], str]]",
    ] = _fingerprint_cache

    # Cache key looked up by ``test_caching`` after a call without keyword
    # arguments.
    default_cache_key = (None, False)

    # ``(request, expected fingerprint, keyword arguments)`` triples consumed
    # by ``test_hashes``.
    known_hashes: Tuple[Tuple[Request, Union[bytes, str], Dict], ...] = (
        (
            Request("http://example.org"),
            b'xs\xd7\x0c3uj\x15\xfe\xd7d\x9b\xa9\t\xe0d\xbf\x9cXD',
            {},
        ),
        (
            Request("https://example.org"),
            b'\xc04\x85P,\xaa\x91\x06\xf8t\xb4\xbd*\xd9\xe9\x8a:m\xc3l',
            {},
        ),
        (
            Request("https://example.org?a"),
            b'G\xad\xb8Ck\x19\x1c\xed\x838,\x01\xc4\xde;\xee\xa5\x94a\x0c',
            {},
        ),
        (
            Request("https://example.org?a=b"),
            b'\x024MYb\x8a\xc2\x1e\xbc>\xd6\xac*\xda\x9cF\xc1r\x7f\x17',
            {},
        ),
        (
            Request("https://example.org?a=b&a"),
            b't+\xe8*\xfb\x84\xe3v\x1a}\x88p\xc0\xccB\xd7\x9d\xfez\x96',
            {},
        ),
        (
            Request("https://example.org?a=b&a=c"),
            b'\xda\x1ec\xd0\x9c\x08s`\xb4\x9b\xe2\xb6R\xf8k\xef\xeaQG\xef',
            {},
        ),
        (
            Request("https://example.org", method='POST'),
            b'\x9d\xcdA\x0fT\x02:\xca\xa0}\x90\xda\x05B\xded\x8aN7\x1d',
            {},
        ),
        (
            Request("https://example.org", body=b'a'),
            b'\xc34z>\xd8\x99\x8b\xda7\x05r\x99I\xa8\xa0x;\xa41_',
            {},
        ),
        (
            Request("https://example.org", method='POST', body=b'a'),
            b'5`\xe2y4\xd0\x9d\xee\xe0\xbatw\x87Q\xe8O\xd78\xfc\xe7',
            {},
        ),
        (
            Request("https://example.org#a", headers={'A': b'B'}),
            b'\xc04\x85P,\xaa\x91\x06\xf8t\xb4\xbd*\xd9\xe9\x8a:m\xc3l',
            {},
        ),
        (
            Request("https://example.org#a", headers={'A': b'B'}),
            b']\xc7\x1f\xf2\xafG2\xbc\xa4\xfa\x99\n33\xda\x18\x94\x81U.',
            {'include_headers': ['A']},
        ),
        (
            Request("https://example.org#a", headers={'A': b'B'}),
            b'<\x1a\xeb\x85y\xdeW\xfb\xdcq\x88\xee\xaf\x17\xdd\x0c\xbfH\x18\x1f',
            {'keep_fragments': True},
        ),
        (
            Request("https://example.org#a", headers={'A': b'B'}),
            b'\xc1\xef~\x94\x9bS\xc1\x83\t\xdcz8\x9f\xdc{\x11\x16I.\x11',
            {'include_headers': ['A'], 'keep_fragments': True},
        ),
        (
            Request("https://example.org/ab"),
            b'N\xe5l\xb8\x12@iw\xe2\xf3\x1bp\xea\xffp!u\xe2\x8a\xc6',
            {},
        ),
        (
            Request("https://example.org/a", body=b'b'),
            b'_NOv\xbco$6\xfcW\x9f\xb24g\x9f\xbb\xdd\xa82\xc5',
            {},
        ),
    )

    def test_query_string_key_order(self):
        """Reordering query-string parameters must not change the fingerprint."""
        r1 = Request("http://www.example.com/query?id=111&cat=222")
        r2 = Request("http://www.example.com/query?cat=222&id=111")
        self.assertEqual(self.function(r1), self.function(r1))
        self.assertEqual(self.function(r1), self.function(r2))

    def test_query_string_key_without_value(self):
        """Query strings made of different value-less keys must differ."""
        r1 = Request('http://www.example.com/hnnoticiaj1.aspx?78132,199')
        r2 = Request('http://www.example.com/hnnoticiaj1.aspx?78160,199')
        self.assertNotEqual(self.function(r1), self.function(r2))

    def test_caching(self):
        """The returned fingerprint must be stored in ``cache`` for the request."""
        r1 = Request('http://www.example.com/hnnoticiaj1.aspx?78160,199')
        self.assertEqual(
            self.function(r1),
            self.cache[r1][self.default_cache_key],
        )

    def test_header(self):
        """Headers are ignored when ``include_headers`` is not given."""
        r1 = Request("http://www.example.com/members/offers.html")
        r2 = Request("http://www.example.com/members/offers.html")
        r2.headers['SESSIONID'] = b"somehash"
        self.assertEqual(self.function(r1), self.function(r2))

    def test_headers(self):
        """Check the effect of ``include_headers`` on the fingerprint."""
        r1 = Request("http://www.example.com/")
        r2 = Request("http://www.example.com/")
        r2.headers['Accept-Language'] = b'en'
        r3 = Request("http://www.example.com/")
        r3.headers['Accept-Language'] = b'en'
        r3.headers['SESSIONID'] = b"somehash"

        # ``assertEqual`` only compares its first two arguments (the third is
        # the failure message), so each pair needs its own assertion.
        default_fp1 = self.function(r1)
        self.assertEqual(default_fp1, self.function(r2))
        self.assertEqual(default_fp1, self.function(r3))

        self.assertEqual(
            self.function(r1),
            self.function(r1, include_headers=['Accept-Language']),
        )
        self.assertNotEqual(
            self.function(r1),
            self.function(r2, include_headers=['Accept-Language']),
        )
        # Header names are expected to be matched regardless of case or order.
        self.assertEqual(
            self.function(r3, include_headers=['accept-language', 'sessionid']),
            self.function(r3, include_headers=['SESSIONID', 'Accept-Language']),
        )

    def test_fragment(self):
        """URL fragments only matter when ``keep_fragments=True``."""
        r1 = Request("http://www.example.com/test.html")
        r2 = Request("http://www.example.com/test.html#fragment")
        self.assertEqual(self.function(r1), self.function(r2))
        self.assertEqual(self.function(r1), self.function(r1, keep_fragments=True))
        self.assertNotEqual(self.function(r2), self.function(r2, keep_fragments=True))
        self.assertNotEqual(self.function(r1), self.function(r2, keep_fragments=True))

    def test_method_and_body(self):
        """The HTTP method and the body both contribute to the fingerprint."""
        r1 = Request("http://www.example.com")
        r2 = Request("http://www.example.com", method='POST')
        r3 = Request("http://www.example.com", method='POST', body=b'request body')
        self.assertNotEqual(self.function(r1), self.function(r2))
        self.assertNotEqual(self.function(r2), self.function(r3))

    def test_request_replace(self):
        """A request produced by ``replace()`` must get its own fingerprint."""
        # cached fingerprint must be cleared on request copy
        r1 = Request("http://www.example.com")
        fp1 = self.function(r1)
        r2 = r1.replace(url="http://www.example.com/other")
        fp2 = self.function(r2)
        self.assertNotEqual(fp1, fp2)

    def test_part_separation(self):
        """Moving bytes from the end of the URL into the body must matter."""
        # An old implementation used to serialize request data in a way that
        # would put the body right after the URL.
        r1 = Request("http://www.example.com/foo")
        fp1 = self.function(r1)
        r2 = Request("http://www.example.com/f", body=b'oo')
        fp2 = self.function(r2)
        self.assertNotEqual(fp1, fp2)

    def test_hashes(self):
        """Test hardcoded hashes, to make sure future changes do not introduce
        backward incompatibilities."""
        actual = [
            self.function(request, **kwargs)
            for request, _, kwargs in self.known_hashes
        ]
        expected = [
            _fingerprint
            for _, _fingerprint, _ in self.known_hashes
        ]
        self.assertEqual(actual, expected)
class RequestFingerprintTest(FingerprintTest):
    """Runs the ``FingerprintTest`` suite against ``request_fingerprint``.

    ``ScrapyDeprecationWarning`` is silenced for the duration of each test.
    """

    function = staticmethod(request_fingerprint)
    cache = _deprecated_fingerprint_cache

    # Expected values are hexadecimal strings here. The last two entries share
    # the same hash, which is the behaviour ``test_part_separation`` is marked
    # as an expected failure for.
    known_hashes: Tuple[Tuple[Request, Union[bytes, str], Dict], ...] = (
        (
            Request("http://example.org"),
            'b2e5245ef826fd9576c93bd6e392fce3133fab62',
            {},
        ),
        (
            Request("https://example.org"),
            'bd10a0a89ea32cdee77917320f1309b0da87e892',
            {},
        ),
        (
            Request("https://example.org?a"),
            '2fb7d48ae02f04b749f40caa969c0bc3c43204ce',
            {},
        ),
        (
            Request("https://example.org?a=b"),
            '42e5fe149b147476e3f67ad0670c57b4cc57856a',
            {},
        ),
        (
            Request("https://example.org?a=b&a"),
            'd23a9787cb56c6375c2cae4453c5a8c634526942',
            {},
        ),
        (
            Request("https://example.org?a=b&a=c"),
            '9a18a7a8552a9182b7f1e05d33876409e421e5c5',
            {},
        ),
        (
            Request("https://example.org", method='POST'),
            'ba20a80cb5c5ca460021ceefb3c2467b2bfd1bc6',
            {},
        ),
        (
            Request("https://example.org", body=b'a'),
            '4bb136e54e715a4ea7a9dd1101831765d33f2d60',
            {},
        ),
        (
            Request("https://example.org", method='POST', body=b'a'),
            '6c6595374a304b293be762f7b7be3f54e9947c65',
            {},
        ),
        (
            Request("https://example.org#a", headers={'A': b'B'}),
            'bd10a0a89ea32cdee77917320f1309b0da87e892',
            {},
        ),
        (
            Request("https://example.org#a", headers={'A': b'B'}),
            '515b633cb3ca502a33a9d8c890e889ec1e425e65',
            {'include_headers': ['A']},
        ),
        (
            Request("https://example.org#a", headers={'A': b'B'}),
            '505c96e7da675920dfef58725e8c957dfdb38f47',
            {'keep_fragments': True},
        ),
        (
            Request("https://example.org#a", headers={'A': b'B'}),
            'd6f673cdcb661b7970c2b9a00ee63e87d1e2e5da',
            {'include_headers': ['A'], 'keep_fragments': True},
        ),
        (
            Request("https://example.org/ab"),
            '4e2870fee58582d6f81755e9b8fdefe3cba0c951',
            {},
        ),
        (
            Request("https://example.org/a", body=b'b'),
            '4e2870fee58582d6f81755e9b8fdefe3cba0c951',
            {},
        ),
    )

    def setUp(self) -> None:
        """Silence ``ScrapyDeprecationWarning`` for the duration of one test.

        The warning filters are snapshotted with ``catch_warnings`` and
        restored by a cleanup callback, so the filter configuration that was
        active before the test is left untouched instead of being overridden
        by an extra "default" filter.
        """
        filter_scope = warnings.catch_warnings()
        filter_scope.__enter__()
        self.addCleanup(filter_scope.__exit__, None, None, None)
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)

    @pytest.mark.xfail(reason='known bug kept for backward compatibility', strict=True)
    def test_part_separation(self):
        """Expected to fail for this function; see the ``xfail`` reason."""
        super().test_part_separation()
class RequestFingerprintDeprecationTest(unittest.TestCase):
    """Checks the deprecation messages emitted by ``request_fingerprint``."""

    def _deprecation_messages(self, **kwargs):
        """Call ``request_fingerprint`` and return the recorded warning texts.

        ``kwargs`` are forwarded to ``request_fingerprint``. ``pytest.warns``
        fails the test if no ``ScrapyDeprecationWarning`` is emitted.
        """
        # The record is deliberately not named ``warnings`` so that the
        # imported ``warnings`` module is not shadowed.
        with pytest.warns(ScrapyDeprecationWarning) as recorded:
            request_fingerprint(Request("http://www.example.com"), **kwargs)
        return [str(warning.message) for warning in recorded]

    def test_deprecation_default_parameters(self):
        """Default arguments trigger only the generic deprecation message."""
        messages = self._deprecation_messages()
        self.assertTrue(
            any(
                'Call to deprecated function' in message
                for message in messages
            )
        )
        self.assertFalse(any('non-default' in message for message in messages))

    def test_deprecation_non_default_parameters(self):
        """Non-default arguments also trigger a 'non-default' message."""
        messages = self._deprecation_messages(keep_fragments=True)
        self.assertTrue(
            any(
                'Call to deprecated function' in message
                for message in messages
            )
        )
        self.assertTrue(any('non-default' in message for message in messages))
class RequestFingerprintAsBytesTest(FingerprintTest):
    """Runs the ``FingerprintTest`` suite against ``_request_fingerprint_as_bytes``.

    The expected hashes are shared with ``RequestFingerprintTest``; because
    those are hexadecimal strings they are converted to bytes before being
    compared with the function output.
    """

    function = staticmethod(_request_fingerprint_as_bytes)
    cache = _deprecated_fingerprint_cache
    known_hashes = RequestFingerprintTest.known_hashes

    def test_caching(self):
        """The cached hexadecimal value must decode to the returned bytes."""
        request = Request('http://www.example.com/hnnoticiaj1.aspx?78160,199')
        returned = self.function(request)
        cached_hex = self.cache[request][self.default_cache_key]
        self.assertEqual(returned, bytes.fromhex(cached_hex))

    @pytest.mark.xfail(reason='known bug kept for backward compatibility', strict=True)
    def test_part_separation(self):
        """Expected to fail for this function; see the ``xfail`` reason."""
        super().test_part_separation()

    def test_hashes(self):
        """Compare the function output with the decoded known hashes."""
        actual = []
        expected = []
        for request, hex_digest, kwargs in self.known_hashes:
            actual.append(self.function(request, **kwargs))
            expected.append(bytes.fromhex(hex_digest))
        self.assertEqual(actual, expected)
# Cache used by ``request_fingerprint_2_6``: it maps each request to a dict of
# ``(include_headers, keep_fragments)`` -> hexadecimal digest. The annotation
# names the concrete weak mapping because the function relies on
# ``setdefault`` and stores ``str`` digests.
_fingerprint_cache_2_6: "WeakKeyDictionary[Request, Dict[Tuple[Optional[Tuple[bytes, ...]], bool], str]]" = WeakKeyDictionary()
def request_fingerprint_2_6(request, include_headers=None, keep_fragments=False):
    """Return a SHA-1 hexadecimal fingerprint for *request*.

    Reference implementation that the backward-compatibility tests compare
    current fingerprints against (the name suggests it mirrors Scrapy 2.6).

    The digest covers the method, the canonicalized URL, the body and, when
    *include_headers* is non-empty, the name and values of each listed header
    present in the request. Results are memoized per request and per
    ``(include_headers, keep_fragments)`` pair in ``_fingerprint_cache_2_6``.
    """
    if include_headers:
        include_headers = tuple(
            to_bytes(name.lower()) for name in sorted(include_headers)
        )

    per_request = _fingerprint_cache_2_6.setdefault(request, {})
    key = (include_headers, keep_fragments)
    if key in per_request:
        return per_request[key]

    digest = sha1()
    digest.update(to_bytes(request.method))
    canonical_url = canonicalize_url(request.url, keep_fragments=keep_fragments)
    digest.update(to_bytes(canonical_url))
    digest.update(request.body or b'')
    # A falsy ``include_headers`` (None or empty) contributes nothing.
    for name in include_headers or ():
        if name not in request.headers:
            continue
        digest.update(name)
        for value in request.headers.getlist(name):
            digest.update(value)

    per_request[key] = digest.hexdigest()
    return per_request[key]
# Requests shared by the backward-compatibility tests below. They vary the
# query string, headers, URL fragment, method and body.
REQUEST_OBJECTS_TO_TEST = (
    Request("http://www.example.com/"),
    Request("http://www.example.com/query?id=111&cat=222"),
    Request("http://www.example.com/query?cat=222&id=111"),
    Request('http://www.example.com/hnnoticiaj1.aspx?78132,199'),
    Request('http://www.example.com/hnnoticiaj1.aspx?78160,199'),
    Request("http://www.example.com/members/offers.html"),
    Request(
        "http://www.example.com/members/offers.html",
        headers={'SESSIONID': b"somehash"},
    ),
    Request(
        "http://www.example.com/",
        headers={'Accept-Language': b"en"},
    ),
    Request(
        "http://www.example.com/",
        headers={
            'Accept-Language': b"en",
            'SESSIONID': b"somehash",
        },
    ),
    Request("http://www.example.com/test.html"),
    Request("http://www.example.com/test.html#fragment"),
    Request("http://www.example.com", method='POST'),
    Request("http://www.example.com", method='POST', body=b'request body'),
)
class BackwardCompatibilityTestCase(unittest.TestCase):
    """Compares current fingerprints with ``request_fingerprint_2_6``."""

    def test_function_backward_compatibility(self):
        """``request_fingerprint`` must match the reference for every
        combination of request, ``include_headers`` and ``keep_fragments``."""
        include_headers_to_test = (
            None,
            ['Accept-Language'],
            ['accept-language', 'sessionid'],
            ['SESSIONID', 'Accept-Language'],
        )
        for request_object in REQUEST_OBJECTS_TO_TEST:
            for include_headers in include_headers_to_test:
                for keep_fragments in (False, True):
                    # Warnings raised by the call are irrelevant here.
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        fp = request_fingerprint(
                            request_object,
                            include_headers=include_headers,
                            keep_fragments=keep_fragments,
                        )
                    old_fp = request_fingerprint_2_6(
                        request_object,
                        include_headers=include_headers,
                        keep_fragments=keep_fragments,
                    )
                    self.assertEqual(fp, old_fp)

    def test_component_backward_compatibility(self):
        """The crawler's request fingerprinter, built with
        ``prevent_warnings=False``, must match the reference."""
        for request_object in REQUEST_OBJECTS_TO_TEST:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                crawler = get_crawler(prevent_warnings=False)
                fp = crawler.request_fingerprinter.fingerprint(request_object)
            old_fp = request_fingerprint_2_6(request_object)
            self.assertEqual(fp.hex(), old_fp)

    def test_custom_component_backward_compatibility(self):
        """Tests that the backward-compatible request fingerprinting class featured
        in the documentation is indeed backward compatible and does not cause a
        warning to be logged."""

        class RequestFingerprinter:
            cache = WeakKeyDictionary()

            def fingerprint(self, request):
                if request not in self.cache:
                    fp = sha1()
                    fp.update(to_bytes(request.method))
                    fp.update(to_bytes(canonicalize_url(request.url)))
                    fp.update(request.body or b'')
                    self.cache[request] = fp.digest()
                return self.cache[request]

        for request_object in REQUEST_OBJECTS_TO_TEST:
            # ``record=True`` is required: without it ``catch_warnings``
            # yields ``None`` and the "no warning" assertion below could
            # never fail.
            with warnings.catch_warnings(record=True) as logged_warnings:
                settings = {
                    'REQUEST_FINGERPRINTER_CLASS': RequestFingerprinter,
                }
                crawler = get_crawler(settings_dict=settings)
                fp = crawler.request_fingerprinter.fingerprint(request_object)
            old_fp = request_fingerprint_2_6(request_object)
            self.assertEqual(fp.hex(), old_fp)
            self.assertFalse(logged_warnings)
class RequestFingerprinterTestCase(unittest.TestCase):
    """Checks which fingerprint function the crawler's fingerprinter uses for
    each ``REQUEST_FINGERPRINTER_IMPLEMENTATION`` value, and whether building
    the crawler emits warnings."""

    @staticmethod
    def _crawler_and_warnings(**crawler_kwargs):
        """Build a crawler and return it with the warnings recorded meanwhile."""
        with warnings.catch_warnings(record=True) as recorded:
            built = get_crawler(**crawler_kwargs)
        return built, recorded

    def _assert_same_fingerprint(self, crawler, reference):
        """Assert the crawler's fingerprinter agrees with *reference*."""
        request = Request('https://example.com')
        self.assertEqual(
            crawler.request_fingerprinter.fingerprint(request),
            reference(request),
        )

    def test_default_implementation(self):
        """Without the setting: legacy fingerprints and at least one warning."""
        crawler, logged_warnings = self._crawler_and_warnings(prevent_warnings=False)
        self._assert_same_fingerprint(crawler, _request_fingerprint_as_bytes)
        self.assertTrue(logged_warnings)

    def test_deprecated_implementation(self):
        """'PREVIOUS_VERSION': legacy fingerprints and at least one warning."""
        crawler, logged_warnings = self._crawler_and_warnings(
            settings_dict={
                'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'PREVIOUS_VERSION',
            },
        )
        self._assert_same_fingerprint(crawler, _request_fingerprint_as_bytes)
        self.assertTrue(logged_warnings)

    def test_recommended_implementation(self):
        """'VERSION': ``fingerprint`` is used and nothing is warned about."""
        crawler, logged_warnings = self._crawler_and_warnings(
            settings_dict={
                'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION',
            },
        )
        self._assert_same_fingerprint(crawler, fingerprint)
        self.assertFalse(logged_warnings)

    def test_unknown_implementation(self):
        """An unrecognised implementation value makes ``get_crawler`` raise."""
        unknown = {
            'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.5',
        }
        with self.assertRaises(ValueError):
            get_crawler(settings_dict=unknown)
class CustomRequestFingerprinterTestCase(unittest.TestCase):
    """Exercises user-defined classes plugged in through the
    ``REQUEST_FINGERPRINTER_CLASS`` setting."""

    def test_include_headers(self):
        """A fingerprinter may take selected headers into account."""

        class RequestFingerprinter:

            def fingerprint(self, request):
                return fingerprint(request, include_headers=['X-ID'])

        settings = {
            'REQUEST_FINGERPRINTER_CLASS': RequestFingerprinter,
        }
        crawler = get_crawler(settings_dict=settings)

        r1 = Request("http://www.example.com", headers={'X-ID': '1'})
        fp1 = crawler.request_fingerprinter.fingerprint(r1)
        r2 = Request("http://www.example.com", headers={'X-ID': '2'})
        fp2 = crawler.request_fingerprinter.fingerprint(r2)
        self.assertNotEqual(fp1, fp2)

    def test_dont_canonicalize(self):
        """A fingerprinter may hash the raw URL, making parameter order matter."""

        class RequestFingerprinter:
            cache = WeakKeyDictionary()

            def fingerprint(self, request):
                if request not in self.cache:
                    fp = sha1()
                    fp.update(to_bytes(request.url))
                    self.cache[request] = fp.digest()
                return self.cache[request]

        settings = {
            'REQUEST_FINGERPRINTER_CLASS': RequestFingerprinter,
        }
        crawler = get_crawler(settings_dict=settings)

        r1 = Request("http://www.example.com?a=1&a=2")
        fp1 = crawler.request_fingerprinter.fingerprint(r1)
        r2 = Request("http://www.example.com?a=2&a=1")
        fp2 = crawler.request_fingerprinter.fingerprint(r2)
        self.assertNotEqual(fp1, fp2)

    def test_meta(self):
        """A fingerprinter may let ``request.meta`` override the fingerprint."""

        class RequestFingerprinter:

            def fingerprint(self, request):
                if 'fingerprint' in request.meta:
                    return request.meta['fingerprint']
                return fingerprint(request)

        settings = {
            'REQUEST_FINGERPRINTER_CLASS': RequestFingerprinter,
        }
        crawler = get_crawler(settings_dict=settings)

        r1 = Request("http://www.example.com")
        fp1 = crawler.request_fingerprinter.fingerprint(r1)
        r2 = Request("http://www.example.com", meta={'fingerprint': 'a'})
        fp2 = crawler.request_fingerprinter.fingerprint(r2)
        r3 = Request("http://www.example.com", meta={'fingerprint': 'a'})
        fp3 = crawler.request_fingerprinter.fingerprint(r3)
        r4 = Request("http://www.example.com", meta={'fingerprint': 'b'})
        fp4 = crawler.request_fingerprinter.fingerprint(r4)

        self.assertNotEqual(fp1, fp2)
        self.assertNotEqual(fp1, fp4)
        self.assertNotEqual(fp2, fp4)
        self.assertEqual(fp2, fp3)

    def test_from_crawler(self):
        """A fingerprinter defining ``from_crawler`` is built from the crawler."""

        class RequestFingerprinter:

            @classmethod
            def from_crawler(cls, crawler):
                return cls(crawler)

            def __init__(self, crawler):
                self._fingerprint = crawler.settings['FINGERPRINT']

            def fingerprint(self, request):
                return self._fingerprint

        settings = {
            'REQUEST_FINGERPRINTER_CLASS': RequestFingerprinter,
            'FINGERPRINT': b'fingerprint',
        }
        crawler = get_crawler(settings_dict=settings)

        request = Request("http://www.example.com")
        # Named ``fp`` so the imported ``fingerprint`` function is not shadowed.
        fp = crawler.request_fingerprinter.fingerprint(request)
        self.assertEqual(fp, settings['FINGERPRINT'])

    def test_from_settings(self):
        """A fingerprinter defining ``from_settings`` is built from the settings."""

        class RequestFingerprinter:

            @classmethod
            def from_settings(cls, settings):
                return cls(settings)

            def __init__(self, settings):
                self._fingerprint = settings['FINGERPRINT']

            def fingerprint(self, request):
                return self._fingerprint

        settings = {
            'REQUEST_FINGERPRINTER_CLASS': RequestFingerprinter,
            'FINGERPRINT': b'fingerprint',
        }
        crawler = get_crawler(settings_dict=settings)

        request = Request("http://www.example.com")
        # Named ``fp`` so the imported ``fingerprint`` function is not shadowed.
        fp = crawler.request_fingerprinter.fingerprint(request)
        self.assertEqual(fp, settings['FINGERPRINT'])

    def test_from_crawler_and_settings(self):
        """When both factories exist, ``from_crawler`` is the one used."""

        class RequestFingerprinter:

            # This method is ignored due to the presence of from_crawler
            @classmethod
            def from_settings(cls, settings):
                return cls(settings)

            @classmethod
            def from_crawler(cls, crawler):
                return cls(crawler)

            def __init__(self, crawler):
                self._fingerprint = crawler.settings['FINGERPRINT']

            def fingerprint(self, request):
                return self._fingerprint

        settings = {
            'REQUEST_FINGERPRINTER_CLASS': RequestFingerprinter,
            'FINGERPRINT': b'fingerprint',
        }
        crawler = get_crawler(settings_dict=settings)

        request = Request("http://www.example.com")
        # Named ``fp`` so the imported ``fingerprint`` function is not shadowed.
        fp = crawler.request_fingerprinter.fingerprint(request)
        self.assertEqual(fp, settings['FINGERPRINT'])
# Run the tests with the unittest runner when the module is executed directly.
if __name__ == "__main__":
    unittest.main()