diff --git a/scrapy/utils/request.py b/scrapy/utils/request.py index 9c143b83a..fb5af66a2 100644 --- a/scrapy/utils/request.py +++ b/scrapy/utils/request.py @@ -16,7 +16,7 @@ from scrapy.utils.httpobj import urlparse_cached _fingerprint_cache = weakref.WeakKeyDictionary() -def request_fingerprint(request, include_headers=None): +def request_fingerprint(request, include_headers=None, keep_fragments=False): """ Return the request fingerprint. @@ -42,15 +42,21 @@ def request_fingerprint(request, include_headers=None): the fingeprint. If you want to include specific headers use the include_headers argument, which is a list of Request headers to include. + Also, servers usually ignore fragments in urls when handling requests, + so they are also ignored by default when calculating the fingerprint. + If you want to include them, set the keep_fragments argument to True + (for instance when handling requests with a headless browser). + """ if include_headers: include_headers = tuple(to_bytes(h.lower()) for h in sorted(include_headers)) cache = _fingerprint_cache.setdefault(request, {}) - if include_headers not in cache: + cache_key = (include_headers, keep_fragments) + if cache_key not in cache: fp = hashlib.sha1() fp.update(to_bytes(request.method)) - fp.update(to_bytes(canonicalize_url(request.url))) + fp.update(to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments))) fp.update(request.body or b'') if include_headers: for hdr in include_headers: @@ -58,8 +64,8 @@ def request_fingerprint(request, include_headers=None): fp.update(hdr) for v in request.headers.getlist(hdr): fp.update(v) - cache[include_headers] = fp.hexdigest() - return cache[include_headers] + cache[cache_key] = fp.hexdigest() + return cache[cache_key] def request_authenticate(request, username, password): diff --git a/tests/test_utils_request.py b/tests/test_utils_request.py index e8a4eb3ea..625a32048 100644 --- a/tests/test_utils_request.py +++ b/tests/test_utils_request.py @@ -17,7 +17,7 @@ class UtilsRequestTest(unittest.TestCase): self.assertNotEqual(request_fingerprint(r1), request_fingerprint(r2)) # make sure caching is working - self.assertEqual(request_fingerprint(r1), _fingerprint_cache[r1][None]) + self.assertEqual(request_fingerprint(r1), _fingerprint_cache[r1][(None, False)]) r1 = Request("http://www.example.com/members/offers.html") r2 = Request("http://www.example.com/members/offers.html") @@ -42,6 +42,13 @@ class UtilsRequestTest(unittest.TestCase): self.assertEqual(request_fingerprint(r3, include_headers=['accept-language', 'sessionid']), request_fingerprint(r3, include_headers=['SESSIONID', 'Accept-Language'])) + r1 = Request("http://www.example.com/test.html") + r2 = Request("http://www.example.com/test.html#fragment") + self.assertEqual(request_fingerprint(r1), request_fingerprint(r2)) + self.assertEqual(request_fingerprint(r1), request_fingerprint(r1, keep_fragments=True)) + self.assertNotEqual(request_fingerprint(r2), request_fingerprint(r2, keep_fragments=True)) + self.assertNotEqual(request_fingerprint(r1), request_fingerprint(r2, keep_fragments=True)) + r1 = Request("http://www.example.com") r2 = Request("http://www.example.com", method='POST') r3 = Request("http://www.example.com", method='POST', body=b'request body')