1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-23 04:03:55 +00:00

Add a keep_fragments parameter to the request_fingerprint function (#4104)

This commit is contained in:
Benjamin Ooghe-Tabanou 2019-10-30 09:13:36 +01:00 committed by Adrián Chaves
parent 66cbceeb0a
commit 6d6da78eda
2 changed files with 19 additions and 6 deletions

View File

@ -16,7 +16,7 @@ from scrapy.utils.httpobj import urlparse_cached
_fingerprint_cache = weakref.WeakKeyDictionary()
def request_fingerprint(request, include_headers=None):
def request_fingerprint(request, include_headers=None, keep_fragments=False):
"""
Return the request fingerprint.
@ -42,15 +42,21 @@ def request_fingerprint(request, include_headers=None):
the fingerprint. If you want to include specific headers use the
include_headers argument, which is a list of Request headers to include.
Also, servers usually ignore fragments in URLs when handling requests,
so they are also ignored by default when calculating the fingerprint.
If you want to include them, set the keep_fragments argument to True
(for instance when handling requests with a headless browser).
"""
if include_headers:
include_headers = tuple(to_bytes(h.lower())
for h in sorted(include_headers))
cache = _fingerprint_cache.setdefault(request, {})
if include_headers not in cache:
cache_key = (include_headers, keep_fragments)
if cache_key not in cache:
fp = hashlib.sha1()
fp.update(to_bytes(request.method))
fp.update(to_bytes(canonicalize_url(request.url)))
fp.update(to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments)))
fp.update(request.body or b'')
if include_headers:
for hdr in include_headers:
@ -58,8 +64,8 @@ def request_fingerprint(request, include_headers=None):
fp.update(hdr)
for v in request.headers.getlist(hdr):
fp.update(v)
cache[include_headers] = fp.hexdigest()
return cache[include_headers]
cache[cache_key] = fp.hexdigest()
return cache[cache_key]
def request_authenticate(request, username, password):

View File

@ -17,7 +17,7 @@ class UtilsRequestTest(unittest.TestCase):
self.assertNotEqual(request_fingerprint(r1), request_fingerprint(r2))
# make sure caching is working
self.assertEqual(request_fingerprint(r1), _fingerprint_cache[r1][None])
self.assertEqual(request_fingerprint(r1), _fingerprint_cache[r1][(None, False)])
r1 = Request("http://www.example.com/members/offers.html")
r2 = Request("http://www.example.com/members/offers.html")
@ -42,6 +42,13 @@ class UtilsRequestTest(unittest.TestCase):
self.assertEqual(request_fingerprint(r3, include_headers=['accept-language', 'sessionid']),
request_fingerprint(r3, include_headers=['SESSIONID', 'Accept-Language']))
r1 = Request("http://www.example.com/test.html")
r2 = Request("http://www.example.com/test.html#fragment")
self.assertEqual(request_fingerprint(r1), request_fingerprint(r2))
self.assertEqual(request_fingerprint(r1), request_fingerprint(r1, keep_fragments=True))
self.assertNotEqual(request_fingerprint(r2), request_fingerprint(r2, keep_fragments=True))
self.assertNotEqual(request_fingerprint(r1), request_fingerprint(r2, keep_fragments=True))
r1 = Request("http://www.example.com")
r2 = Request("http://www.example.com", method='POST')
r3 = Request("http://www.example.com", method='POST', body=b'request body')