mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-23 04:03:55 +00:00
Add a keep_fragments parameter to the request_fingerprint function (#4104)
This commit is contained in:
parent
66cbceeb0a
commit
6d6da78eda
@ -16,7 +16,7 @@ from scrapy.utils.httpobj import urlparse_cached
|
||||
|
||||
|
||||
_fingerprint_cache = weakref.WeakKeyDictionary()
|
||||
def request_fingerprint(request, include_headers=None):
|
||||
def request_fingerprint(request, include_headers=None, keep_fragments=False):
|
||||
"""
|
||||
Return the request fingerprint.
|
||||
|
||||
@ -42,15 +42,21 @@ def request_fingerprint(request, include_headers=None):
|
||||
the fingerprint. If you want to include specific headers use the
|
||||
include_headers argument, which is a list of Request headers to include.
|
||||
|
||||
Also, servers usually ignore fragments in URLs when handling requests,
|
||||
so they are also ignored by default when calculating the fingerprint.
|
||||
If you want to include them, set the keep_fragments argument to True
|
||||
(for instance when handling requests with a headless browser).
|
||||
|
||||
"""
|
||||
if include_headers:
|
||||
include_headers = tuple(to_bytes(h.lower())
|
||||
for h in sorted(include_headers))
|
||||
cache = _fingerprint_cache.setdefault(request, {})
|
||||
if include_headers not in cache:
|
||||
cache_key = (include_headers, keep_fragments)
|
||||
if cache_key not in cache:
|
||||
fp = hashlib.sha1()
|
||||
fp.update(to_bytes(request.method))
|
||||
fp.update(to_bytes(canonicalize_url(request.url)))
|
||||
fp.update(to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments)))
|
||||
fp.update(request.body or b'')
|
||||
if include_headers:
|
||||
for hdr in include_headers:
|
||||
@ -58,8 +64,8 @@ def request_fingerprint(request, include_headers=None):
|
||||
fp.update(hdr)
|
||||
for v in request.headers.getlist(hdr):
|
||||
fp.update(v)
|
||||
cache[include_headers] = fp.hexdigest()
|
||||
return cache[include_headers]
|
||||
cache[cache_key] = fp.hexdigest()
|
||||
return cache[cache_key]
|
||||
|
||||
|
||||
def request_authenticate(request, username, password):
|
||||
|
@ -17,7 +17,7 @@ class UtilsRequestTest(unittest.TestCase):
|
||||
self.assertNotEqual(request_fingerprint(r1), request_fingerprint(r2))
|
||||
|
||||
# make sure caching is working
|
||||
self.assertEqual(request_fingerprint(r1), _fingerprint_cache[r1][None])
|
||||
self.assertEqual(request_fingerprint(r1), _fingerprint_cache[r1][(None, False)])
|
||||
|
||||
r1 = Request("http://www.example.com/members/offers.html")
|
||||
r2 = Request("http://www.example.com/members/offers.html")
|
||||
@ -42,6 +42,13 @@ class UtilsRequestTest(unittest.TestCase):
|
||||
self.assertEqual(request_fingerprint(r3, include_headers=['accept-language', 'sessionid']),
|
||||
request_fingerprint(r3, include_headers=['SESSIONID', 'Accept-Language']))
|
||||
|
||||
r1 = Request("http://www.example.com/test.html")
|
||||
r2 = Request("http://www.example.com/test.html#fragment")
|
||||
self.assertEqual(request_fingerprint(r1), request_fingerprint(r2))
|
||||
self.assertEqual(request_fingerprint(r1), request_fingerprint(r1, keep_fragments=True))
|
||||
self.assertNotEqual(request_fingerprint(r2), request_fingerprint(r2, keep_fragments=True))
|
||||
self.assertNotEqual(request_fingerprint(r1), request_fingerprint(r2, keep_fragments=True))
|
||||
|
||||
r1 = Request("http://www.example.com")
|
||||
r2 = Request("http://www.example.com", method='POST')
|
||||
r3 = Request("http://www.example.com", method='POST', body=b'request body')
|
||||
|
Loading…
x
Reference in New Issue
Block a user