1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 08:24:21 +00:00

switched request_fingerprint to use WeakKeyDictionary for caching (instead of Request.cache)

This commit is contained in:
Pablo Hoffman 2009-08-24 08:45:23 -03:00
parent 0186c6937a
commit 49e11d34c2
2 changed files with 18 additions and 20 deletions

View File

@ -1,6 +1,7 @@
import unittest
from scrapy.http import Request
from scrapy.utils.request import request_fingerprint, request_authenticate, request_httprepr
from scrapy.utils.request import request_fingerprint, _fingerprint_cache, \
request_authenticate, request_httprepr
class UtilsRequestTest(unittest.TestCase):
@ -15,7 +16,7 @@ class UtilsRequestTest(unittest.TestCase):
self.assertNotEqual(request_fingerprint(r1), request_fingerprint(r2))
# make sure caching is working
self.assertEqual(request_fingerprint(r1), r1.cache['fingerprint'])
self.assertEqual(request_fingerprint(r1), _fingerprint_cache[r1][None])
r1 = Request("http://www.example.com/members/offers.html")
r2 = Request("http://www.example.com/members/offers.html")

View File

@ -4,12 +4,15 @@ scrapy.http.Request objects
"""
import hashlib
import weakref
from base64 import urlsafe_b64encode
from scrapy.utils.url import canonicalize_url
from scrapy.utils.httpobj import urlparse_cached
def request_fingerprint(request, include_headers=()):
_fingerprint_cache = weakref.WeakKeyDictionary()
def request_fingerprint(request, include_headers=None):
"""
Return the request fingerprint.
@ -36,28 +39,22 @@ def request_fingerprint(request, include_headers=()):
include_headers argument, which is a list of Request headers to include.
"""
if include_headers:
include_headers = [h.lower() for h in sorted(include_headers)]
cachekey = 'fingerprint' + '_'.join(include_headers)
else:
cachekey = 'fingerprint'
try:
return request.cache[cachekey]
except KeyError:
include_headers = tuple([h.lower() for h in sorted(include_headers)])
cache = _fingerprint_cache.setdefault(request, {})
if include_headers not in cache:
fp = hashlib.sha1()
fp.update(request.method)
fp.update(canonicalize_url(request.url))
fp.update(request.body or '')
for hdr in include_headers:
if hdr in request.headers:
fp.update(hdr)
for v in request.headers.getlist(hdr):
fp.update(v)
fphash = fp.hexdigest()
request.cache[cachekey] = fphash
return fphash
if include_headers:
for hdr in include_headers:
if hdr in request.headers:
fp.update(hdr)
for v in request.headers.getlist(hdr):
fp.update(v)
cache[include_headers] = fp.hexdigest()
return cache[include_headers]
def request_authenticate(request, username, password):
"""Autenticate the given request (in place) using the HTTP basic access