1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-22 00:53:24 +00:00

implemented CACHE2_EXPIRATION_SECS and migrated sha to hashlib

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%4080
This commit is contained in:
samus_ 2008-07-21 13:23:23 +00:00
parent b45d87d0fe
commit d2121141a3
3 changed files with 10 additions and 3 deletions

View File

@ -11,6 +11,7 @@ USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
DOWNLOAD_TIMEOUT = 180 # 3mins
CONCURRENT_DOMAINS = 8 # number of domains to scrape in parallel
REQUESTS_PER_DOMAIN = 8 # max simultaneous requests per domain
CACHE2_EXPIRATION_SECS = 48 * 60 * 60 # seconds while cached response is still valid
LOG_ENABLED = True # presumably toggles logging globally — confirm against log setup code
LOGLEVEL = 'DEBUG' # default loglevel

View File

@ -111,7 +111,12 @@ class Cache(object):
def is_cached(self, domain, key):
    """Return True if a cached response for (domain, key) exists and has
    not yet expired.

    A cache entry is considered valid while its stored ``timestamp`` plus
    ``CACHE2_EXPIRATION_SECS`` is still in the future; missing entries
    report False.
    """
    requestpath = self.requestpath(domain, key)
    # No directory on disk means the response was never cached.
    if not os.path.exists(requestpath):
        return False
    # SECURITY NOTE(review): eval() on file contents — acceptable only
    # because 'meta_data' is written by this same local cache; never
    # point the cache at untrusted storage.
    with open(os.path.join(requestpath, 'meta_data')) as f:
        metadata = eval(f.read())
    expiration = datetime.timedelta(seconds=settings.getint('CACHE2_EXPIRATION_SECS'))
    return datetime.datetime.now() <= metadata['timestamp'] + expiration
def retrieve_response(self, domain, key):
"""

View File

@ -1,6 +1,6 @@
import urllib
import warnings
from sha import sha
from hashlib import sha1
from copy import copy
from base64 import urlsafe_b64encode
@ -149,7 +149,8 @@ class Request(object):
headers = dict([(k, v) for k, v in self.headers.items() if k.lower() not in keys])
# fingerprint generation
fp = sha(canonicalize(self.url))
fp = sha1()
fp.update(canonicalize(self.url))
fp.update(self.method)
if self.body and self.method in ['POST', 'PUT']: