cache: read metadata only when when looking for cached items. refs #61

thanks Patrick Mezard for patch. --HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40849
2025-02-26 18:23:57 +00:00 · 2009-02-12 08:00:07 +00:00 · 2009-02-12 08:00:07 +00:00 · 2b14251510
commit 2b14251510
parent d68646422d
1 changed files with 20 additions and 18 deletions
--- a/scrapy/trunk/scrapy/contrib/downloadermiddleware/cache.py
+++ b/scrapy/trunk/scrapy/contrib/downloadermiddleware/cache.py
@ -1,5 +1,6 @@
 from __future__ import with_statement

+import errno
 import os
 import hashlib
 import datetime
@ -111,36 +112,37 @@ class Cache(object):
                if not os.path.exists(linkpath):
                    os.makedirs(linkpath)

-    def is_cached(self, domain, key):
+    def read_meta(self, domain, key):
+        """Return the metadata dictionary (possibly empty) if the entry is
+        cached, None otherwise.
+        """
        requestpath = self.requestpath(domain, key)
-        if os.path.exists(requestpath):
+        try:
            with open(os.path.join(requestpath, 'pickled_meta'), 'r') as f:
                metadata = pickle.load(f)
+        except IOError, e:
+            if e.errno != errno.ENOENT:
+                raise
+            return None
        expiration_secs = settings.getint('CACHE2_EXPIRATION_SECS')
        if expiration_secs >= 0:
-                if datetime.datetime.utcnow() <= metadata['timestamp'] + datetime.timedelta(seconds=expiration_secs):
-                    return True
-                else:
+            expiration_date = metadata['timestamp'] + datetime.timedelta(seconds=expiration_secs)
+            if datetime.datetime.utcnow() > expiration_date:
                log.msg('dropping old cached response from %s' % metadata['timestamp'], level=log.DEBUG)
-                    return False
-            else:
-                # disabled cache expiration
-                return True
-        else:
-            return False
+                return None
+        return metadata

    def retrieve_response(self, domain, key):
        """
        Return response dictionary if request has correspondent cache record;
        return None if not.
        """
-        if not self.is_cached(domain, key):
+        metadata = self.read_meta(domain, key)
+        if metadata is None:
            return None # not cached

        requestpath = self.requestpath(domain, key)
-        metadata = responsebody = responseheaders = None
-        with open(os.path.join(requestpath, 'pickled_meta'), 'r') as f:
-            metadata = pickle.load(f)
+        responsebody = responseheaders = None
        with open(os.path.join(requestpath, 'response_body')) as f:
            responsebody = f.read()
        with open(os.path.join(requestpath, 'response_headers')) as f: