diff --git a/scrapy/trunk/scrapy/conf/core_settings.py b/scrapy/trunk/scrapy/conf/core_settings.py
index 74501c26c..70da5c802 100644
--- a/scrapy/trunk/scrapy/conf/core_settings.py
+++ b/scrapy/trunk/scrapy/conf/core_settings.py
@@ -11,6 +11,7 @@ USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
 DOWNLOAD_TIMEOUT = 180   # 3 mins
 CONCURRENT_DOMAINS = 8   # number of domains to scrape in parallel
 REQUESTS_PER_DOMAIN = 8  # max simultaneous requests per domain
+CACHE2_EXPIRATION_SECS = 48 * 60 * 60  # seconds a cached response remains valid

 LOG_ENABLED = True
 # LOGLEVEL = 'DEBUG' # default loglevel
diff --git a/scrapy/trunk/scrapy/contrib/downloadermiddleware/cache.py b/scrapy/trunk/scrapy/contrib/downloadermiddleware/cache.py
index 773d3d99d..66f8a99ae 100644
--- a/scrapy/trunk/scrapy/contrib/downloadermiddleware/cache.py
+++ b/scrapy/trunk/scrapy/contrib/downloadermiddleware/cache.py
@@ -111,7 +111,12 @@ class Cache(object):

     def is_cached(self, domain, key):
         requestpath = self.requestpath(domain, key)
-        return os.path.exists(requestpath)
+        if os.path.exists(requestpath):
+            with open(os.path.join(requestpath, 'meta_data')) as f:
+                metadata = eval(f.read())
+            return datetime.datetime.now() <= metadata['timestamp'] + datetime.timedelta(seconds=settings.getint('CACHE2_EXPIRATION_SECS'))
+        else:
+            return False

     def retrieve_response(self, domain, key):
         """
diff --git a/scrapy/trunk/scrapy/http/request.py b/scrapy/trunk/scrapy/http/request.py
index 2c0623ed1..84d7377ad 100644
--- a/scrapy/trunk/scrapy/http/request.py
+++ b/scrapy/trunk/scrapy/http/request.py
@@ -1,6 +1,6 @@
 import urllib
 import warnings
-from sha import sha
+from hashlib import sha1
 from copy import copy
 from base64 import urlsafe_b64encode

@@ -149,7 +149,8 @@ class Request(object):
         headers = dict([(k, v) for k, v in self.headers.items() if k.lower() not in keys])

         # fingerprint generation
-        fp = sha(canonicalize(self.url))
+        fp = sha1()
+        fp.update(canonicalize(self.url))
         fp.update(self.method)
         if self.body and self.method in ['POST', 'PUT']:
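
The cache.py hunk makes `is_cached` treat an entry as a miss once it is older than `CACHE2_EXPIRATION_SECS`, by comparing the stored timestamp plus the expiration window against the current time. A minimal, self-contained sketch of that freshness test; `is_fresh` is a hypothetical helper standing in for the middleware method, with the metadata dict built in memory and the setting inlined rather than read from Scrapy's `settings` object:

```python
import datetime

CACHE2_EXPIRATION_SECS = 48 * 60 * 60  # mirrors the new core setting

def is_fresh(metadata, expiration_secs=CACHE2_EXPIRATION_SECS):
    # Hypothetical helper: a cached entry is fresh while now() has not
    # passed its write timestamp plus the expiration window.
    expires_at = metadata['timestamp'] + datetime.timedelta(seconds=expiration_secs)
    return datetime.datetime.now() <= expires_at

# Cached one hour ago: still within the 48-hour window.
print(is_fresh({'timestamp': datetime.datetime.now() - datetime.timedelta(hours=1)}))  # True
# Cached three days ago: expired.
print(is_fresh({'timestamp': datetime.datetime.now() - datetime.timedelta(days=3)}))   # False
```

Note that the patch parses the `meta_data` file with `eval`. Since the stored dict apparently contains a `datetime` value (its `timestamp` is added directly to a `timedelta`), a stricter parser such as `ast.literal_eval` would reject it, but `eval` does require that the cache directory be trusted.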
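
The request.py hunks migrate fingerprinting from the long-deprecated `sha` module (removed in Python 3) to `hashlib`. A small sketch of the before/after, using made-up `url` and `method` values in place of the request attributes and a plain string where `canonicalize` would run:

```python
from hashlib import sha1

# Hypothetical stand-ins for canonicalize(self.url) and self.method.
url = 'http://www.example.com/index.html'
method = 'GET'

# Old style: fp = sha(url), seeding the hash in the constructor.
# New style: construct empty, then feed every component through update().
fp = sha1()
fp.update(url.encode('ascii'))     # hashlib wants bytes on Python 3
fp.update(method.encode('ascii'))
print(fp.hexdigest())

# The incremental form hashes the same concatenated byte stream as a
# one-shot call, so the digests are identical.
assert sha1(url.encode('ascii') + method.encode('ascii')).hexdigest() == fp.hexdigest()
```

`hashlib.sha1()` also accepts initial data, so `fp = sha1(canonicalize(self.url))` would have worked as a one-line swap; splitting it into a constructor call plus `update()` simply routes every fingerprint component through the same method.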