
Merge pull request #1020 from jojje/gzip_http_cache

[MRG+1] add gzip compression to filesystem http cache backend
Pablo Hoffman 2015-03-17 14:32:06 -03:00
commit 934584a355
4 changed files with 30 additions and 9 deletions

View File

@@ -563,6 +563,18 @@ Default: ``'scrapy.contrib.httpcache.DummyPolicy'``
 
 The class which implements the cache policy.
 
+.. setting:: HTTPCACHE_GZIP
+
+HTTPCACHE_GZIP
+^^^^^^^^^^^^^^
+
+.. versionadded:: 0.25
+
+Default: ``False``
+
+If enabled, will compress all cached data with gzip.
+This setting is specific to the Filesystem backend.
+
 HttpCompressionMiddleware
 -------------------------
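
For readers trying the feature out, here is a minimal settings.py sketch (not part of the PR) that turns the HTTP cache on with gzip'd entries. HTTPCACHE_ENABLED, HTTPCACHE_STORAGE and HTTPCACHE_DIR are existing Scrapy settings; the storage class path is the one exercised by the tests in this changeset, and the directory name is only illustrative:

    # settings.py (illustrative values)
    HTTPCACHE_ENABLED = True
    HTTPCACHE_STORAGE = 'scrapy.contrib.httpcache.FilesystemCacheStorage'
    HTTPCACHE_DIR = 'httpcache'   # illustrative; resolved relative to the project data dir
    HTTPCACHE_GZIP = True         # the setting added by this pull request

The same can be done ad hoc from the command line, e.g. scrapy crawl somespider -s HTTPCACHE_ENABLED=1 -s HTTPCACHE_GZIP=1.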

View File

@@ -1,5 +1,6 @@
 from __future__ import print_function
 import os
+import gzip
 from six.moves import cPickle as pickle
 from importlib import import_module
 from time import time
@@ -220,6 +221,8 @@ class FilesystemCacheStorage(object):
     def __init__(self, settings):
         self.cachedir = data_path(settings['HTTPCACHE_DIR'])
         self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
+        self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
+        self._open = gzip.open if self.use_gzip else open
 
     def open_spider(self, spider):
         pass
@@ -233,9 +236,9 @@ class FilesystemCacheStorage(object):
         if metadata is None:
             return  # not cached
         rpath = self._get_request_path(spider, request)
-        with open(os.path.join(rpath, 'response_body'), 'rb') as f:
+        with self._open(os.path.join(rpath, 'response_body'), 'rb') as f:
             body = f.read()
-        with open(os.path.join(rpath, 'response_headers'), 'rb') as f:
+        with self._open(os.path.join(rpath, 'response_headers'), 'rb') as f:
             rawheaders = f.read()
         url = metadata.get('response_url')
         status = metadata['status']
@@ -256,17 +259,17 @@ class FilesystemCacheStorage(object):
             'response_url': response.url,
             'timestamp': time(),
         }
-        with open(os.path.join(rpath, 'meta'), 'wb') as f:
+        with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
             f.write(repr(metadata))
-        with open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
+        with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
             pickle.dump(metadata, f, protocol=2)
-        with open(os.path.join(rpath, 'response_headers'), 'wb') as f:
+        with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
             f.write(headers_dict_to_raw(response.headers))
-        with open(os.path.join(rpath, 'response_body'), 'wb') as f:
+        with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
             f.write(response.body)
-        with open(os.path.join(rpath, 'request_headers'), 'wb') as f:
+        with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
             f.write(headers_dict_to_raw(request.headers))
-        with open(os.path.join(rpath, 'request_body'), 'wb') as f:
+        with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
             f.write(request.body)
 
     def _get_request_path(self, spider, request):
@@ -281,7 +284,7 @@ class FilesystemCacheStorage(object):
         mtime = os.stat(rpath).st_mtime
         if 0 < self.expiration_secs < time() - mtime:
             return  # expired
-        with open(metapath, 'rb') as f:
+        with self._open(metapath, 'rb') as f:
             return pickle.load(f)

View File

@@ -154,6 +154,7 @@ HTTPCACHE_IGNORE_HTTP_CODES = []
 HTTPCACHE_IGNORE_SCHEMES = ['file']
 HTTPCACHE_DBM_MODULE = 'anydbm'
 HTTPCACHE_POLICY = 'scrapy.contrib.httpcache.DummyPolicy'
+HTTPCACHE_GZIP = False
 ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'

View File

@@ -148,6 +148,11 @@ class FilesystemStorageTest(DefaultStorageTest):
 
     storage_class = 'scrapy.contrib.httpcache.FilesystemCacheStorage'
 
+class FilesystemStorageGzipTest(FilesystemStorageTest):
+
+    def _get_settings(self, **new_settings):
+        new_settings.setdefault('HTTPCACHE_GZIP', True)
+        return super(FilesystemStorageTest, self)._get_settings(**new_settings)
+
 
 class LeveldbStorageTest(DefaultStorageTest):
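
The new test class reuses the entire FilesystemStorageTest suite with HTTPCACHE_GZIP forced on, so every storage assertion also runs against gzip'd cache files. For a quick manual check outside the test harness, a sketch along these lines should work (not part of the PR; module paths follow the pre-1.0 layout used in this diff, and the temporary directory, spider name and URL are illustrative):

    import tempfile
    from scrapy.settings import Settings
    from scrapy.spider import Spider
    from scrapy.http import Request, Response
    from scrapy.contrib.httpcache import FilesystemCacheStorage

    settings = Settings({'HTTPCACHE_DIR': tempfile.mkdtemp(), 'HTTPCACHE_GZIP': True})
    storage = FilesystemCacheStorage(settings)   # picks gzip.open because HTTPCACHE_GZIP is set
    spider = Spider(name='example')
    request = Request('http://example.com/')
    response = Response('http://example.com/', body=b'hello')

    storage.open_spider(spider)
    storage.store_response(spider, request, response)    # writes gzip-compressed cache files
    cached = storage.retrieve_response(spider, request)  # transparently decompresses them
    assert cached.body == b'hello'
    storage.close_spider(spider)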