mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-23 12:04:00 +00:00
Merge pull request #1020 from jojje/gzip_http_cache
[MRG+1] add gzip compression to filesystem http cache backend
This commit is contained in:
commit
934584a355
@ -563,6 +563,18 @@ Default: ``'scrapy.contrib.httpcache.DummyPolicy'``
|
||||
|
||||
The class which implements the cache policy.
|
||||
|
||||
.. setting:: HTTPCACHE_GZIP
|
||||
|
||||
HTTPCACHE_GZIP
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
.. versionadded:: 0.25
|
||||
|
||||
Default: ``False``
|
||||
|
||||
If enabled, all cached data is compressed with gzip.
|
||||
This setting is specific to the Filesystem backend.
|
||||
|
||||
|
||||
HttpCompressionMiddleware
|
||||
-------------------------
|
||||
|
@ -1,5 +1,6 @@
|
||||
from __future__ import print_function
|
||||
import os
|
||||
import gzip
|
||||
from six.moves import cPickle as pickle
|
||||
from importlib import import_module
|
||||
from time import time
|
||||
@ -220,6 +221,8 @@ class FilesystemCacheStorage(object):
|
||||
def __init__(self, settings):
    """Configure the storage from ``settings``.

    Reads ``HTTPCACHE_DIR``, ``HTTPCACHE_EXPIRATION_SECS`` and
    ``HTTPCACHE_GZIP``, and selects the callable used to open cache
    files: ``gzip.open`` when gzip is enabled, the builtin ``open``
    otherwise.
    """
    cache_dir_setting = settings['HTTPCACHE_DIR']
    self.cachedir = data_path(cache_dir_setting)
    self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
    self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
    # Both callables accept (path, mode) and work as context managers,
    # so the read/write code can use self._open without branching.
    if self.use_gzip:
        self._open = gzip.open
    else:
        self._open = open
|
||||
|
||||
def open_spider(self, spider):
    """Do nothing when a spider is opened; this storage needs no setup here."""
    return None
|
||||
@ -233,9 +236,9 @@ class FilesystemCacheStorage(object):
|
||||
if metadata is None:
|
||||
return # not cached
|
||||
rpath = self._get_request_path(spider, request)
|
||||
with open(os.path.join(rpath, 'response_body'), 'rb') as f:
|
||||
with self._open(os.path.join(rpath, 'response_body'), 'rb') as f:
|
||||
body = f.read()
|
||||
with open(os.path.join(rpath, 'response_headers'), 'rb') as f:
|
||||
with self._open(os.path.join(rpath, 'response_headers'), 'rb') as f:
|
||||
rawheaders = f.read()
|
||||
url = metadata.get('response_url')
|
||||
status = metadata['status']
|
||||
@ -256,17 +259,17 @@ class FilesystemCacheStorage(object):
|
||||
'response_url': response.url,
|
||||
'timestamp': time(),
|
||||
}
|
||||
with open(os.path.join(rpath, 'meta'), 'wb') as f:
|
||||
with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
|
||||
f.write(repr(metadata))
|
||||
with open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
|
||||
with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
|
||||
pickle.dump(metadata, f, protocol=2)
|
||||
with open(os.path.join(rpath, 'response_headers'), 'wb') as f:
|
||||
with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
|
||||
f.write(headers_dict_to_raw(response.headers))
|
||||
with open(os.path.join(rpath, 'response_body'), 'wb') as f:
|
||||
with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
|
||||
f.write(response.body)
|
||||
with open(os.path.join(rpath, 'request_headers'), 'wb') as f:
|
||||
with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
|
||||
f.write(headers_dict_to_raw(request.headers))
|
||||
with open(os.path.join(rpath, 'request_body'), 'wb') as f:
|
||||
with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
|
||||
f.write(request.body)
|
||||
|
||||
def _get_request_path(self, spider, request):
|
||||
@ -281,7 +284,7 @@ class FilesystemCacheStorage(object):
|
||||
mtime = os.stat(rpath).st_mtime
|
||||
if 0 < self.expiration_secs < time() - mtime:
|
||||
return # expired
|
||||
with open(metapath, 'rb') as f:
|
||||
with self._open(metapath, 'rb') as f:
|
||||
return pickle.load(f)
|
||||
|
||||
|
||||
|
@ -154,6 +154,7 @@ HTTPCACHE_IGNORE_HTTP_CODES = []
|
||||
HTTPCACHE_IGNORE_SCHEMES = ['file']
|
||||
HTTPCACHE_DBM_MODULE = 'anydbm'
|
||||
HTTPCACHE_POLICY = 'scrapy.contrib.httpcache.DummyPolicy'
|
||||
HTTPCACHE_GZIP = False
|
||||
|
||||
ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'
|
||||
|
||||
|
@ -148,6 +148,11 @@ class FilesystemStorageTest(DefaultStorageTest):
|
||||
|
||||
storage_class = 'scrapy.contrib.httpcache.FilesystemCacheStorage'
|
||||
|
||||
class FilesystemStorageGzipTest(FilesystemStorageTest):
    """Variant of ``FilesystemStorageTest`` that defaults ``HTTPCACHE_GZIP`` to True."""

    def _get_settings(self, **new_settings):
        """Return the inherited settings with ``HTTPCACHE_GZIP`` defaulting to True.

        A caller-supplied ``HTTPCACHE_GZIP`` value is kept as-is, because
        ``setdefault`` only fills in a missing key.
        """
        new_settings.setdefault('HTTPCACHE_GZIP', True)
        # super() must name *this* class so the lookup starts right after it
        # in the MRO. Naming the parent class instead would skip any
        # _get_settings the parent itself defines.
        return super(FilesystemStorageGzipTest, self)._get_settings(**new_settings)
|
||||
|
||||
class LeveldbStorageTest(DefaultStorageTest):
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user