
Merge branch 'http-cache-middleware'

Conflicts:
	scrapy/contrib/downloadermiddleware/httpcache.py
	scrapy/contrib/httpcache.py
	scrapy/tests/test_downloadermiddleware_httpcache.py
Daniel Graña 2013-01-08 17:34:48 -02:00
commit d8a760bf57
6 changed files with 533 additions and 67 deletions

.gitignore (vendored)

@@ -4,3 +4,4 @@ dropin.cache
 docs/build
 *egg-info
 .tox
+venv

docs/topics/downloader-middleware.rst

@@ -285,6 +285,7 @@ HttpAuthMiddleware
.. _Basic access authentication: http://en.wikipedia.org/wiki/Basic_access_authentication
HttpCacheMiddleware
-------------------
@@ -294,44 +295,88 @@ HttpCacheMiddleware

 .. class:: HttpCacheMiddleware

    This middleware provides a low-level cache to all HTTP requests and responses.
-   Every request and its corresponding response are cached. When the same
-   request is seen again, the response is returned without transferring
-   anything from the Internet.
-
-   The HTTP cache is useful for testing spiders faster (without having to wait for
-   downloads every time) and for trying your spider offline, when an Internet
-   connection is not available.
+   It has to be combined with a cache storage backend as well as a cache policy.

-   Scrapy ships with two HTTP cache storage backends:
+   Scrapy ships with two storage backends for the HTTP cache middleware:

-   * :ref:`httpcache-storage-dbm`
-   * :ref:`httpcache-storage-fs`
+   * :ref:`httpcache-dbm-backend`
+   * :ref:`httpcache-fs-backend`

-   You can change the HTTP cache storage backend with the :setting:`HTTPCACHE_STORAGE`
-   setting. Or you can also implement your own storage backend.
+   You can change the storage backend with the :setting:`HTTPCACHE_STORAGE`
+   setting, or you can implement your own backend.
+
+   Scrapy ships with two HTTP cache policies:
+
+   * :ref:`httpcache-policy-rfc2616`
+   * :ref:`httpcache-policy-dummy`
+
+   You can change the HTTP cache policy with the :setting:`HTTPCACHE_POLICY`
+   setting, or you can implement your own policy.
+
+.. _httpcache-policy-rfc2616:
+
+RFC2616 policy (default)
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+This policy provides an RFC2616-compliant HTTP cache, i.e. with HTTP
+Cache-Control awareness. It is aimed at production use in continuous
+runs, to avoid downloading unmodified data (saving bandwidth and
+speeding up crawls).
+
+In order to use this policy, set:
+
+* :setting:`HTTPCACHE_POLICY` to ``scrapy.contrib.httpcache.RFC2616Policy``
+
+This is the default cache policy.
+
+.. _httpcache-policy-dummy:
+
+Dummy policy
+~~~~~~~~~~~~
+
+This policy has no awareness of any HTTP Cache-Control directives.
+Every request and its corresponding response are cached. When the same
+request is seen again, the response is returned without transferring
+anything from the Internet.
+
+The Dummy policy is useful for testing spiders faster (without having
+to wait for downloads every time) and for trying your spider offline,
+when an Internet connection is not available. The goal is to be able to
+"replay" a spider run *exactly as it ran before*.
+
+In order to use this policy, set:
+
+* :setting:`HTTPCACHE_POLICY` to ``scrapy.contrib.httpcache.DummyPolicy``

-.. _httpcache-storage-dbm:
+.. _httpcache-dbm-backend:

 DBM storage backend (default)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. versionadded:: 0.13

-A DBM_ storage backend is available for the HTTP cache middleware. To use it
-(note: it is the default storage backend) set :setting:`HTTPCACHE_STORAGE`
-to ``scrapy.contrib.httpcache.DbmCacheStorage``.
+A DBM_ storage backend is available for the HTTP cache middleware.

 By default, it uses the anydbm_ module, but you can change it with the
 :setting:`HTTPCACHE_DBM_MODULE` setting.

+In order to use this storage backend, set:
+
+* :setting:`HTTPCACHE_STORAGE` to ``scrapy.contrib.httpcache.DbmCacheStorage``
+
-.. _httpcache-storage-fs:
+.. _httpcache-fs-backend:

-File system backend
-~~~~~~~~~~~~~~~~~~~
+Filesystem storage backend
+~~~~~~~~~~~~~~~~~~~~~~~~~~

 A file system storage backend is also available for the HTTP cache middleware.
-To use it (instead of the default DBM_ storage backend) set :setting:`HTTPCACHE_STORAGE`
-to ``scrapy.contrib.downloadermiddleware.httpcache.FilesystemCacheStorage``.
+
+In order to use this storage backend, set:
+
+* :setting:`HTTPCACHE_STORAGE` to ``scrapy.contrib.httpcache.FilesystemCacheStorage``

 Each request/response pair is stored in a different directory containing
 the following files:
@@ -352,6 +397,7 @@ inefficient in many file systems). An example directory could be::
/path/to/cache/dir/example.com/72/72811f648e718090f041317756c03adb0ada46c7
HTTPCache middleware settings
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -450,6 +496,17 @@ Default: ``'anydbm'``
The database module to use in the :ref:`DBM storage backend
<httpcache-dbm-backend>`. This setting is specific to the DBM backend.
.. setting:: HTTPCACHE_POLICY
HTTPCACHE_POLICY
^^^^^^^^^^^^^^^^
.. versionadded:: 0.18
Default: ``'scrapy.contrib.httpcache.RFC2616Policy'``
The class which implements the cache policy.
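As a quick orientation, a minimal project configuration combining the settings above might look like this (a sketch, using the module paths as they stand in this commit)::

    # settings.py -- minimal HTTP cache setup
    HTTPCACHE_ENABLED = True
    HTTPCACHE_EXPIRATION_SECS = 0  # cached responses never expire
    HTTPCACHE_STORAGE = 'scrapy.contrib.httpcache.DbmCacheStorage'  # default
    HTTPCACHE_POLICY = 'scrapy.contrib.httpcache.RFC2616Policy'     # default
    # or, to "replay" a previous run exactly:
    #HTTPCACHE_POLICY = 'scrapy.contrib.httpcache.DummyPolicy'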
HttpCompressionMiddleware
-------------------------

scrapy/contrib/downloadermiddleware/httpcache.py

@@ -1,7 +1,11 @@
from time import time
from email.utils import formatdate
from weakref import WeakKeyDictionary
from scrapy import signals
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.misc import load_object
from scrapy.contrib.httpcache import rfc1123_to_epoch, parse_cachecontrol
class HttpCacheMiddleware(object):
@@ -9,10 +13,9 @@ class HttpCacheMiddleware(object):

     def __init__(self, settings, stats):
         if not settings.getbool('HTTPCACHE_ENABLED'):
             raise NotConfigured
+        self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
         self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
         self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
-        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
-        self.ignore_http_codes = map(int, settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES'))
         self.stats = stats
@classmethod
@@ -29,36 +32,65 @@ class HttpCacheMiddleware(object):
         self.storage.close_spider(spider)

     def process_request(self, request, spider):
-        if not self.is_cacheable(request):
+        # Skip uncacheable requests
+        if not self.policy.should_cache_request(request):
+            request.meta['_dont_cache'] = True  # flag as uncacheable
             return
-        response = self.storage.retrieve_response(spider, request)
-        if response and self.is_cacheable_response(response):
-            response.flags.append('cached')
-            self.stats.inc_value('httpcache/hits', spider=spider)
-            return response
-        self.stats.inc_value('httpcache/misses', spider=spider)
-        if self.ignore_missing:
-            raise IgnoreRequest("Ignored request not in cache: %s" % request)
+
+        # Look for cached response and check if expired
+        cachedresponse = self.storage.retrieve_response(spider, request)
+        if cachedresponse is None:
+            self.stats.inc_value('httpcache/miss', spider=spider)
+            if self.ignore_missing:
+                self.stats.inc_value('httpcache/ignore', spider=spider)
+                raise IgnoreRequest("Ignored request not in cache: %s" % request)
+            return  # first time request
+
+        # Return cached response only if not expired
+        cachedresponse.flags.append('cached')
+        if self.policy.is_cached_response_fresh(cachedresponse, request):
+            self.stats.inc_value('httpcache/hit', spider=spider)
+            return cachedresponse
+
+        # Keep a reference to cached response to avoid a second cache lookup on
+        # process_response hook
+        request.meta['cached_response'] = cachedresponse

     def process_response(self, request, response, spider):
-        if (self.is_cacheable(request)
-            and self.is_cacheable_response(response)
-            and 'cached' not in response.flags):
-            self.storage.store_response(spider, request, response)
-            self.stats.inc_value('httpcache/store', spider=spider)
+        # Skip cached responses and uncacheable requests
+        if 'cached' in response.flags or '_dont_cache' in request.meta:
+            request.meta.pop('_dont_cache', None)
+            return response
+
+        # RFC2616 requires origin server to set Date header,
+        # http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18
+        if 'Date' not in response.headers:
+            response.headers['Date'] = formatdate(usegmt=1)
+
+        # Do not validate first-hand responses
+        cachedresponse = request.meta.pop('cached_response', None)
+        if cachedresponse is None:
+            self.stats.inc_value('httpcache/firsthand', spider=spider)
+            self._cache_response(spider, response, request, cachedresponse)
+            return response
+
+        if self.policy.is_cached_response_valid(cachedresponse, response, request):
+            self.stats.inc_value('httpcache/revalidate', spider=spider)
+            return cachedresponse
+
+        self.stats.inc_value('httpcache/invalidate', spider=spider)
+        self._cache_response(spider, response, request, cachedresponse)
         return response

-    def is_cacheable_response(self, response):
-        return response.status not in self.ignore_http_codes
-
-    def is_cacheable(self, request):
-        return urlparse_cached(request).scheme not in self.ignore_schemes
+    def _cache_response(self, spider, response, request, cachedresponse):
+        if self.policy.should_cache_response(response, request):
+            self.stats.inc_value('httpcache/store', spider=spider)
+            self.storage.store_response(spider, request, response)
+        else:
+            self.stats.inc_value('httpcache/uncacheable', spider=spider)
from scrapy.contrib.httpcache import FilesystemCacheStorage as _FilesystemCacheStorage
class FilesystemCacheStorage(_FilesystemCacheStorage):
def __init__(self, *args, **kwargs):
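The middleware above calls exactly four methods on its storage object (open_spider, close_spider, retrieve_response, store_response), so a custom HTTPCACHE_STORAGE backend can be very small. A minimal in-memory sketch (hypothetical example, not part of this commit):

    from scrapy.utils.request import request_fingerprint

    class MemoryCacheStorage(object):
        """Keep the whole cache in a dict, keyed by request fingerprint."""

        def __init__(self, settings):
            self.cache = {}

        def open_spider(self, spider):
            pass

        def close_spider(self, spider):
            pass

        def retrieve_response(self, spider, request):
            # returning None signals a cache miss to the middleware
            return self.cache.get(request_fingerprint(request))

        def store_response(self, spider, request, response):
            self.cache[request_fingerprint(request)] = response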

scrapy/contrib/httpcache.py

@@ -1,11 +1,161 @@
 import os
-from time import time
 import cPickle as pickle
+from time import time
+from weakref import WeakKeyDictionary
+from email.utils import mktime_tz, parsedate_tz
 from w3lib.http import headers_raw_to_dict, headers_dict_to_raw
 from scrapy.http import Headers
 from scrapy.responsetypes import responsetypes
 from scrapy.utils.request import request_fingerprint
 from scrapy.utils.project import data_path
+from scrapy.utils.httpobj import urlparse_cached
class DummyPolicy(object):
def __init__(self, settings):
self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
self.ignore_http_codes = map(int, settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES'))
def should_cache_request(self, request):
return urlparse_cached(request).scheme not in self.ignore_schemes
def should_cache_response(self, response, request):
return response.status not in self.ignore_http_codes
def is_cached_response_fresh(self, response, request):
return True
def is_cached_response_valid(self, cachedresponse, response, request):
return True
class RFC2616Policy(object):
MAXAGE = 3600 * 24 * 365 # one year
def __init__(self, settings):
self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
self._cc_parsed = WeakKeyDictionary()
def _parse_cachecontrol(self, r):
if r not in self._cc_parsed:
cch = r.headers.get('Cache-Control', '')
self._cc_parsed[r] = parse_cachecontrol(cch)
return self._cc_parsed[r]
def should_cache_request(self, request):
if urlparse_cached(request).scheme in self.ignore_schemes:
return False
cc = self._parse_cachecontrol(request)
# obey user-agent directive "Cache-Control: no-store"
if 'no-store' in cc:
return False
# Any other is eligible for caching
return True
def should_cache_response(self, response, request):
# What is cacheable - http://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec14.9.1
# Response cacheability - http://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4
# Status code 206 is not included because the cache cannot deal with partial content
cc = self._parse_cachecontrol(response)
# obey directive "Cache-Control: no-store"
if 'no-store' in cc:
return False
# Never cache 304 (Not Modified) responses
elif response.status == 304:
return False
# Any hint on response expiration is good
elif 'max-age' in cc or 'Expires' in response.headers:
return True
# Firefox falls back to a one-year expiration for these statuses if none is set
elif response.status in (300, 301, 308):
return True
# Other statuses without expiration require at least one validator
elif response.status in (200, 203, 401):
return 'Last-Modified' in response.headers or 'ETag' in response.headers
# Anything else is probably not eligible for caching: it makes no sense
# to cache responses that contain no expiration info and cannot be revalidated
else:
return False
def is_cached_response_fresh(self, cachedresponse, request):
cc = self._parse_cachecontrol(cachedresponse)
ccreq = self._parse_cachecontrol(request)
if 'no-cache' in cc or 'no-cache' in ccreq:
return False
now = time()
freshnesslifetime = self._compute_freshness_lifetime(cachedresponse, request, now)
currentage = self._compute_current_age(cachedresponse, request, now)
if currentage < freshnesslifetime:
return True
# Cached response is stale, try to set validators if any
self._set_conditional_validators(request, cachedresponse)
return False
def is_cached_response_valid(self, cachedresponse, response, request):
return response.status == 304
def _set_conditional_validators(self, request, cachedresponse):
if 'Last-Modified' in cachedresponse.headers:
request.headers['If-Modified-Since'] = cachedresponse.headers['Last-Modified']
if 'ETag' in cachedresponse.headers:
request.headers['If-None-Match'] = cachedresponse.headers['ETag']
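For illustration, the validator hand-off above turns the retry of a stale cached response into a conditional request (a hypothetical snippet, not part of this commit; the Settings value shown is the only one the policy reads):

    from scrapy.http import Request, Response
    from scrapy.settings import Settings
    from scrapy.contrib.httpcache import RFC2616Policy

    policy = RFC2616Policy(Settings({'HTTPCACHE_IGNORE_SCHEMES': ['file']}))
    cached = Response('http://example.com', headers={'ETag': '"abc123"'})
    request = Request('http://example.com')
    # what is_cached_response_fresh() does on a stale hit:
    policy._set_conditional_validators(request, cached)
    assert request.headers['If-None-Match'] == '"abc123"'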
def _compute_freshness_lifetime(self, response, request, now):
# Reference nsHttpResponseHead::ComputeFreshnessLifetime
# http://dxr.mozilla.org/mozilla-central/netwerk/protocol/http/nsHttpResponseHead.cpp.html#l259
cc = self._parse_cachecontrol(response)
if 'max-age' in cc:
try:
return max(0, int(cc['max-age']))
except ValueError:
pass
# Parse date header or synthesize it if none exists
date = rfc1123_to_epoch(response.headers.get('Date')) or now
# Try HTTP/1.0 Expires header
if 'Expires' in response.headers:
expires = rfc1123_to_epoch(response.headers['Expires'])
# When parsing the Expires header fails, RFC 2616 section 14.21 says we
# should treat it as an expiration time in the past.
return max(0, expires - date) if expires else 0
# Fallback to a heuristic using the Last-Modified header.
# This is not in the RFC but mimics Firefox's caching implementation.
lastmodified = rfc1123_to_epoch(response.headers.get('Last-Modified'))
if lastmodified and lastmodified <= date:
return (date - lastmodified) / 10
# These responses can be cached indefinitely
if response.status in (300, 301, 308):
return self.MAXAGE
# Insufficient information to compute freshness lifetime
return 0
def _compute_current_age(self, response, request, now):
# Reference nsHttpResponseHead::ComputeCurrentAge
# http://dxr.mozilla.org/mozilla-central/netwerk/protocol/http/nsHttpResponseHead.cpp.html
currentage = 0
# If the Date header is not set, we assume it is a fast connection and the
# clock is in sync with the server
date = rfc1123_to_epoch(response.headers.get('Date')) or now
if now > date:
currentage = now - date
if 'Age' in response.headers:
try:
age = int(response.headers['Age'])
currentage = max(currentage, age)
except ValueError:
pass
return currentage
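A worked example of the two computations above (hypothetical, not part of this commit): with max-age=300 the freshness lifetime is 300 seconds, and an Age header of 299 pushes the current age to at least 299, so the response below stays fresh for only one more second:

    from email.utils import formatdate
    from scrapy.http import Response

    response = Response('http://example.com', headers={
        'Date': formatdate(usegmt=True),   # just received: age from Date is ~0
        'Age': '299',                      # a proxy says it is already 299s old
        'Cache-Control': 'max-age=300',    # freshness lifetime: 300s (beats Expires)
    })
    # _compute_freshness_lifetime(...) -> 300
    # _compute_current_age(...)        -> max(now - date, 299) = 299
    # 299 < 300, so is_cached_response_fresh(...) still returns True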
class DbmCacheStorage(object):
@@ -50,11 +200,13 @@ class DbmCacheStorage(object):
         key = self._request_key(request)
         db = self.db
         tkey = '%s_time' % key
-        if not db.has_key(tkey):
-            return # not found
+        if tkey not in db:
+            return  # not found
         ts = db[tkey]
         if 0 < self.expiration_secs < time() - float(ts):
-            return # expired
+            return  # expired
         return pickle.loads(db['%s_data' % key])
def _request_key(self, request):
@@ -129,3 +281,29 @@ class FilesystemCacheStorage(object):
return # expired
with open(metapath, 'rb') as f:
return pickle.load(f)
def parse_cachecontrol(header):
"""Parse Cache-Control header
http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9
    >>> parse_cachecontrol('public, max-age=3600')
    {'public': None, 'max-age': '3600'}
    >>> parse_cachecontrol('')
    {}
"""
directives = {}
for directive in header.split(','):
key, sep, val = directive.strip().partition('=')
if key:
directives[key.lower()] = val if sep else None
return directives
def rfc1123_to_epoch(date_str):
try:
return mktime_tz(parsedate_tz(date_str))
except Exception:
return None
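A quick interactive illustration of the two helpers above (hypothetical session; the date is arbitrary):

    >>> from scrapy.contrib.httpcache import parse_cachecontrol, rfc1123_to_epoch
    >>> parse_cachecontrol('no-store')
    {'no-store': None}
    >>> rfc1123_to_epoch('Tue, 08 Jan 2013 17:34:48 GMT')
    1357666488
    >>> rfc1123_to_epoch('not a date') is None
    True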

scrapy/settings/default_settings.py

@@ -143,6 +143,7 @@ HTTPCACHE_EXPIRATION_SECS = 0
 HTTPCACHE_IGNORE_HTTP_CODES = []
 HTTPCACHE_IGNORE_SCHEMES = ['file']
 HTTPCACHE_DBM_MODULE = 'anydbm'
+HTTPCACHE_POLICY = 'scrapy.contrib.httpcache.RFC2616Policy'

 ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'

scrapy/tests/test_downloadermiddleware_httpcache.py

@@ -2,6 +2,7 @@ import time
import tempfile
import shutil
import unittest
import email.utils
from contextlib import contextmanager
from scrapy.http import Response, HtmlResponse, Request
@@ -9,15 +10,18 @@ from scrapy.spider import BaseSpider
 from scrapy.settings import Settings
 from scrapy.exceptions import IgnoreRequest
 from scrapy.utils.test import get_crawler
-from scrapy.contrib.httpcache import FilesystemCacheStorage, DbmCacheStorage
 from scrapy.contrib.downloadermiddleware.httpcache import HttpCacheMiddleware


-class HttpCacheMiddlewareTest(unittest.TestCase):
+class _BaseTest(unittest.TestCase):

-    storage_class = DbmCacheStorage
+    storage_class = 'scrapy.contrib.httpcache.DbmCacheStorage'
+    policy_class = 'scrapy.contrib.httpcache.RFC2616Policy'
def setUp(self):
self.yesterday = email.utils.formatdate(time.time() - 86400)
self.today = email.utils.formatdate()
self.tomorrow = email.utils.formatdate(time.time() + 86400)
self.crawler = get_crawler()
self.spider = BaseSpider('example.com')
self.tmpdir = tempfile.mkdtemp()
@@ -39,19 +43,21 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
'HTTPCACHE_DIR': self.tmpdir,
'HTTPCACHE_EXPIRATION_SECS': 1,
'HTTPCACHE_IGNORE_HTTP_CODES': [],
'HTTPCACHE_POLICY': self.policy_class,
'HTTPCACHE_STORAGE': self.storage_class,
}
settings.update(new_settings)
return Settings(settings)
     @contextmanager
     def _storage(self, **new_settings):
-        settings = self._get_settings(**new_settings)
-        storage = self.storage_class(settings)
-        storage.open_spider(self.spider)
-        try:
-            yield storage
-        finally:
-            storage.close_spider(self.spider)
+        with self._middleware(**new_settings) as mw:
+            yield mw.storage
+
+    @contextmanager
+    def _policy(self, **new_settings):
+        with self._middleware(**new_settings) as mw:
+            yield mw.policy
@contextmanager
def _middleware(self, **new_settings):
@@ -63,6 +69,27 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
finally:
mw.spider_closed(self.spider)
def assertEqualResponse(self, response1, response2):
self.assertEqual(response1.url, response2.url)
self.assertEqual(response1.status, response2.status)
self.assertEqual(response1.headers, response2.headers)
self.assertEqual(response1.body, response2.body)
def assertEqualRequest(self, request1, request2):
self.assertEqual(request1.url, request2.url)
self.assertEqual(request1.headers, request2.headers)
self.assertEqual(request1.body, request2.body)
def assertEqualRequestButWithCacheValidators(self, request1, request2):
self.assertEqual(request1.url, request2.url)
assert 'If-None-Match' not in request1.headers
assert 'If-Modified-Since' not in request1.headers
assert any(h in request2.headers for h in ('If-None-Match', 'If-Modified-Since'))
self.assertEqual(request1.body, request2.body)
class DefaultStorageTest(_BaseTest):
def test_storage(self):
with self._storage() as storage:
request2 = self.request.copy()
@@ -83,11 +110,25 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
time.sleep(0.5)  # give it a chance to expire
assert storage.retrieve_response(self.spider, self.request)
class DbmStorageTest(DefaultStorageTest):
storage_class = 'scrapy.contrib.httpcache.DbmCacheStorage'
class FilesystemStorageTest(DefaultStorageTest):
storage_class = 'scrapy.contrib.httpcache.FilesystemCacheStorage'
class DummyPolicyTest(_BaseTest):
policy_class = 'scrapy.contrib.httpcache.DummyPolicy'
def test_middleware(self):
with self._middleware() as mw:
assert mw.process_request(self.request, self.spider) is None
mw.process_response(self.request, self.response, self.spider)
response = mw.process_request(self.request, self.spider)
assert isinstance(response, HtmlResponse)
self.assertEqualResponse(self.response, response)
@@ -97,10 +138,8 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
with self._middleware() as mw:
req = Request('http://host.com/path')
res = Response('http://host2.net/test.html')
assert mw.process_request(req, self.spider) is None
mw.process_response(req, res, self.spider)
cached = mw.process_request(req, self.spider)
assert isinstance(cached, Response)
self.assertEqualResponse(res, cached)
@@ -173,16 +212,174 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
         self.assertEqualResponse(self.response, response)
         assert 'cached' in response.flags

-    def assertEqualResponse(self, response1, response2):
-        self.assertEqual(response1.url, response2.url)
-        self.assertEqual(response1.status, response2.status)
-        self.assertEqual(response1.headers, response2.headers)
-        self.assertEqual(response1.body, response2.body)
-
-
-class FilesystemCacheStorageTest(HttpCacheMiddlewareTest):
+class RFC2616PolicyTest(DefaultStorageTest):

-    storage = FilesystemCacheStorage
+    policy_class = 'scrapy.contrib.httpcache.RFC2616Policy'
def _process_requestresponse(self, mw, request, response):
    result = None  # initialized so the except block can always print it
    try:
        result = mw.process_request(request, self.spider)
        if result:
            assert isinstance(result, (Request, Response))
            return result
        else:
            result = mw.process_response(request, response, self.spider)
            assert isinstance(result, Response)
            return result
    except Exception:
        print 'Request', request
        print 'Response', response
        print 'Result', result
        raise
def test_request_cacheability(self):
res0 = Response(self.request.url, status=200,
headers={'Expires': self.tomorrow})
req0 = Request('http://example.com')
req1 = req0.replace(headers={'Cache-Control': 'no-store'})
req2 = req0.replace(headers={'Cache-Control': 'no-cache'})
with self._middleware() as mw:
# response for a request with no-store must not be cached
res1 = self._process_requestresponse(mw, req1, res0)
self.assertEqualResponse(res1, res0)
assert mw.storage.retrieve_response(self.spider, req1) is None
# Re-do request without no-store and expect it to be cached
res2 = self._process_requestresponse(mw, req0, res0)
assert 'cached' not in res2.flags
res3 = mw.process_request(req0, self.spider)
assert 'cached' in res3.flags
self.assertEqualResponse(res2, res3)
# request with no-cache directive must not return cached response
# but it allows new response to be stored
res0b = res0.replace(body='foo')
res4 = self._process_requestresponse(mw, req2, res0b)
self.assertEqualResponse(res4, res0b)
assert 'cached' not in res4.flags
res5 = self._process_requestresponse(mw, req0, None)
self.assertEqualResponse(res5, res0b)
assert 'cached' in res5.flags
def test_response_cacheability(self):
responses = [
# 304 is not cacheable no matter what the server sends
(False, 304, {}),
(False, 304, {'Last-Modified': self.yesterday}),
(False, 304, {'Expires': self.tomorrow}),
(False, 304, {'Etag': 'bar'}),
(False, 304, {'Cache-Control': 'max-age=3600'}),
# Always obey no-store cache control
(False, 200, {'Cache-Control': 'no-store'}),
(False, 200, {'Cache-Control': 'no-store, max-age=300'}), # invalid
(False, 200, {'Cache-Control': 'no-store', 'Expires': self.tomorrow}), # invalid
# Ignore responses missing expiration and/or validation headers
(False, 200, {}),
(False, 302, {}),
(False, 307, {}),
(False, 404, {}),
# Cache responses with expiration and/or validation headers
(True, 200, {'Last-Modified': self.yesterday}),
(True, 203, {'Last-Modified': self.yesterday}),
(True, 300, {'Last-Modified': self.yesterday}),
(True, 301, {'Last-Modified': self.yesterday}),
(True, 401, {'Last-Modified': self.yesterday}),
(True, 404, {'Cache-Control': 'public, max-age=600'}),
(True, 302, {'Expires': self.tomorrow}),
(True, 200, {'Etag': 'foo'}),
]
with self._middleware() as mw:
for idx, (shouldcache, status, headers) in enumerate(responses):
req0 = Request('http://example-%d.com' % idx)
res0 = Response(req0.url, status=status, headers=headers)
res1 = self._process_requestresponse(mw, req0, res0)
res304 = res0.replace(status=304)
res2 = self._process_requestresponse(mw, req0, res304 if shouldcache else res0)
self.assertEqualResponse(res1, res0)
self.assertEqualResponse(res2, res0)
resc = mw.storage.retrieve_response(self.spider, req0)
if shouldcache:
self.assertEqualResponse(resc, res1)
assert 'cached' in res2.flags and res2.status != 304
else:
self.assertFalse(resc)
assert 'cached' not in res2.flags
def test_cached_and_fresh(self):
sampledata = [
(200, {'Date': self.yesterday, 'Expires': self.tomorrow}),
(200, {'Date': self.yesterday, 'Cache-Control': 'max-age=86405'}),
(200, {'Age': '299', 'Cache-Control': 'max-age=300'}),
# Obey max-age if present, over any other header
(200, {'Date': self.today,
'Age': '86405',
'Cache-Control': 'max-age=' + str(86400 * 3),
'Expires': self.yesterday,
'Last-Modified': self.yesterday,
}),
# obey Expires if max-age is not present
(200, {'Date': self.yesterday,
'Age': '86400',
'Cache-Control': 'public',
'Expires': self.tomorrow,
'Last-Modified': self.yesterday,
}),
# Default missing Date header to right now
(200, {'Expires': self.tomorrow}),
# Firefox - fresh while age is below 10% of (Date - Last-Modified)
(200, {'Date': self.today, 'Last-Modified': self.yesterday, 'Age': str(86400 / 10 - 1)}),
# Firefox - set a one-year max-age on permanent redirects that lack expiration info
(300, {}), (301, {}), (308, {}),
]
with self._middleware() as mw:
for idx, (status, headers) in enumerate(sampledata):
req0 = Request('http://example-%d.com' % idx)
res0 = Response(req0.url, status=status, headers=headers)
# cache fresh response
res1 = self._process_requestresponse(mw, req0, res0)
self.assertEqualResponse(res1, res0)
assert 'cached' not in res1.flags
# return fresh cached response without network interaction
res2 = self._process_requestresponse(mw, req0, None)
self.assertEqualResponse(res1, res2)
assert 'cached' in res2.flags
def test_cached_and_stale(self):
sampledata = [
(200, {'Date': self.today, 'Expires': self.yesterday}),
(200, {'Date': self.today, 'Expires': self.yesterday, 'Last-Modified': self.yesterday}),
(200, {'Expires': self.yesterday}),
(200, {'Expires': self.yesterday, 'ETag': 'foo'}),
(200, {'Expires': self.yesterday, 'Last-Modified': self.yesterday}),
(200, {'Expires': self.tomorrow, 'Age': '86405'}),
(200, {'Cache-Control': 'max-age=86400', 'Age': '86405'}),
# no-cache forces expiration, and also revalidation if validators exist
(200, {'Cache-Control': 'no-cache'}),
(200, {'Cache-Control': 'no-cache', 'ETag': 'foo'}),
(200, {'Cache-Control': 'no-cache', 'Last-Modified': self.yesterday}),
]
with self._middleware() as mw:
for idx, (status, headers) in enumerate(sampledata):
req0 = Request('http://example-%d.com' % idx)
res0a = Response(req0.url, status=status, headers=headers)
# cache expired response
res1 = self._process_requestresponse(mw, req0, res0a)
self.assertEqualResponse(res1, res0a)
assert 'cached' not in res1.flags
# Same request, but as the cached response is stale, a new response
# must be returned
res0b = res0a.replace(body='bar')
res2 = self._process_requestresponse(mw, req0, res0b)
self.assertEqualResponse(res2, res0b)
assert 'cached' not in res2.flags
# Previous response expired too, subsequent request to same
# resource must revalidate and succeed on 304 if validators
# are present
if 'ETag' in headers or 'Last-Modified' in headers:
res0c = res0b.replace(status=304)
res3 = self._process_requestresponse(mw, req0, res0c)
self.assertEqualResponse(res3, res0b)
assert 'cached' in res3.flags
if __name__ == '__main__':
unittest.main()