mirror of https://github.com/scrapy/scrapy.git
synced 2025-02-24 23:43:59 +00:00
Merge branch 'http-cache-middleware'
Conflicts:
	scrapy/contrib/downloadermiddleware/httpcache.py
	scrapy/contrib/httpcache.py
	scrapy/tests/test_downloadermiddleware_httpcache.py
This commit is contained in:
commit d8a760bf57
.gitignore (vendored)
@@ -4,3 +4,4 @@ dropin.cache
 docs/build
 *egg-info
 .tox
+venv

docs/topics/downloader-middleware.rst
@@ -285,6 +285,7 @@ HttpAuthMiddleware
 
 .. _Basic access authentication: http://en.wikipedia.org/wiki/Basic_access_authentication
 
+
 HttpCacheMiddleware
 -------------------
 
@@ -294,44 +295,88 @@ HttpCacheMiddleware
 .. class:: HttpCacheMiddleware
 
     This middleware provides low-level cache to all HTTP requests and responses.
-    Every request and its corresponding response are cached. When the same
-    request is seen again, the response is returned without transferring
-    anything from the Internet.
+    It has to be combined with a cache storage backend as well as a cache policy.
 
-The HTTP cache is useful for testing spiders faster (without having to wait for
-downloads every time) and for trying your spider offline, when an Internet
-connection is not available.
+Scrapy ships with two HTTP cache storage backends:
 
-Scrapy ships with two storage backends for the HTTP cache middleware:
+* :ref:`httpcache-storage-dbm`
+* :ref:`httpcache-storage-fs`
 
-* :ref:`httpcache-dbm-backend`
-* :ref:`httpcache-fs-backend`
+You can change the HTTP cache storage backend with the :setting:`HTTPCACHE_STORAGE`
+setting. Or you can also implement your own storage backend.
 
-You can change the storage backend with the :setting:`HTTPCACHE_STORAGE`
-setting. Or you can also implement your own backend.
+Scrapy ships with two HTTP cache policies:
 
-.. _httpcache-dbm-backend:
+* :ref:`httpcache-policy-rfc2616`
+* :ref:`httpcache-policy-dummy`
+
+You can change the HTTP cache policy with the :setting:`HTTPCACHE_POLICY`
+setting. Or you can also implement your own policy.
+
+.. _httpcache-policy-rfc2616:
+
+RFC2616 policy (default)
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+This policy provides a RFC2616 compliant HTTP cache, i.e. with HTTP
+Cache-Control awareness, aimed at production and used in continuous
+runs to avoid downloading unmodified data (to save bandwidth and speed up crawls).
+
+In order to use this policy, set:
+
+* :setting:`HTTPCACHE_POLICY` to ``scrapy.contrib.httpcache.RFC2616Policy``
+
+This is the default cache policy.
+
+.. _httpcache-policy-dummy:
+
+Dummy policy
+~~~~~~~~~~~~
+
+This policy has no awareness of any HTTP Cache-Control directives.
+Every request and its corresponding response are cached. When the same
+request is seen again, the response is returned without transferring
+anything from the Internet.
+
+The Dummy policy is useful for testing spiders faster (without having
+to wait for downloads every time) and for trying your spider offline,
+when an Internet connection is not available. The goal is to be able to
+"replay" a spider run *exactly as it ran before*.
+
+In order to use this policy, set:
+
+* :setting:`HTTPCACHE_POLICY` to ``scrapy.contrib.httpcache.DummyPolicy``
+
+.. _httpcache-storage-dbm:
 
 DBM storage backend (default)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. versionadded:: 0.13
 
-A DBM_ storage backend is available for the HTTP cache middleware. To use it
-(note: it is the default storage backend) set :setting:`HTTPCACHE_STORAGE`
-to ``scrapy.contrib.httpcache.DbmCacheStorage``.
+A DBM_ storage backend is available for the HTTP cache middleware.
 
 By default, it uses the anydbm_ module, but you can change it with the
 :setting:`HTTPCACHE_DBM_MODULE` setting.
 
-.. _httpcache-fs-backend:
+In order to use this storage backend, set:
 
-File system backend
-~~~~~~~~~~~~~~~~~~~
+* :setting:`HTTPCACHE_STORAGE` to ``scrapy.contrib.httpcache.DbmCacheStorage``
+
+.. _httpcache-storage-fs:
+
+Filesystem storage backend
+~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 A file system storage backend is also available for the HTTP cache middleware.
-To use it (instead of the default DBM_ storage backend) set :setting:`HTTPCACHE_STORAGE`
-to ``scrapy.contrib.downloadermiddleware.httpcache.FilesystemCacheStorage``.
+
+In order to use this storage backend, set:
+
+* :setting:`HTTPCACHE_STORAGE` to ``scrapy.contrib.httpcache.FilesystemCacheStorage``
 
 Each request/response pair is stored in a different directory containing
 the following files:
@@ -352,6 +397,7 @@ inefficient in many file systems). An example directory could be::
 
    /path/to/cache/dir/example.com/72/72811f648e718090f041317756c03adb0ada46c7
 
+
 HTTPCache middleware settings
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -450,6 +496,17 @@ Default: ``'anydbm'``
 The database module to use in the :ref:`DBM storage backend
 <httpcache-dbm-backend>`. This setting is specific to the DBM backend.
 
+.. setting:: HTTPCACHE_POLICY
+
+HTTPCACHE_POLICY
+^^^^^^^^^^^^^^^^
+
+.. versionadded:: 0.18
+
+Default: ``'scrapy.contrib.httpcache.RFC2616Policy'``
+
+The class which implements the cache policy.
+
 HttpCompressionMiddleware
 -------------------------
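
Taken together, the settings documented in this hunk can be combined in a project's ``settings.py``. A minimal sketch, assuming the defaults introduced by this branch (the values shown are illustrative, not requirements)::

    # settings.py -- example HTTP cache configuration (illustrative values)
    HTTPCACHE_ENABLED = True
    HTTPCACHE_POLICY = 'scrapy.contrib.httpcache.RFC2616Policy'     # the new default
    HTTPCACHE_STORAGE = 'scrapy.contrib.httpcache.DbmCacheStorage'  # default storage
    HTTPCACHE_DBM_MODULE = 'anydbm'    # default DBM module
    HTTPCACHE_EXPIRATION_SECS = 0      # 0 disables storage-side expiration
    HTTPCACHE_IGNORE_SCHEMES = ['file']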

scrapy/contrib/downloadermiddleware/httpcache.py
@@ -1,7 +1,11 @@
+from time import time
+from email.utils import formatdate
+from weakref import WeakKeyDictionary
 from scrapy import signals
 from scrapy.exceptions import NotConfigured, IgnoreRequest
 from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.misc import load_object
+from scrapy.contrib.httpcache import rfc1123_to_epoch, parse_cachecontrol
 
 
 class HttpCacheMiddleware(object):
@@ -9,10 +13,9 @@ class HttpCacheMiddleware(object):
     def __init__(self, settings, stats):
         if not settings.getbool('HTTPCACHE_ENABLED'):
             raise NotConfigured
+        self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
         self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
         self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
-        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
-        self.ignore_http_codes = map(int, settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES'))
         self.stats = stats
 
     @classmethod
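
For context, ``load_object`` imports the dotted path named by the setting and returns the class, which the middleware then instantiates with the settings object. A minimal sketch of that mechanism, assuming this branch is importable (setting values here are illustrative)::

    from scrapy.settings import Settings
    from scrapy.utils.misc import load_object

    settings = Settings({'HTTPCACHE_IGNORE_SCHEMES': ['file'],
                         'HTTPCACHE_IGNORE_HTTP_CODES': [404]})
    # same call chain as HttpCacheMiddleware.__init__ above
    policy_cls = load_object('scrapy.contrib.httpcache.DummyPolicy')
    policy = policy_cls(settings)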
@@ -29,36 +32,65 @@ class HttpCacheMiddleware(object):
         self.storage.close_spider(spider)
 
     def process_request(self, request, spider):
-        if not self.is_cacheable(request):
+        # Skip uncacheable requests
+        if not self.policy.should_cache_request(request):
+            request.meta['_dont_cache'] = True  # flag as uncacheable
             return
-        response = self.storage.retrieve_response(spider, request)
-        if response and self.is_cacheable_response(response):
-            response.flags.append('cached')
-            self.stats.inc_value('httpcache/hits', spider=spider)
-            return response
 
-        self.stats.inc_value('httpcache/misses', spider=spider)
-        if self.ignore_missing:
-            raise IgnoreRequest("Ignored request not in cache: %s" % request)
+        # Look for cached response and check if expired
+        cachedresponse = self.storage.retrieve_response(spider, request)
+        if cachedresponse is None:
+            self.stats.inc_value('httpcache/miss', spider=spider)
+            if self.ignore_missing:
+                self.stats.inc_value('httpcache/ignore', spider=spider)
+                raise IgnoreRequest("Ignored request not in cache: %s" % request)
+            return  # first time request
+
+        # Return cached response only if not expired
+        cachedresponse.flags.append('cached')
+        if self.policy.is_cached_response_fresh(cachedresponse, request):
+            self.stats.inc_value('httpcache/hit', spider=spider)
+            return cachedresponse
+
+        # Keep a reference to cached response to avoid a second cache lookup on
+        # process_response hook
+        request.meta['cached_response'] = cachedresponse
 
     def process_response(self, request, response, spider):
-        if (self.is_cacheable(request)
-            and self.is_cacheable_response(response)
-            and 'cached' not in response.flags):
-            self.storage.store_response(spider, request, response)
-            self.stats.inc_value('httpcache/store', spider=spider)
+        # Skip cached responses and uncacheable requests
+        if 'cached' in response.flags or '_dont_cache' in request.meta:
+            request.meta.pop('_dont_cache', None)
+            return response
+
+        # RFC2616 requires origin server to set Date header,
+        # http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18
+        if 'Date' not in response.headers:
+            response.headers['Date'] = formatdate(usegmt=1)
+
+        # Do not validate first-hand responses
+        cachedresponse = request.meta.pop('cached_response', None)
+        if cachedresponse is None:
+            self.stats.inc_value('httpcache/firsthand', spider=spider)
+            self._cache_response(spider, response, request, cachedresponse)
+            return response
+
+        if self.policy.is_cached_response_valid(cachedresponse, response, request):
+            self.stats.inc_value('httpcache/revalidate', spider=spider)
+            return cachedresponse
+
+        self.stats.inc_value('httpcache/invalidate', spider=spider)
+        self._cache_response(spider, response, request, cachedresponse)
         return response
 
-    def is_cacheable_response(self, response):
-        return response.status not in self.ignore_http_codes
-
-    def is_cacheable(self, request):
-        return urlparse_cached(request).scheme not in self.ignore_schemes
+    def _cache_response(self, spider, response, request, cachedresponse):
+        if self.policy.should_cache_response(response, request):
+            self.stats.inc_value('httpcache/store', spider=spider)
+            self.storage.store_response(spider, request, response)
+        else:
+            self.stats.inc_value('httpcache/uncacheable', spider=spider)
 
 
 from scrapy.contrib.httpcache import FilesystemCacheStorage as _FilesystemCacheStorage
 
 
 class FilesystemCacheStorage(_FilesystemCacheStorage):
 
     def __init__(self, *args, **kwargs):
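
The rewritten hooks above pin down the policy contract: ``should_cache_request`` and ``should_cache_response`` gate writes to storage, while ``is_cached_response_fresh`` and ``is_cached_response_valid`` gate cache hits and revalidation. A hypothetical custom policy (not part of this commit) only needs those four methods, e.g.::

    from scrapy.utils.httpobj import urlparse_cached

    class GetOnlyPolicy(object):
        """Hypothetical policy: only cache successful GET responses."""

        def __init__(self, settings):
            self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')

        def should_cache_request(self, request):
            return (request.method == 'GET' and
                    urlparse_cached(request).scheme not in self.ignore_schemes)

        def should_cache_response(self, response, request):
            return response.status == 200

        def is_cached_response_fresh(self, cachedresponse, request):
            return True  # always serve hits without revalidating

        def is_cached_response_valid(self, cachedresponse, response, request):
            return True  # keep the cached copy on any revalidation

    # enabled via: HTTPCACHE_POLICY = 'myproject.policies.GetOnlyPolicy'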

scrapy/contrib/httpcache.py
@@ -1,11 +1,161 @@
 import os
-from time import time
 import cPickle as pickle
+from time import time
+from weakref import WeakKeyDictionary
+from email.utils import mktime_tz, parsedate_tz
 from w3lib.http import headers_raw_to_dict, headers_dict_to_raw
 from scrapy.http import Headers
 from scrapy.responsetypes import responsetypes
 from scrapy.utils.request import request_fingerprint
 from scrapy.utils.project import data_path
+from scrapy.utils.httpobj import urlparse_cached
+
+
+class DummyPolicy(object):
+
+    def __init__(self, settings):
+        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
+        self.ignore_http_codes = map(int, settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES'))
+
+    def should_cache_request(self, request):
+        return urlparse_cached(request).scheme not in self.ignore_schemes
+
+    def should_cache_response(self, response, request):
+        return response.status not in self.ignore_http_codes
+
+    def is_cached_response_fresh(self, response, request):
+        return True
+
+    def is_cached_response_valid(self, cachedresponse, response, request):
+        return True
+
+
+class RFC2616Policy(object):
+
+    MAXAGE = 3600 * 24 * 365  # one year
+
+    def __init__(self, settings):
+        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
+        self._cc_parsed = WeakKeyDictionary()
+
+    def _parse_cachecontrol(self, r):
+        if r not in self._cc_parsed:
+            cch = r.headers.get('Cache-Control', '')
+            self._cc_parsed[r] = parse_cachecontrol(cch)
+        return self._cc_parsed[r]
+
+    def should_cache_request(self, request):
+        if urlparse_cached(request).scheme in self.ignore_schemes:
+            return False
+        cc = self._parse_cachecontrol(request)
+        # obey user-agent directive "Cache-Control: no-store"
+        if 'no-store' in cc:
+            return False
+        # Any other is eligible for caching
+        return True
+
+    def should_cache_response(self, response, request):
+        # What is cacheable - http://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec14.9.1
+        # Response cacheability - http://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4
+        # Status code 206 is not included because cache can not deal with partial contents
+        cc = self._parse_cachecontrol(response)
+        # obey directive "Cache-Control: no-store"
+        if 'no-store' in cc:
+            return False
+        # Never cache 304 (Not Modified) responses
+        elif response.status == 304:
+            return False
+        # Any hint on response expiration is good
+        elif 'max-age' in cc or 'Expires' in response.headers:
+            return True
+        # Firefox fallbacks this statuses to one year expiration if none is set
+        elif response.status in (300, 301, 308):
+            return True
+        # Other statuses without expiration requires at least one validator
+        elif response.status in (200, 203, 401):
+            return 'Last-Modified' in response.headers or 'ETag' in response.headers
+        # Any other is probably not eligible for caching
+        # Makes no sense to cache responses that does not contain expiration
+        # info and can not be revalidated
+        else:
+            return False
+
+    def is_cached_response_fresh(self, cachedresponse, request):
+        cc = self._parse_cachecontrol(cachedresponse)
+        ccreq = self._parse_cachecontrol(request)
+        if 'no-cache' in cc or 'no-cache' in ccreq:
+            return False
+
+        now = time()
+        freshnesslifetime = self._compute_freshness_lifetime(cachedresponse, request, now)
+        currentage = self._compute_current_age(cachedresponse, request, now)
+        if currentage < freshnesslifetime:
+            return True
+        # Cached response is stale, try to set validators if any
+        self._set_conditional_validators(request, cachedresponse)
+        return False
+
+    def is_cached_response_valid(self, cachedresponse, response, request):
+        return response.status == 304
+
+    def _set_conditional_validators(self, request, cachedresponse):
+        if 'Last-Modified' in cachedresponse.headers:
+            request.headers['If-Modified-Since'] = cachedresponse.headers['Last-Modified']
+
+        if 'ETag' in cachedresponse.headers:
+            request.headers['If-None-Match'] = cachedresponse.headers['ETag']
+
+    def _compute_freshness_lifetime(self, response, request, now):
+        # Reference nsHttpResponseHead::ComputeFresshnessLifetime
+        # http://dxr.mozilla.org/mozilla-central/netwerk/protocol/http/nsHttpResponseHead.cpp.html#l259
+        cc = self._parse_cachecontrol(response)
+        if 'max-age' in cc:
+            try:
+                return max(0, int(cc['max-age']))
+            except ValueError:
+                pass
+
+        # Parse date header or synthesize it if none exists
+        date = rfc1123_to_epoch(response.headers.get('Date')) or now
+
+        # Try HTTP/1.0 Expires header
+        if 'Expires' in response.headers:
+            expires = rfc1123_to_epoch(response.headers['Expires'])
+            # When parsing Expires header fails RFC 2616 section 14.21 says we
+            # should treat this as an expiration time in the past.
+            return max(0, expires - date) if expires else 0
+
+        # Fallback to heuristic using last-modified header
+        # This is not in RFC but on Firefox caching implementation
+        lastmodified = rfc1123_to_epoch(response.headers.get('Last-Modified'))
+        if lastmodified and lastmodified <= date:
+            return (date - lastmodified) / 10
+
+        # This request can be cached indefinitely
+        if response.status in (300, 301, 308):
+            return self.MAXAGE
+
+        # Insufficient information to compute fresshness lifetime
+        return 0
+
+    def _compute_current_age(self, response, request, now):
+        # Reference nsHttpResponseHead::ComputeCurrentAge
+        # http://dxr.mozilla.org/mozilla-central/netwerk/protocol/http/nsHttpResponseHead.cpp.html
+        currentage = 0
+        # If Date header is not set we assume it is a fast connection, and
+        # clock is in sync with the server
+        date = rfc1123_to_epoch(response.headers.get('Date')) or now
+        if now > date:
+            currentage = now - date
+
+        if 'Age' in response.headers:
+            try:
+                age = int(response.headers['Age'])
+                currentage = max(currentage, age)
+            except ValueError:
+                pass
+
+        return currentage
 
 
 class DbmCacheStorage(object):
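
To make the freshness arithmetic above concrete, here is a small self-contained sketch of the same computation (reimplemented inline for illustration; the header values are made up)::

    from email.utils import formatdate, mktime_tz, parsedate_tz
    from time import time

    now = time()
    date = mktime_tz(parsedate_tz(formatdate(now - 100)))  # Date: 100s ago

    # max-age takes precedence over Expires in _compute_freshness_lifetime
    freshness_lifetime = 300           # from "Cache-Control: max-age=300"

    # _compute_current_age: start from now - Date, then let Age raise it
    current_age = max(now - date, 30)  # "Age: 30" loses to the 100s clock gap

    print current_age < freshness_lifetime  # True: still fresh, cache hit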
@@ -50,11 +200,13 @@ class DbmCacheStorage(object):
         key = self._request_key(request)
         db = self.db
         tkey = '%s_time' % key
-        if not db.has_key(tkey):
+        if tkey not in db:
             return  # not found
+
         ts = db[tkey]
         if 0 < self.expiration_secs < time() - float(ts):
             return  # expired
+
         return pickle.loads(db['%s_data' % key])
 
     def _request_key(self, request):
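
As this hunk shows, each cached response occupies a pair of DBM keys derived from the request fingerprint: ``<fingerprint>_time`` (the storage timestamp checked against ``HTTPCACHE_EXPIRATION_SECS``) and ``<fingerprint>_data`` (the pickled payload). A standalone sketch of that layout, with made-up values::

    import anydbm
    import cPickle as pickle
    from time import time

    db = anydbm.open('/tmp/example_httpcache.db', 'c')
    fp = '72811f648e718090f041317756c03adb0ada46c7'  # a request fingerprint
    db['%s_time' % fp] = str(time())
    db['%s_data' % fp] = pickle.dumps({'status': 200, 'body': '...'}, 2)

    # retrieval mirrors DbmCacheStorage.retrieve_response above
    if '%s_time' % fp in db:
        data = pickle.loads(db['%s_data' % fp])
    db.close()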
@@ -129,3 +281,29 @@ class FilesystemCacheStorage(object):
             return  # expired
         with open(metapath, 'rb') as f:
             return pickle.load(f)
+
+
+def parse_cachecontrol(header):
+    """Parse Cache-Control header
+
+    http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9
+
+    >>> parse_cachecontrol('public, max-age=3600')
+    {'public': None, 'max-age': '3600'}
+    >>> parse_cachecontrol('')
+    {}
+
+    """
+    directives = {}
+    for directive in header.split(','):
+        key, sep, val = directive.strip().partition('=')
+        if key:
+            directives[key.lower()] = val if sep else None
+    return directives
+
+
+def rfc1123_to_epoch(date_str):
+    try:
+        return mktime_tz(parsedate_tz(date_str))
+    except Exception:
+        return None
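
Assuming this branch is installed, the two new module-level helpers can be exercised directly; a quick interactive sketch (dict key order may vary)::

    from scrapy.contrib.httpcache import parse_cachecontrol, rfc1123_to_epoch

    print parse_cachecontrol('public, max-age=3600')
    # {'public': None, 'max-age': '3600'}
    print parse_cachecontrol('')
    # {}
    print rfc1123_to_epoch('Thu, 01 Jan 1970 00:00:00 GMT')
    # 0
    print rfc1123_to_epoch('not a date')  # parse failures yield None
    # None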

scrapy/settings/default_settings.py
@@ -143,6 +143,7 @@ HTTPCACHE_EXPIRATION_SECS = 0
 HTTPCACHE_IGNORE_HTTP_CODES = []
 HTTPCACHE_IGNORE_SCHEMES = ['file']
 HTTPCACHE_DBM_MODULE = 'anydbm'
+HTTPCACHE_POLICY = 'scrapy.contrib.httpcache.RFC2616Policy'
 
 ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'
 

scrapy/tests/test_downloadermiddleware_httpcache.py
@@ -2,6 +2,7 @@ import time
 import tempfile
 import shutil
 import unittest
+import email.utils
 from contextlib import contextmanager
 
 from scrapy.http import Response, HtmlResponse, Request
@@ -9,15 +10,18 @@ from scrapy.spider import BaseSpider
 from scrapy.settings import Settings
 from scrapy.exceptions import IgnoreRequest
 from scrapy.utils.test import get_crawler
-from scrapy.contrib.httpcache import FilesystemCacheStorage, DbmCacheStorage
 from scrapy.contrib.downloadermiddleware.httpcache import HttpCacheMiddleware
 
 
-class HttpCacheMiddlewareTest(unittest.TestCase):
+class _BaseTest(unittest.TestCase):
 
-    storage_class = DbmCacheStorage
+    storage_class = 'scrapy.contrib.httpcache.DbmCacheStorage'
+    policy_class = 'scrapy.contrib.httpcache.RFC2616Policy'
 
     def setUp(self):
+        self.yesterday = email.utils.formatdate(time.time() - 86400)
+        self.today = email.utils.formatdate()
+        self.tomorrow = email.utils.formatdate(time.time() + 86400)
         self.crawler = get_crawler()
         self.spider = BaseSpider('example.com')
         self.tmpdir = tempfile.mkdtemp()
@@ -39,19 +43,21 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
             'HTTPCACHE_DIR': self.tmpdir,
             'HTTPCACHE_EXPIRATION_SECS': 1,
             'HTTPCACHE_IGNORE_HTTP_CODES': [],
+            'HTTPCACHE_POLICY': self.policy_class,
+            'HTTPCACHE_STORAGE': self.storage_class,
         }
         settings.update(new_settings)
         return Settings(settings)
 
     @contextmanager
     def _storage(self, **new_settings):
-        settings = self._get_settings(**new_settings)
-        storage = self.storage_class(settings)
-        storage.open_spider(self.spider)
-        try:
-            yield storage
-        finally:
-            storage.close_spider(self.spider)
+        with self._middleware(**new_settings) as mw:
+            yield mw.storage
+
+    @contextmanager
+    def _policy(self, **new_settings):
+        with self._middleware(**new_settings) as mw:
+            yield mw.policy
 
     @contextmanager
     def _middleware(self, **new_settings):
@@ -63,6 +69,27 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
         finally:
             mw.spider_closed(self.spider)
 
+    def assertEqualResponse(self, response1, response2):
+        self.assertEqual(response1.url, response2.url)
+        self.assertEqual(response1.status, response2.status)
+        self.assertEqual(response1.headers, response2.headers)
+        self.assertEqual(response1.body, response2.body)
+
+    def assertEqualRequest(self, request1, request2):
+        self.assertEqual(request1.url, request2.url)
+        self.assertEqual(request1.headers, request2.headers)
+        self.assertEqual(request1.body, request2.body)
+
+    def assertEqualRequestButWithCacheValidators(self, request1, request2):
+        self.assertEqual(request1.url, request2.url)
+        assert not 'If-None-Match' in request1.headers
+        assert not 'If-Modified-Since' in request1.headers
+        assert any(h in request2.headers for h in ('If-None-Match', 'If-Modified-Since'))
+        self.assertEqual(request1.body, request2.body)
+
+
+class DefaultStorageTest(_BaseTest):
+
     def test_storage(self):
         with self._storage() as storage:
             request2 = self.request.copy()
@@ -83,11 +110,25 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
             time.sleep(0.5)  # give the chance to expire
             assert storage.retrieve_response(self.spider, self.request)
 
+
+class DbmStorageTest(DefaultStorageTest):
+
+    storage_class = 'scrapy.contrib.httpcache.DbmCacheStorage'
+
+
+class FilesystemStorageTest(DefaultStorageTest):
+
+    storage_class = 'scrapy.contrib.httpcache.FilesystemCacheStorage'
+
+
+class DummyPolicyTest(_BaseTest):
+
+    policy_class = 'scrapy.contrib.httpcache.DummyPolicy'
+
     def test_middleware(self):
         with self._middleware() as mw:
             assert mw.process_request(self.request, self.spider) is None
             mw.process_response(self.request, self.response, self.spider)
 
             response = mw.process_request(self.request, self.spider)
             assert isinstance(response, HtmlResponse)
             self.assertEqualResponse(self.response, response)
@@ -97,10 +138,8 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
         with self._middleware() as mw:
             req = Request('http://host.com/path')
             res = Response('http://host2.net/test.html')
-
             assert mw.process_request(req, self.spider) is None
             mw.process_response(req, res, self.spider)
-
             cached = mw.process_request(req, self.spider)
             assert isinstance(cached, Response)
             self.assertEqualResponse(res, cached)
@@ -173,16 +212,174 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
             self.assertEqualResponse(self.response, response)
             assert 'cached' in response.flags
 
-    def assertEqualResponse(self, response1, response2):
-        self.assertEqual(response1.url, response2.url)
-        self.assertEqual(response1.status, response2.status)
-        self.assertEqual(response1.headers, response2.headers)
-        self.assertEqual(response1.body, response2.body)
-
-
-class FilesystemCacheStorageTest(HttpCacheMiddlewareTest):
-
-    storage = FilesystemCacheStorage
+
+class RFC2616PolicyTest(DefaultStorageTest):
+
+    policy_class = 'scrapy.contrib.httpcache.RFC2616Policy'
+
+    def _process_requestresponse(self, mw, request, response):
+        try:
+            result = mw.process_request(request, self.spider)
+            if result:
+                assert isinstance(result, (Request, Response))
+                return result
+            else:
+                result = mw.process_response(request, response, self.spider)
+                assert isinstance(result, Response)
+                return result
+        except Exception:
+            print 'Request', request
+            print 'Response', response
+            print 'Result', result
+            raise
+
+    def test_request_cacheability(self):
+        res0 = Response(self.request.url, status=200,
+                        headers={'Expires': self.tomorrow})
+        req0 = Request('http://example.com')
+        req1 = req0.replace(headers={'Cache-Control': 'no-store'})
+        req2 = req0.replace(headers={'Cache-Control': 'no-cache'})
+        with self._middleware() as mw:
+            # response for a request with no-store must not be cached
+            res1 = self._process_requestresponse(mw, req1, res0)
+            self.assertEqualResponse(res1, res0)
+            assert mw.storage.retrieve_response(self.spider, req1) is None
+            # Re-do request without no-store and expect it to be cached
+            res2 = self._process_requestresponse(mw, req0, res0)
+            assert 'cached' not in res2.flags
+            res3 = mw.process_request(req0, self.spider)
+            assert 'cached' in res3.flags
+            self.assertEqualResponse(res2, res3)
+            # request with no-cache directive must not return cached response
+            # but it allows new response to be stored
+            res0b = res0.replace(body='foo')
+            res4 = self._process_requestresponse(mw, req2, res0b)
+            self.assertEqualResponse(res4, res0b)
+            assert 'cached' not in res4.flags
+            res5 = self._process_requestresponse(mw, req0, None)
+            self.assertEqualResponse(res5, res0b)
+            assert 'cached' in res5.flags
+
+    def test_response_cacheability(self):
+        responses = [
+            # 304 is not cacheable no matter what servers sends
+            (False, 304, {}),
+            (False, 304, {'Last-Modified': self.yesterday}),
+            (False, 304, {'Expires': self.tomorrow}),
+            (False, 304, {'Etag': 'bar'}),
+            (False, 304, {'Cache-Control': 'max-age=3600'}),
+            # Always obey no-store cache control
+            (False, 200, {'Cache-Control': 'no-store'}),
+            (False, 200, {'Cache-Control': 'no-store, max-age=300'}),  # invalid
+            (False, 200, {'Cache-Control': 'no-store', 'Expires': self.tomorrow}),  # invalid
+            # Ignore responses missing expiration and/or validation headers
+            (False, 200, {}),
+            (False, 302, {}),
+            (False, 307, {}),
+            (False, 404, {}),
+            # Cache responses with expiration and/or validation headers
+            (True, 200, {'Last-Modified': self.yesterday}),
+            (True, 203, {'Last-Modified': self.yesterday}),
+            (True, 300, {'Last-Modified': self.yesterday}),
+            (True, 301, {'Last-Modified': self.yesterday}),
+            (True, 401, {'Last-Modified': self.yesterday}),
+            (True, 404, {'Cache-Control': 'public, max-age=600'}),
+            (True, 302, {'Expires': self.tomorrow}),
+            (True, 200, {'Etag': 'foo'}),
+        ]
+        with self._middleware() as mw:
+            for idx, (shouldcache, status, headers) in enumerate(responses):
+                req0 = Request('http://example-%d.com' % idx)
+                res0 = Response(req0.url, status=status, headers=headers)
+                res1 = self._process_requestresponse(mw, req0, res0)
+                res304 = res0.replace(status=304)
+                res2 = self._process_requestresponse(mw, req0, res304 if shouldcache else res0)
+                self.assertEqualResponse(res1, res0)
+                self.assertEqualResponse(res2, res0)
+                resc = mw.storage.retrieve_response(self.spider, req0)
+                if shouldcache:
+                    self.assertEqualResponse(resc, res1)
+                    assert 'cached' in res2.flags and res2.status != 304
+                else:
+                    self.assertFalse(resc)
+                    assert 'cached' not in res2.flags
+
+    def test_cached_and_fresh(self):
+        sampledata = [
+            (200, {'Date': self.yesterday, 'Expires': self.tomorrow}),
+            (200, {'Date': self.yesterday, 'Cache-Control': 'max-age=86405'}),
+            (200, {'Age': '299', 'Cache-Control': 'max-age=300'}),
+            # Obey max-age if present over any others
+            (200, {'Date': self.today,
+                   'Age': '86405',
+                   'Cache-Control': 'max-age=' + str(86400 * 3),
+                   'Expires': self.yesterday,
+                   'Last-Modified': self.yesterday,
+                   }),
+            # obey Expires if max-age is not present
+            (200, {'Date': self.yesterday,
+                   'Age': '86400',
+                   'Cache-Control': 'public',
+                   'Expires': self.tomorrow,
+                   'Last-Modified': self.yesterday,
+                   }),
+            # Default missing Date header to right now
+            (200, {'Expires': self.tomorrow}),
+            # Firefox - Expires if age is greater than 10% of (Date - Last-Modified)
+            (200, {'Date': self.today, 'Last-Modified': self.yesterday, 'Age': str(86400 / 10 - 1)}),
+            # Firefox - Set one year maxage to permanent redirects missing expiration info
+            (300, {}), (301, {}), (308, {}),
+        ]
+        with self._middleware() as mw:
+            for idx, (status, headers) in enumerate(sampledata):
+                req0 = Request('http://example-%d.com' % idx)
+                res0 = Response(req0.url, status=status, headers=headers)
+                # cache fresh response
+                res1 = self._process_requestresponse(mw, req0, res0)
+                self.assertEqualResponse(res1, res0)
+                assert 'cached' not in res1.flags
+                # return fresh cached response without network interaction
+                res2 = self._process_requestresponse(mw, req0, None)
+                self.assertEqualResponse(res1, res2)
+                assert 'cached' in res2.flags
+
+    def test_cached_and_stale(self):
+        sampledata = [
+            (200, {'Date': self.today, 'Expires': self.yesterday}),
+            (200, {'Date': self.today, 'Expires': self.yesterday, 'Last-Modified': self.yesterday}),
+            (200, {'Expires': self.yesterday}),
+            (200, {'Expires': self.yesterday, 'ETag': 'foo'}),
+            (200, {'Expires': self.yesterday, 'Last-Modified': self.yesterday}),
+            (200, {'Expires': self.tomorrow, 'Age': '86405'}),
+            (200, {'Cache-Control': 'max-age=86400', 'Age': '86405'}),
+            # no-cache forces expiration, also revalidation if validators exists
+            (200, {'Cache-Control': 'no-cache'}),
+            (200, {'Cache-Control': 'no-cache', 'ETag': 'foo'}),
+            (200, {'Cache-Control': 'no-cache', 'Last-Modified': self.yesterday}),
+        ]
+        with self._middleware() as mw:
+            for idx, (status, headers) in enumerate(sampledata):
+                req0 = Request('http://example-%d.com' % idx)
+                res0a = Response(req0.url, status=status, headers=headers)
+                # cache expired response
+                res1 = self._process_requestresponse(mw, req0, res0a)
+                self.assertEqualResponse(res1, res0a)
+                assert 'cached' not in res1.flags
+                # Same request but as cached response is stale a new response must
+                # be returned
+                res0b = res0a.replace(body='bar')
+                res2 = self._process_requestresponse(mw, req0, res0b)
+                self.assertEqualResponse(res2, res0b)
+                assert 'cached' not in res2.flags
+                # Previous response expired too, subsequent request to same
+                # resource must revalidate and succeed on 304 if validators
+                # are present
+                if 'ETag' in headers or 'Last-Modified' in headers:
+                    res0c = res0b.replace(status=304)
+                    res3 = self._process_requestresponse(mw, req0, res0c)
+                    self.assertEqualResponse(res3, res0b)
+                    assert 'cached' in res3.flags
 
 
 if __name__ == '__main__':
     unittest.main()
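
Because the module keeps its standard ``if __name__ == '__main__'`` guard, the reorganized suite can still be run directly as a script (e.g. ``python scrapy/tests/test_downloadermiddleware_httpcache.py`` from a source checkout).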