1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-25 03:43:58 +00:00
This commit is contained in:
Pablo Hoffman 2010-07-13 19:47:55 -03:00
commit 90a04f0530
5 changed files with 37 additions and 4 deletions

View File

@ -276,6 +276,17 @@ Number of seconds to use for HTTP cache expiration. Requests that were cached
before this time will be re-downloaded. If zero, cached requests will always
expire. A negative number means requests will never expire.
.. setting:: HTTPCACHE_IGNORE_HTTP_CODES
HTTPCACHE_IGNORE_HTTP_CODES
^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. versionadded:: 0.10
Default: ``[]``
Don't cache responses with these HTTP codes.
.. setting:: HTTPCACHE_IGNORE_MISSING
HTTPCACHE_IGNORE_MISSING

View File

@ -2,8 +2,8 @@
Scrapy - a screen scraping framework written in Python
"""
version_info = (0, 9, 0, '')
__version__ = "0.9"
version_info = (0, 10, 0, 'dev')
__version__ = "0.10-dev"
import sys, os, warnings

View File

@ -123,6 +123,7 @@ HTTPCACHE_DIR = ''
HTTPCACHE_IGNORE_MISSING = False
HTTPCACHE_STORAGE = 'scrapy.contrib.downloadermiddleware.httpcache.FilesystemCacheStorage'
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_IGNORE_HTTP_CODES = []
ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'

View File

@ -22,6 +22,7 @@ class HttpCacheMiddleware(object):
def __init__(self, settings=conf.settings):
self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
self.ignore_http_codes = map(int, settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES'))
dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
@ -35,17 +36,20 @@ class HttpCacheMiddleware(object):
if not self.is_cacheable(request):
return
response = self.storage.retrieve_response(spider, request)
if response:
if response and self.is_cacheable_response(response):
response.flags.append('cached')
return response
elif self.ignore_missing:
raise IgnoreRequest("Ignored request not in cache: %s" % request)
def process_response(self, request, response, spider):
if self.is_cacheable(request):
if self.is_cacheable(request) and self.is_cacheable_response(response):
self.storage.store_response(spider, request, response)
return response
def is_cacheable_response(self, response):
return response.status not in self.ignore_http_codes
def is_cacheable(self, request):
return urlparse_cached(request).scheme in ['http', 'https']

View File

@ -24,6 +24,7 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
settings = {
'HTTPCACHE_DIR': self.tmpdir,
'HTTPCACHE_EXPIRATION_SECS': 1,
'HTTPCACHE_IGNORE_HTTP_CODES': [],
}
settings.update(new_settings)
return Settings(settings)
@ -76,6 +77,22 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
self.assertEqualResponse(self.response, response)
assert 'cached' in response.flags
def test_middleware_ignore_http_codes(self):
# test response is not cached
mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[202])
assert mw.process_request(self.request, self.spider) is None
mw.process_response(self.request, self.response, self.spider)
assert mw.storage.retrieve_response(self.spider, self.request) is None
assert mw.process_request(self.request, self.spider) is None
# test response is cached
mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[203])
mw.process_response(self.request, self.response, self.spider)
response = mw.process_request(self.request, self.spider)
assert isinstance(response, HtmlResponse)
self.assertEqualResponse(self.response, response)
assert 'cached' in response.flags
def assertEqualResponse(self, response1, response2):
self.assertEqual(response1.url, response2.url)
self.assertEqual(response1.status, response2.status)