mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-25 03:04:06 +00:00
Automated merge with http://hg.scrapy.org/scrapy-0.9
This commit is contained in:
commit
90a04f0530
@ -276,6 +276,17 @@ Number of seconds to use for HTTP cache expiration. Requests that were cached
|
|||||||
before this time will be re-downloaded. If zero, cached requests will always
|
before this time will be re-downloaded. If zero, cached requests will always
|
||||||
expire. A negative number means requests will never expire.
|
expire. A negative number means requests will never expire.
|
||||||
|
|
||||||
|
.. setting:: HTTPCACHE_IGNORE_HTTP_CODES
|
||||||
|
|
||||||
|
HTTPCACHE_IGNORE_HTTP_CODES
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
.. versionadded:: 0.10
|
||||||
|
|
||||||
|
Default: ``[]``
|
||||||
|
|
||||||
|
Don't cache responses with these HTTP codes.
|
||||||
|
|
||||||
.. setting:: HTTPCACHE_IGNORE_MISSING
|
.. setting:: HTTPCACHE_IGNORE_MISSING
|
||||||
|
|
||||||
HTTPCACHE_IGNORE_MISSING
|
HTTPCACHE_IGNORE_MISSING
|
||||||
|
@ -2,8 +2,8 @@
|
|||||||
Scrapy - a screen scraping framework written in Python
|
Scrapy - a screen scraping framework written in Python
|
||||||
"""
|
"""
|
||||||
|
|
||||||
version_info = (0, 9, 0, '')
|
version_info = (0, 10, 0, 'dev')
|
||||||
__version__ = "0.9"
|
__version__ = "0.10-dev"
|
||||||
|
|
||||||
import sys, os, warnings
|
import sys, os, warnings
|
||||||
|
|
||||||
|
@ -123,6 +123,7 @@ HTTPCACHE_DIR = ''
|
|||||||
HTTPCACHE_IGNORE_MISSING = False
|
HTTPCACHE_IGNORE_MISSING = False
|
||||||
HTTPCACHE_STORAGE = 'scrapy.contrib.downloadermiddleware.httpcache.FilesystemCacheStorage'
|
HTTPCACHE_STORAGE = 'scrapy.contrib.downloadermiddleware.httpcache.FilesystemCacheStorage'
|
||||||
HTTPCACHE_EXPIRATION_SECS = 0
|
HTTPCACHE_EXPIRATION_SECS = 0
|
||||||
|
HTTPCACHE_IGNORE_HTTP_CODES = []
|
||||||
|
|
||||||
ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'
|
ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'
|
||||||
|
|
||||||
|
@ -22,6 +22,7 @@ class HttpCacheMiddleware(object):
|
|||||||
def __init__(self, settings=conf.settings):
|
def __init__(self, settings=conf.settings):
|
||||||
self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
|
self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
|
||||||
self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
|
self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
|
||||||
|
self.ignore_http_codes = map(int, settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES'))
|
||||||
dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
|
dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
|
||||||
dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
|
dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
|
||||||
|
|
||||||
@ -35,17 +36,20 @@ class HttpCacheMiddleware(object):
|
|||||||
if not self.is_cacheable(request):
|
if not self.is_cacheable(request):
|
||||||
return
|
return
|
||||||
response = self.storage.retrieve_response(spider, request)
|
response = self.storage.retrieve_response(spider, request)
|
||||||
if response:
|
if response and self.is_cacheable_response(response):
|
||||||
response.flags.append('cached')
|
response.flags.append('cached')
|
||||||
return response
|
return response
|
||||||
elif self.ignore_missing:
|
elif self.ignore_missing:
|
||||||
raise IgnoreRequest("Ignored request not in cache: %s" % request)
|
raise IgnoreRequest("Ignored request not in cache: %s" % request)
|
||||||
|
|
||||||
def process_response(self, request, response, spider):
|
def process_response(self, request, response, spider):
|
||||||
if self.is_cacheable(request):
|
if self.is_cacheable(request) and self.is_cacheable_response(response):
|
||||||
self.storage.store_response(spider, request, response)
|
self.storage.store_response(spider, request, response)
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
def is_cacheable_response(self, response):
|
||||||
|
return response.status not in self.ignore_http_codes
|
||||||
|
|
||||||
def is_cacheable(self, request):
|
def is_cacheable(self, request):
|
||||||
return urlparse_cached(request).scheme in ['http', 'https']
|
return urlparse_cached(request).scheme in ['http', 'https']
|
||||||
|
|
||||||
|
@ -24,6 +24,7 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
|
|||||||
settings = {
|
settings = {
|
||||||
'HTTPCACHE_DIR': self.tmpdir,
|
'HTTPCACHE_DIR': self.tmpdir,
|
||||||
'HTTPCACHE_EXPIRATION_SECS': 1,
|
'HTTPCACHE_EXPIRATION_SECS': 1,
|
||||||
|
'HTTPCACHE_IGNORE_HTTP_CODES': [],
|
||||||
}
|
}
|
||||||
settings.update(new_settings)
|
settings.update(new_settings)
|
||||||
return Settings(settings)
|
return Settings(settings)
|
||||||
@ -76,6 +77,22 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
|
|||||||
self.assertEqualResponse(self.response, response)
|
self.assertEqualResponse(self.response, response)
|
||||||
assert 'cached' in response.flags
|
assert 'cached' in response.flags
|
||||||
|
|
||||||
|
def test_middleware_ignore_http_codes(self):
|
||||||
|
# test response is not cached
|
||||||
|
mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[202])
|
||||||
|
assert mw.process_request(self.request, self.spider) is None
|
||||||
|
mw.process_response(self.request, self.response, self.spider)
|
||||||
|
assert mw.storage.retrieve_response(self.spider, self.request) is None
|
||||||
|
assert mw.process_request(self.request, self.spider) is None
|
||||||
|
|
||||||
|
# test response is cached
|
||||||
|
mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[203])
|
||||||
|
mw.process_response(self.request, self.response, self.spider)
|
||||||
|
response = mw.process_request(self.request, self.spider)
|
||||||
|
assert isinstance(response, HtmlResponse)
|
||||||
|
self.assertEqualResponse(self.response, response)
|
||||||
|
assert 'cached' in response.flags
|
||||||
|
|
||||||
def assertEqualResponse(self, response1, response2):
|
def assertEqualResponse(self, response1, response2):
|
||||||
self.assertEqual(response1.url, response2.url)
|
self.assertEqual(response1.url, response2.url)
|
||||||
self.assertEqual(response1.status, response2.status)
|
self.assertEqual(response1.status, response2.status)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user