commit 6883a99c1e
Automated merge with ssh://hg.scrapy.org/scrapy-0.9
@@ -276,6 +276,17 @@ Number of seconds to use for HTTP cache expiration. Requests that were cached
 before this time will be re-downloaded. If zero, cached requests will always
 expire. A negative number means requests will never expire.
+
+.. setting:: HTTPCACHE_IGNORE_HTTP_CODES
+
+HTTPCACHE_IGNORE_HTTP_CODES
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. versionadded:: 0.10
+
+Default: ``[]``
+
+Don't cache response with these HTTP codes.
 
 .. setting:: HTTPCACHE_IGNORE_MISSING
 
 HTTPCACHE_IGNORE_MISSING
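For context, here is how the documented setting combines with the existing cache options in a project's settings.py; a minimal sketch with illustrative values only (in this era of Scrapy the cache appears to be switched on by setting HTTPCACHE_DIR, which is an assumption worth verifying):

    # settings.py -- illustrative values, not part of this commit
    HTTPCACHE_DIR = '/tmp/scrapy-cache'       # hypothetical path; enables the cache
    HTTPCACHE_EXPIRATION_SECS = -1            # negative: cached requests never expire
    HTTPCACHE_IGNORE_HTTP_CODES = [500, 503]  # skip caching these server errors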
@@ -2,8 +2,8 @@
 Scrapy - a screen scraping framework written in Python
 """
 
-version_info = (0, 9, 0, '')
-__version__ = "0.9"
+version_info = (0, 10, 0, 'dev')
+__version__ = "0.10-dev"
 
 import sys, os, warnings
 
@@ -123,6 +123,7 @@ HTTPCACHE_DIR = ''
 HTTPCACHE_IGNORE_MISSING = False
 HTTPCACHE_STORAGE = 'scrapy.contrib.downloadermiddleware.httpcache.FilesystemCacheStorage'
 HTTPCACHE_EXPIRATION_SECS = 0
+HTTPCACHE_IGNORE_HTTP_CODES = []
 
 ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'
 
@@ -3,15 +3,27 @@ DefaultHeaders downloader middleware
 
 See documentation in docs/topics/downloader-middleware.rst
 """
 
 from scrapy.conf import settings
+from scrapy.xlib.pydispatch import dispatcher
+from scrapy.core import signals
 
 
 class DefaultHeadersMiddleware(object):
 
     def __init__(self):
-        self.default_headers = settings.get('DEFAULT_REQUEST_HEADERS')
+        self.global_default_headers = settings.get('DEFAULT_REQUEST_HEADERS')
+        self._default_headers = {}
+        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
+        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
 
     def process_request(self, request, spider):
-        for k, v in self.default_headers.iteritems():
+        for k, v in self._default_headers[spider].iteritems():
             if v:
                 request.headers.setdefault(k, v)
+
+    def spider_opened(self, spider):
+        self._default_headers[spider] = dict(self.global_default_headers,
+            **getattr(spider, 'default_headers', {}))
+
+    def spider_closed(self, spider):
+        self._default_headers.pop(spider)
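The merge in spider_opened above means a spider can extend or override the global DEFAULT_REQUEST_HEADERS by declaring a default_headers attribute; a minimal sketch, using the BaseSpider class imported in the tests below (spider name and header values are made up):

    from scrapy.spider import BaseSpider

    class ExampleSpider(BaseSpider):
        name = 'example'
        # merged over the global defaults when the spider opens
        default_headers = {'Accept-Language': ['es']}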
@@ -22,6 +22,7 @@ class HttpCacheMiddleware(object):
     def __init__(self, settings=conf.settings):
         self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
         self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
+        self.ignore_http_codes = map(int, settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES'))
         dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
         dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
 
@@ -35,17 +36,20 @@ class HttpCacheMiddleware(object):
         if not self.is_cacheable(request):
             return
         response = self.storage.retrieve_response(spider, request)
-        if response:
+        if response and self.is_cacheable_response(response):
             response.flags.append('cached')
             return response
         elif self.ignore_missing:
             raise IgnoreRequest("Ignored request not in cache: %s" % request)
 
     def process_response(self, request, response, spider):
-        if self.is_cacheable(request):
+        if self.is_cacheable(request) and self.is_cacheable_response(response):
             self.storage.store_response(spider, request, response)
         return response
 
+    def is_cacheable_response(self, response):
+        return response.status not in self.ignore_http_codes
+
     def is_cacheable(self, request):
         return urlparse_cached(request).scheme in ['http', 'https']
 
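With the is_cacheable_response gate applied on both paths, a response whose status appears in HTTPCACHE_IGNORE_HTTP_CODES is neither stored in the cache nor served from it. A standalone sketch of the gate's logic, with illustrative codes:

    # mirrors HttpCacheMiddleware.is_cacheable_response, outside the class
    ignore_http_codes = map(int, ['404', '500'])  # as parsed in __init__ above

    def is_cacheable_response(status):
        return status not in ignore_http_codes

    assert is_cacheable_response(200)
    assert not is_cacheable_response(404)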
@@ -2,7 +2,7 @@ from unittest import TestCase
 
 from scrapy.conf import settings
 from scrapy.contrib.downloadermiddleware.defaultheaders import DefaultHeadersMiddleware
-from scrapy.http import Response, Request
+from scrapy.http import Request
 from scrapy.spider import BaseSpider
 
 
@@ -16,15 +16,33 @@ class TestDefaultHeadersMiddleware(TestCase):
 
     def test_process_request(self):
         req = Request('http://www.scrapytest.org')
+        self.mw.spider_opened(self.spider)
         self.mw.process_request(req, self.spider)
+        self.mw.spider_closed(self.spider)
         self.assertEquals(req.headers, self.default_headers)
 
+    def test_spider_default_headers(self):
+        spider_headers = {'Unexistant-Header': ['value']}
+        # override one of the global default headers by spider
+        if self.default_headers:
+            k = set(self.default_headers).pop()
+            spider_headers[k] = ['__newvalue__']
+        self.spider.default_headers = spider_headers
+
+        req = Request('http://www.scrapytest.org')
+        self.mw.spider_opened(self.spider)
+        self.mw.process_request(req, self.spider)
+        self.mw.spider_closed(self.spider)
+        self.assertEquals(req.headers, dict(self.default_headers, **spider_headers))
+
     def test_update_headers(self):
         headers = {'Accept-Language': ['es'], 'Test-Header': ['test']}
         req = Request('http://www.scrapytest.org', headers=headers)
         self.assertEquals(req.headers, headers)
 
+        self.mw.spider_opened(self.spider)
         self.mw.process_request(req, self.spider)
+        self.mw.spider_closed(self.spider)
         self.default_headers.update(headers)
         self.assertEquals(req.headers, self.default_headers)
 
@@ -24,6 +24,7 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
         settings = {
             'HTTPCACHE_DIR': self.tmpdir,
             'HTTPCACHE_EXPIRATION_SECS': 1,
+            'HTTPCACHE_IGNORE_HTTP_CODES': [],
         }
         settings.update(new_settings)
         return Settings(settings)
@@ -76,6 +77,22 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
         self.assertEqualResponse(self.response, response)
         assert 'cached' in response.flags
 
+    def test_middleware_ignore_http_codes(self):
+        # test response is not cached
+        mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[202])
+        assert mw.process_request(self.request, self.spider) is None
+        mw.process_response(self.request, self.response, self.spider)
+        assert mw.storage.retrieve_response(self.spider, self.request) is None
+        assert mw.process_request(self.request, self.spider) is None
+
+        # test response is cached
+        mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[203])
+        mw.process_response(self.request, self.response, self.spider)
+        response = mw.process_request(self.request, self.spider)
+        assert isinstance(response, HtmlResponse)
+        self.assertEqualResponse(self.response, response)
+        assert 'cached' in response.flags
+
     def assertEqualResponse(self, response1, response2):
         self.assertEqual(response1.url, response2.url)
         self.assertEqual(response1.status, response2.status)