1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-03-01 06:38:26 +00:00

Let spiders ignore bogus Cache-Control headers.

Sites often set "no-store", "no-cache", "must-revalidate", etc., but get
upset at the traffic a spider can generate if it respects those
directives.

Allow the spider's author to selectively ignore Cache-Control directives
that are known to be unimportant for the sites being crawled.

We assume that the spider will not issue Cache-Control directives in
requests unless it actually needs them, so directives in requests are
not filtered.
This commit is contained in:
Jamey Sharp 2014-12-28 19:43:16 -08:00 committed by Marven Sanchez
parent dd3a46295c
commit e23a381337
2 changed files with 8 additions and 2 deletions

View File

@ -7,7 +7,7 @@ from time import time
from weakref import WeakKeyDictionary
from email.utils import mktime_tz, parsedate_tz
from w3lib.http import headers_raw_to_dict, headers_dict_to_raw
from scrapy.http import Headers
from scrapy.http import Headers, Response
from scrapy.responsetypes import responsetypes
from scrapy.utils.request import request_fingerprint
from scrapy.utils.project import data_path
@ -39,12 +39,17 @@ class RFC2616Policy(object):
def __init__(self, settings):
self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
self.ignore_response_cache_controls = settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')
self._cc_parsed = WeakKeyDictionary()
def _parse_cachecontrol(self, r):
if r not in self._cc_parsed:
cch = r.headers.get('Cache-Control', '')
self._cc_parsed[r] = parse_cachecontrol(cch)
parsed = parse_cachecontrol(cch)
if isinstance(r, Response):
for key in self.ignore_response_cache_controls:
parsed.pop(key, None)
self._cc_parsed[r] = parsed
return self._cc_parsed[r]
def should_cache_request(self, request):

View File

@ -155,6 +155,7 @@ HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_IGNORE_SCHEMES = ['file']
HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS = []
HTTPCACHE_DBM_MODULE = 'anydbm'
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy'
HTTPCACHE_GZIP = False