Let spiders ignore bogus Cache-Control headers.

Sites often set "no-store", "no-cache", "must-revalidate", etc., but get upset at the traffic a spider can generate if it respects those directives. Allow the spider's author to selectively ignore Cache-Control directives that are known to be unimportant for the sites being crawled. We assume that the spider will not issue Cache-Control directives in requests unless it actually needs them, so directives in requests are not filtered.
2025-03-01 06:38:26 +00:00 · 2014-12-28 19:43:16 -08:00 · 2014-12-28 19:43:16 -08:00 · e23a381337
commit e23a381337
parent dd3a46295c
2 changed files with 8 additions and 2 deletions
--- a/scrapy/extensions/httpcache.py
+++ b/scrapy/extensions/httpcache.py
@ -7,7 +7,7 @@ from time import time
 from weakref import WeakKeyDictionary
 from email.utils import mktime_tz, parsedate_tz
 from w3lib.http import headers_raw_to_dict, headers_dict_to_raw
-from scrapy.http import Headers
+from scrapy.http import Headers, Response
 from scrapy.responsetypes import responsetypes
 from scrapy.utils.request import request_fingerprint
 from scrapy.utils.project import data_path
@ -39,12 +39,17 @@ class RFC2616Policy(object):

    def __init__(self, settings):
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
+        self.ignore_response_cache_controls = settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')
        self._cc_parsed = WeakKeyDictionary()

    def _parse_cachecontrol(self, r):
        if r not in self._cc_parsed:
            cch = r.headers.get('Cache-Control', '')
-            self._cc_parsed[r] = parse_cachecontrol(cch)
+            parsed = parse_cachecontrol(cch)
+            if isinstance(r, Response):
+                for key in self.ignore_response_cache_controls:
+                    parsed.pop(key, None)
+            self._cc_parsed[r] = parsed
        return self._cc_parsed[r]

    def should_cache_request(self, request):
--- a/scrapy/settings/default_settings.py
+++ b/scrapy/settings/default_settings.py
@ -155,6 +155,7 @@ HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
 HTTPCACHE_EXPIRATION_SECS = 0
 HTTPCACHE_IGNORE_HTTP_CODES = []
 HTTPCACHE_IGNORE_SCHEMES = ['file']
+HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS = []
 HTTPCACHE_DBM_MODULE = 'anydbm'
 HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy'
 HTTPCACHE_GZIP = False