From e23a38133726b716f5931e59e163cfe70169d17c Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Sun, 28 Dec 2014 19:43:16 -0800 Subject: [PATCH] Let spiders ignore bogus Cache-Control headers. Sites often set "no-store", "no-cache", "must-revalidate", etc., but get upset at the traffic a spider can generate if it respects those directives. Allow the spider's author to selectively ignore Cache-Control directives that are known to be unimportant for the sites being crawled. We assume that the spider will not issue Cache-Control directives in requests unless it actually needs them, so directives in requests are not filtered. --- scrapy/extensions/httpcache.py | 9 +++++++-- scrapy/settings/default_settings.py | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/scrapy/extensions/httpcache.py b/scrapy/extensions/httpcache.py index 665ad3439..c0efb8996 100644 --- a/scrapy/extensions/httpcache.py +++ b/scrapy/extensions/httpcache.py @@ -7,7 +7,7 @@ from time import time from weakref import WeakKeyDictionary from email.utils import mktime_tz, parsedate_tz from w3lib.http import headers_raw_to_dict, headers_dict_to_raw -from scrapy.http import Headers +from scrapy.http import Headers, Response from scrapy.responsetypes import responsetypes from scrapy.utils.request import request_fingerprint from scrapy.utils.project import data_path @@ -39,12 +39,17 @@ class RFC2616Policy(object): def __init__(self, settings): self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES') + self.ignore_response_cache_controls = settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS') self._cc_parsed = WeakKeyDictionary() def _parse_cachecontrol(self, r): if r not in self._cc_parsed: cch = r.headers.get('Cache-Control', '') - self._cc_parsed[r] = parse_cachecontrol(cch) + parsed = parse_cachecontrol(cch) + if isinstance(r, Response): + for key in self.ignore_response_cache_controls: + parsed.pop(key, None) + self._cc_parsed[r] = parsed return self._cc_parsed[r] def should_cache_request(self, request): diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 9debaabc3..bd1bb0936 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -155,6 +155,7 @@ HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' HTTPCACHE_EXPIRATION_SECS = 0 HTTPCACHE_IGNORE_HTTP_CODES = [] HTTPCACHE_IGNORE_SCHEMES = ['file'] +HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS = [] HTTPCACHE_DBM_MODULE = 'anydbm' HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy' HTTPCACHE_GZIP = False