Mirror of https://github.com/scrapy/scrapy.git
Let spiders ignore bogus Cache-Control headers.
Sites often set "no-store", "no-cache", "must-revalidate", etc., but get upset at the traffic a spider can generate if it respects those directives. Allow the spider's author to selectively ignore Cache-Control directives that are known to be unimportant for the sites being crawled. We assume that the spider will not issue Cache-Control directives in requests unless it actually needs them, so directives in requests are not filtered.
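For example, a crawl that needs to discard freshness-defeating directives might configure the new setting like this (a sketch; the directive list is illustrative, not a recommendation):

# settings.py -- example configuration for the HTTP cache
HTTPCACHE_ENABLED = True
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'
# Directives listed here are dropped from *response* Cache-Control
# headers before the policy evaluates them. Directives in requests
# are never filtered, per the assumption above.
HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS = ['no-cache', 'no-store']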
commit e23a381337
parent dd3a46295c
@@ -7,7 +7,7 @@ from time import time
 from weakref import WeakKeyDictionary
 from email.utils import mktime_tz, parsedate_tz
 from w3lib.http import headers_raw_to_dict, headers_dict_to_raw
-from scrapy.http import Headers
+from scrapy.http import Headers, Response
 from scrapy.responsetypes import responsetypes
 from scrapy.utils.request import request_fingerprint
 from scrapy.utils.project import data_path
@@ -39,12 +39,17 @@ class RFC2616Policy(object):
 
     def __init__(self, settings):
         self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
+        self.ignore_response_cache_controls = settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')
         self._cc_parsed = WeakKeyDictionary()
 
     def _parse_cachecontrol(self, r):
         if r not in self._cc_parsed:
             cch = r.headers.get('Cache-Control', '')
-            self._cc_parsed[r] = parse_cachecontrol(cch)
+            parsed = parse_cachecontrol(cch)
+            if isinstance(r, Response):
+                for key in self.ignore_response_cache_controls:
+                    parsed.pop(key, None)
+            self._cc_parsed[r] = parsed
         return self._cc_parsed[r]
 
     def should_cache_request(self, request):
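For reference, the filtering step in isolation; the tiny header parser below is only an assumption standing in for the module's parse_cachecontrol helper, which likewise maps each directive to its value:

def _filter_cachecontrol(header, ignored):
    # Parse 'k1, k2=v2, ...' into {directive: value-or-None}, then
    # drop any directive the spider has been told to ignore.
    parsed = {}
    for directive in header.split(','):
        key, _, value = directive.strip().partition('=')
        parsed[key] = value or None
    for key in ignored:
        parsed.pop(key, None)
    return parsed

_filter_cachecontrol('no-cache, max-age=3600', ['no-cache'])
# -> {'max-age': '3600'}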
@@ -155,6 +155,7 @@ HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
 HTTPCACHE_EXPIRATION_SECS = 0
 HTTPCACHE_IGNORE_HTTP_CODES = []
 HTTPCACHE_IGNORE_SCHEMES = ['file']
+HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS = []
 HTTPCACHE_DBM_MODULE = 'anydbm'
 HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy'
 HTTPCACHE_GZIP = False