mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-06 10:24:24 +00:00
Add support for multiple-compressed responses (#6063)
This commit is contained in:
parent
ebd7e199f0
commit
e208f82076
@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import warnings
|
||||
from itertools import chain
|
||||
from logging import getLogger
|
||||
from typing import TYPE_CHECKING, List, Optional, Union
|
||||
|
||||
@ -102,18 +103,18 @@ class HttpCompressionMiddleware:
|
||||
if isinstance(response, Response):
|
||||
content_encoding = response.headers.getlist("Content-Encoding")
|
||||
if content_encoding:
|
||||
encoding = content_encoding.pop()
|
||||
max_size = request.meta.get("download_maxsize", self._max_size)
|
||||
warn_size = request.meta.get("download_warnsize", self._warn_size)
|
||||
try:
|
||||
decoded_body = self._decode(
|
||||
response.body, encoding.lower(), max_size
|
||||
decoded_body, content_encoding = self._handle_encoding(
|
||||
response.body, content_encoding, max_size
|
||||
)
|
||||
except _DecompressionMaxSizeExceeded:
|
||||
raise IgnoreRequest(
|
||||
f"Ignored response {response} because its body "
|
||||
f"({len(response.body)} B) exceeded DOWNLOAD_MAXSIZE "
|
||||
f"({max_size} B) during decompression."
|
||||
f"({len(response.body)} B compressed) exceeded "
|
||||
f"DOWNLOAD_MAXSIZE ({max_size} B) during "
|
||||
f"decompression."
|
||||
)
|
||||
if len(response.body) < warn_size <= len(decoded_body):
|
||||
logger.warning(
|
||||
@ -121,6 +122,7 @@ class HttpCompressionMiddleware:
|
||||
f"({len(decoded_body)} B) is larger than the "
|
||||
f"download warning size ({warn_size} B)."
|
||||
)
|
||||
response.headers["Content-Encoding"] = content_encoding
|
||||
if self.stats:
|
||||
self.stats.inc_value(
|
||||
"httpcompression/response_bytes",
|
||||
@ -144,6 +146,28 @@ class HttpCompressionMiddleware:
|
||||
|
||||
return response
|
||||
|
||||
def _handle_encoding(self, body, content_encoding, max_size):
    """Decode *body* for every encoding that ``_split_encodings`` selects.

    :param body: the response body as received.
    :param content_encoding: the ``Content-Encoding`` header values of the
        response.
    :param max_size: size limit forwarded unchanged to each ``_decode``
        call.
    :returns: a ``(decoded_body, remaining_encodings)`` tuple, where
        *remaining_encodings* is the list of encodings that
        ``_split_encodings`` reported as not decodable.

    Any exception raised by ``_decode`` propagates to the caller.
    """
    decodable, remaining = self._split_encodings(content_encoding)
    decoded = body
    # ``decodable`` is already in the order in which the layers must be
    # removed, so each pass feeds the previous result back in.
    for name in decodable:
        decoded = self._decode(decoded, name, max_size)
    return decoded, remaining
|
||||
|
||||
def _split_encodings(
    self, content_encoding: List[bytes]
) -> tuple[List[bytes], List[bytes]]:
    """Split ``Content-Encoding`` values into decodable and kept encodings.

    Every header value may itself be a comma-separated list, so all values
    are flattened into one sequence of stripped, lower-cased encoding
    names. Empty list elements (as produced by ``b"gzip,"`` or
    ``b"gzip,,deflate"``) are discarded, as HTTP list syntax requires
    recipients to ignore them.

    Encodings are listed in the order in which they were applied, so they
    must be undone from the end. Names are therefore consumed from the
    right until the first one that is not in ``ACCEPTED_ENCODINGS``.

    :param content_encoding: the ``Content-Encoding`` header values.
    :returns: a ``(to_decode, to_keep)`` tuple. *to_decode* holds the
        trailing supported encodings in decoding order (last applied
        first); *to_keep* holds the remaining encodings in their original
        order.
    """
    to_keep = [
        name
        for name in (
            element.strip().lower()
            for value in content_encoding
            for element in value.split(b",")
        )
        if name  # ignore empty list elements
    ]
    to_decode: List[bytes] = []
    # Peel supported encodings off the end; stop at the first unsupported
    # one, because nothing applied before it can be undone.
    while to_keep and to_keep[-1] in ACCEPTED_ENCODINGS:
        to_decode.append(to_keep.pop())
    return to_decode, to_keep
|
||||
|
||||
def _decode(self, body: bytes, encoding: bytes, max_size: int) -> bytes:
|
||||
if encoding == b"gzip" or encoding == b"x-gzip":
|
||||
return gunzip(body, max_size=max_size)
|
||||
|
BIN
tests/sample_data/compressed/html-gzip-deflate-gzip.bin
Normal file
BIN
tests/sample_data/compressed/html-gzip-deflate-gzip.bin
Normal file
Binary file not shown.
BIN
tests/sample_data/compressed/html-gzip-deflate.bin
Normal file
BIN
tests/sample_data/compressed/html-gzip-deflate.bin
Normal file
Binary file not shown.
@ -27,6 +27,8 @@ FORMAT = {
|
||||
"x-gzip": ("html-gzip.bin", "gzip"),
|
||||
"rawdeflate": ("html-rawdeflate.bin", "deflate"),
|
||||
"zlibdeflate": ("html-zlibdeflate.bin", "deflate"),
|
||||
"gzip-deflate": ("html-gzip-deflate.bin", "gzip, deflate"),
|
||||
"gzip-deflate-gzip": ("html-gzip-deflate-gzip.bin", "gzip, deflate, gzip"),
|
||||
"br": ("html-br.bin", "br"),
|
||||
# $ zstd raw.html --content-size -o html-zstd-static-content-size.bin
|
||||
"zstd-static-content-size": ("html-zstd-static-content-size.bin", "zstd"),
|
||||
@ -205,6 +207,62 @@ class HttpCompressionTest(TestCase):
|
||||
assert newresponse is not response
|
||||
self.assertEqual(newresponse.headers.getlist("Content-Encoding"), [b"uuencode"])
|
||||
|
||||
def test_multi_compression_single_header(self):
    """Two stacked encodings in the ``gzip-deflate`` sample are both
    removed, leaving no ``Content-Encoding`` header behind."""
    compressed = self._getresponse("gzip-deflate")
    decoded = self.mw.process_response(
        compressed.request, compressed, self.spider
    )
    assert decoded is not compressed
    assert "Content-Encoding" not in decoded.headers
    assert decoded.body.startswith(b"<!DOCTYPE")
|
||||
|
||||
def test_multi_compression_single_header_invalid_compression(self):
    """With an unknown encoding in the middle of a single header value,
    only the encoding after it is removed from the header."""
    compressed = self._getresponse("gzip-deflate")
    compressed.headers["Content-Encoding"] = [b"gzip, foo, deflate"]
    decoded = self.mw.process_response(
        compressed.request, compressed, self.spider
    )
    assert decoded is not compressed
    remaining = decoded.headers.getlist("Content-Encoding")
    self.assertEqual(remaining, [b"gzip", b"foo"])
|
||||
|
||||
def test_multi_compression_multiple_header(self):
    """Encodings spread over separate header values are all removed."""
    compressed = self._getresponse("gzip-deflate")
    compressed.headers["Content-Encoding"] = ["gzip", "deflate"]
    decoded = self.mw.process_response(
        compressed.request, compressed, self.spider
    )
    assert decoded is not compressed
    assert "Content-Encoding" not in decoded.headers
    assert decoded.body.startswith(b"<!DOCTYPE")
|
||||
|
||||
def test_multi_compression_multiple_header_invalid_compression(self):
    """With an unknown encoding among separate header values, only the
    encoding after it is removed from the header."""
    compressed = self._getresponse("gzip-deflate")
    compressed.headers["Content-Encoding"] = ["gzip", "foo", "deflate"]
    decoded = self.mw.process_response(
        compressed.request, compressed, self.spider
    )
    assert decoded is not compressed
    remaining = decoded.headers.getlist("Content-Encoding")
    self.assertEqual(remaining, [b"gzip", b"foo"])
|
||||
|
||||
def test_multi_compression_single_and_multiple_header(self):
    """A mix of separate header values and a comma-separated value is
    fully decoded for the ``gzip-deflate-gzip`` sample."""
    compressed = self._getresponse("gzip-deflate-gzip")
    compressed.headers["Content-Encoding"] = ["gzip", "deflate, gzip"]
    decoded = self.mw.process_response(
        compressed.request, compressed, self.spider
    )
    assert decoded is not compressed
    assert "Content-Encoding" not in decoded.headers
    assert decoded.body.startswith(b"<!DOCTYPE")
|
||||
|
||||
def test_multi_compression_single_and_multiple_header_invalid_compression(self):
    """With an unknown encoding inside a comma-separated value that
    follows a separate header value, only the encoding after it is
    removed from the header."""
    compressed = self._getresponse("gzip-deflate")
    compressed.headers["Content-Encoding"] = ["gzip", "foo,deflate"]
    decoded = self.mw.process_response(
        compressed.request, compressed, self.spider
    )
    assert decoded is not compressed
    remaining = decoded.headers.getlist("Content-Encoding")
    self.assertEqual(remaining, [b"gzip", b"foo"])
|
||||
|
||||
def test_process_response_encoding_inside_body(self):
|
||||
headers = {
|
||||
"Content-Type": "text/html",
|
||||
|
Loading…
x
Reference in New Issue
Block a user