1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-06 10:24:24 +00:00

Add support for multiple-compressed responses (#6063)

This commit is contained in:
vishesh10 2024-02-22 16:46:24 +05:30 committed by GitHub
parent ebd7e199f0
commit e208f82076
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 87 additions and 5 deletions

View File

@ -1,6 +1,7 @@
from __future__ import annotations
import warnings
from itertools import chain
from logging import getLogger
from typing import TYPE_CHECKING, List, Optional, Union
@ -102,18 +103,18 @@ class HttpCompressionMiddleware:
if isinstance(response, Response):
content_encoding = response.headers.getlist("Content-Encoding")
if content_encoding:
encoding = content_encoding.pop()
max_size = request.meta.get("download_maxsize", self._max_size)
warn_size = request.meta.get("download_warnsize", self._warn_size)
try:
decoded_body = self._decode(
response.body, encoding.lower(), max_size
decoded_body, content_encoding = self._handle_encoding(
response.body, content_encoding, max_size
)
except _DecompressionMaxSizeExceeded:
raise IgnoreRequest(
f"Ignored response {response} because its body "
f"({len(response.body)} B) exceeded DOWNLOAD_MAXSIZE "
f"({max_size} B) during decompression."
f"({len(response.body)} B compressed) exceeded "
f"DOWNLOAD_MAXSIZE ({max_size} B) during "
f"decompression."
)
if len(response.body) < warn_size <= len(decoded_body):
logger.warning(
@ -121,6 +122,7 @@ class HttpCompressionMiddleware:
f"({len(decoded_body)} B) is larger than the "
f"download warning size ({warn_size} B)."
)
response.headers["Content-Encoding"] = content_encoding
if self.stats:
self.stats.inc_value(
"httpcompression/response_bytes",
@ -144,6 +146,28 @@ class HttpCompressionMiddleware:
return response
def _handle_encoding(self, body, content_encoding, max_size):
    """Decode *body* once per decodable entry of *content_encoding*.

    The header values are partitioned by ``self._split_encodings``; each
    encoding it reports as decodable is passed, in the order returned, to
    ``self._decode`` together with *max_size*.

    Returns a ``(decoded_body, remaining_encodings)`` tuple, where
    ``remaining_encodings`` is the list of encodings that were not decoded.
    """
    decodable, remaining = self._split_encodings(content_encoding)
    decoded = body
    for name in decodable:
        # Each step consumes the output of the previous one.
        decoded = self._decode(decoded, name, max_size)
    return decoded, remaining
def _split_encodings(self, content_encoding):
    """Partition Content-Encoding values into ``(to_decode, to_keep)``.

    Every item of *content_encoding* is split on ``b","`` and each token
    is stripped and lower-cased. Scanning from the last token backwards,
    tokens are collected into ``to_decode`` for as long as they are found
    in ``ACCEPTED_ENCODINGS``; ``to_decode`` is therefore ordered from the
    last listed encoding to the first. The scan stops at the first token
    that is not accepted, and that token plus everything before it is
    returned, in original order, as ``to_keep``.
    """
    tokens = [
        token.strip().lower()
        for header_value in content_encoding
        for token in header_value.split(b",")
    ]
    # Index of the first token belonging to the trailing run of accepted
    # encodings; everything from here to the end gets decoded.
    boundary = len(tokens)
    while boundary and tokens[boundary - 1] in ACCEPTED_ENCODINGS:
        boundary -= 1
    to_decode = tokens[boundary:]
    to_decode.reverse()
    return to_decode, tokens[:boundary]
def _decode(self, body: bytes, encoding: bytes, max_size: int) -> bytes:
if encoding == b"gzip" or encoding == b"x-gzip":
return gunzip(body, max_size=max_size)

Binary file not shown.

View File

@ -27,6 +27,8 @@ FORMAT = {
"x-gzip": ("html-gzip.bin", "gzip"),
"rawdeflate": ("html-rawdeflate.bin", "deflate"),
"zlibdeflate": ("html-zlibdeflate.bin", "deflate"),
"gzip-deflate": ("html-gzip-deflate.bin", "gzip, deflate"),
"gzip-deflate-gzip": ("html-gzip-deflate-gzip.bin", "gzip, deflate, gzip"),
"br": ("html-br.bin", "br"),
# $ zstd raw.html --content-size -o html-zstd-static-content-size.bin
"zstd-static-content-size": ("html-zstd-static-content-size.bin", "zstd"),
@ -205,6 +207,62 @@ class HttpCompressionTest(TestCase):
assert newresponse is not response
self.assertEqual(newresponse.headers.getlist("Content-Encoding"), [b"uuencode"])
def test_multi_compression_single_header(self):
    """The "gzip-deflate" fixture is fully decoded by the middleware.

    Expects a new response object with no Content-Encoding header and a
    body that starts with ``<!DOCTYPE``.
    """
    original = self._getresponse("gzip-deflate")
    processed = self.mw.process_response(original.request, original, self.spider)
    assert processed is not original
    assert "Content-Encoding" not in processed.headers
    assert processed.body.startswith(b"<!DOCTYPE")
def test_multi_compression_single_header_invalid_compression(self):
    """A single ``gzip, foo, deflate`` header keeps ``gzip`` and ``foo``.

    Expects a new response whose Content-Encoding values are
    ``[b"gzip", b"foo"]`` after processing.
    """
    original = self._getresponse("gzip-deflate")
    original.headers["Content-Encoding"] = [b"gzip, foo, deflate"]
    processed = self.mw.process_response(original.request, original, self.spider)
    assert processed is not original
    remaining = processed.headers.getlist("Content-Encoding")
    self.assertEqual(remaining, [b"gzip", b"foo"])
def test_multi_compression_multiple_header(self):
    """Encodings given as separate header values are all decoded.

    Content-Encoding is set to the two values ``gzip`` and ``deflate``;
    expects a new response with no Content-Encoding header and a body
    that starts with ``<!DOCTYPE``.
    """
    original = self._getresponse("gzip-deflate")
    original.headers["Content-Encoding"] = ["gzip", "deflate"]
    processed = self.mw.process_response(original.request, original, self.spider)
    assert processed is not original
    assert "Content-Encoding" not in processed.headers
    assert processed.body.startswith(b"<!DOCTYPE")
def test_multi_compression_multiple_header_invalid_compression(self):
    """Separate ``gzip``, ``foo``, ``deflate`` values keep ``gzip`` and ``foo``.

    Expects a new response whose Content-Encoding values are
    ``[b"gzip", b"foo"]`` after processing.
    """
    original = self._getresponse("gzip-deflate")
    original.headers["Content-Encoding"] = ["gzip", "foo", "deflate"]
    processed = self.mw.process_response(original.request, original, self.spider)
    assert processed is not original
    remaining = processed.headers.getlist("Content-Encoding")
    self.assertEqual(remaining, [b"gzip", b"foo"])
def test_multi_compression_single_and_multiple_header(self):
    """Mixed header layout (``gzip`` plus ``deflate, gzip``) is fully decoded.

    Uses the "gzip-deflate-gzip" fixture; expects a new response with no
    Content-Encoding header and a body that starts with ``<!DOCTYPE``.
    """
    original = self._getresponse("gzip-deflate-gzip")
    original.headers["Content-Encoding"] = ["gzip", "deflate, gzip"]
    processed = self.mw.process_response(original.request, original, self.spider)
    assert processed is not original
    assert "Content-Encoding" not in processed.headers
    assert processed.body.startswith(b"<!DOCTYPE")
def test_multi_compression_single_and_multiple_header_invalid_compression(self):
    """Mixed header layout (``gzip`` plus ``foo,deflate``) keeps ``gzip`` and ``foo``.

    Expects a new response whose Content-Encoding values are
    ``[b"gzip", b"foo"]`` after processing.
    """
    original = self._getresponse("gzip-deflate")
    original.headers["Content-Encoding"] = ["gzip", "foo,deflate"]
    processed = self.mw.process_response(original.request, original, self.spider)
    assert processed is not original
    remaining = processed.headers.getlist("Content-Encoding")
    self.assertEqual(remaining, [b"gzip", b"foo"])
def test_process_response_encoding_inside_body(self):
headers = {
"Content-Type": "text/html",