1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-03-14 15:48:38 +00:00

Test and address ReDoS attack vectors for open_in_browser

This commit is contained in:
Adrián Chaves 2023-12-13 12:01:35 +01:00
parent 40b3efbbee
commit 1533b69032
2 changed files with 39 additions and 3 deletions

View File

@ -103,9 +103,9 @@ def open_in_browser(
body = response.body
if isinstance(response, HtmlResponse):
if b"<base" not in body:
repl = rf'\1<base href="{response.url}">'
body = re.sub(b"<!--.{,1024}?-->", b"", body, flags=re.DOTALL)
body = re.sub(rb"(<head(?:>|\s.{,1024}?>))", to_bytes(repl), body)
repl = rf'\0<base href="{response.url}">'
body = re.sub(b"(?s)<!--.*?(?:-->|$)", b"", body)
body = re.sub(rb"<head(?:[^<>]*?>)", to_bytes(repl), body, count=1)
ext = ".html"
elif isinstance(response, TextResponse):
ext = ".txt"

View File

@ -1,10 +1,12 @@
import unittest
import warnings
from pathlib import Path
from time import process_time
from urllib.parse import urlparse
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import HtmlResponse, Response, TextResponse
from scrapy.settings.default_settings import DOWNLOAD_MAXSIZE
from scrapy.utils.python import to_bytes
from scrapy.utils.response import (
get_base_url,
@ -198,3 +200,37 @@ class ResponseUtilsTest(unittest.TestCase):
assert open_in_browser(
r5, _openfunc=check_base_url
), "Inject unique base url with conditional comment"
def test_open_in_browser_redos_comment(self):
    # Regression guard: stripping HTML comments from a body close to
    # DOWNLOAD_MAXSIZE must finish within a fixed budget of process CPU
    # time (measured with time.process_time).
    #
    # Exploit input from
    # https://makenowjust-labs.github.io/recheck/playground/
    # for /<!--.*?-->/ (old pattern to remove comments).
    repetitions = int(DOWNLOAD_MAXSIZE / 7) - 10
    payload = b"-><!--\x00" * repetitions + b"->\n<!---->"
    html_response = HtmlResponse("https://example.com", body=payload)
    cpu_budget = 30

    started = process_time()
    open_in_browser(html_response, lambda url: True)
    elapsed = process_time() - started

    self.assertLess(elapsed, cpu_budget)
def test_open_in_browser_redos_head(self):
    # Regression guard: locating the head element in a body close to
    # DOWNLOAD_MAXSIZE must finish within a fixed budget of process CPU
    # time (measured with time.process_time).
    #
    # Exploit input from
    # https://makenowjust-labs.github.io/recheck/playground/
    # for /(<head(?:>|\s.*?>))/ (old pattern to find the head element).
    repetitions = int(DOWNLOAD_MAXSIZE / 6)
    payload = b"<head\t" * repetitions
    html_response = HtmlResponse("https://example.com", body=payload)
    cpu_budget = 15

    started = process_time()
    open_in_browser(html_response, lambda url: True)
    elapsed = process_time() - started

    self.assertLess(elapsed, cpu_budget)