Test and address ReDoS attack vectors for open_in_browser

2025-03-14 15:48:38 +00:00 · 2023-12-13 12:01:35 +01:00 · 2023-12-13 12:01:35 +01:00 · 1533b69032
commit 1533b69032
parent 40b3efbbee
2 changed files with 39 additions and 3 deletions
--- a/scrapy/utils/response.py
+++ b/scrapy/utils/response.py
@ -103,9 +103,9 @@ def open_in_browser(
    body = response.body
    if isinstance(response, HtmlResponse):
        if b"<base" not in body:
-            repl = rf'\1<base href="{response.url}">'
-            body = re.sub(b"<!--.{,1024}?-->", b"", body, flags=re.DOTALL)
-            body = re.sub(rb"(<head(?:>|\s.{,1024}?>))", to_bytes(repl), body)
+            repl = rf'\g<0><base href="{response.url}">'  # \0 would insert a NUL byte
+            body = re.sub(b"(?s)<!--.*?(?:-->|$)", b"", body)
+            body = re.sub(rb"<head(?:[^<>]*?>)", to_bytes(repl), body, count=1)
        ext = ".html"
    elif isinstance(response, TextResponse):
        ext = ".txt"
--- a/tests/test_utils_response.py
+++ b/tests/test_utils_response.py
@ -1,10 +1,12 @@
 import unittest
 import warnings
 from pathlib import Path
+from time import process_time
 from urllib.parse import urlparse

 from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.http import HtmlResponse, Response, TextResponse
+from scrapy.settings.default_settings import DOWNLOAD_MAXSIZE
 from scrapy.utils.python import to_bytes
 from scrapy.utils.response import (
    get_base_url,
@ -198,3 +200,37 @@ class ResponseUtilsTest(unittest.TestCase):
        assert open_in_browser(
            r5, _openfunc=check_base_url
        ), "Inject unique base url with conditional comment"
+
+    def test_open_in_browser_redos_comment(self):
+        # Stripping comments from a hostile body must stay within the CPU budget.
+        cpu_budget = 30
+        # Exploit input from
+        # https://makenowjust-labs.github.io/recheck/playground/
+        # for /<!--.*?-->/ (old pattern to remove comments).
+        repeats = int(DOWNLOAD_MAXSIZE / 7) - 10
+        payload = b"-><!--\x00" * repeats + b"->\n<!---->"
+        response = HtmlResponse("https://example.com", body=payload)
+
+        # process_time() reports CPU time of this process, not wall-clock time.
+        started = process_time()
+        open_in_browser(response, lambda url: True)
+        elapsed = process_time() - started
+
+        self.assertLess(elapsed, cpu_budget)
+
+    def test_open_in_browser_redos_head(self):
+        # Locating the head element in a hostile body must stay within budget.
+        cpu_budget = 15
+
+        # Exploit input from
+        # https://makenowjust-labs.github.io/recheck/playground/
+        # for /(<head(?:>|\s.*?>))/ (old pattern to find the head element).
+        payload = b"<head\t" * int(DOWNLOAD_MAXSIZE / 6)
+        response = HtmlResponse("https://example.com", body=payload)
+
+        # process_time() reports CPU time of this process, not wall-clock time.
+        started = process_time()
+        open_in_browser(response, lambda url: True)
+        elapsed = process_time() - started
+
+        self.assertLess(elapsed, cpu_budget)