1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-03-14 16:58:20 +00:00
scrapy/tests/test_utils_response.py
2023-12-15 11:42:55 +01:00

241 lines
7.8 KiB
Python

import unittest
import warnings
from pathlib import Path
from time import process_time
from urllib.parse import urlparse
import pytest
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import HtmlResponse, Response, TextResponse
from scrapy.settings.default_settings import DOWNLOAD_MAXSIZE
from scrapy.utils.python import to_bytes
from scrapy.utils.response import (
get_base_url,
get_meta_refresh,
open_in_browser,
response_httprepr,
response_status_message,
)
# Dotted module paths whose doctests belong with this test module.
# NOTE(review): presumably read by the project's doctest collection tooling;
# nothing in this file consumes it -- confirm against the test runner setup.
__doctests__ = ["scrapy.utils.response"]
class ResponseUtilsTest(unittest.TestCase):
    """Tests for the helper functions imported from ``scrapy.utils.response``."""

    dummy_response = TextResponse(url="http://example.org/", body=b"dummy_response")

    @staticmethod
    def _read_opened_file(burl):
        """Return the bytes of the local file that *burl* refers to.

        *burl* is the URL handed to the ``_openfunc`` callback of
        ``open_in_browser``. The URL's path component is tried first; when it
        is empty or does not exist on disk, the ``file://`` prefix is simply
        stripped from the raw URL instead.
        """
        path = urlparse(burl).path
        if not path or not Path(path).exists():
            path = burl.replace("file://", "")
        return Path(path).read_bytes()

    def test_response_httprepr(self):
        """``response_httprepr`` renders status line, headers and body."""
        # ScrapyDeprecationWarning is silenced for the whole test.
        # NOTE(review): presumably because response_httprepr is deprecated.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", ScrapyDeprecationWarning)

            # Default response: status 200, no headers, empty body.
            r1 = Response("http://www.example.com")
            self.assertEqual(response_httprepr(r1), b"HTTP/1.1 200 OK\r\n\r\n")

            # Known status code with a header and a body.
            r1 = Response(
                "http://www.example.com",
                status=404,
                headers={"Content-type": "text/html"},
                body=b"Some body",
            )
            self.assertEqual(
                response_httprepr(r1),
                b"HTTP/1.1 404 Not Found\r\nContent-Type: text/html\r\n\r\nSome body",
            )

            # Unknown status code: the reason phrase is expected to be empty.
            r1 = Response(
                "http://www.example.com",
                status=6666,
                headers={"Content-type": "text/html"},
                body=b"Some body",
            )
            self.assertEqual(
                response_httprepr(r1),
                b"HTTP/1.1 6666 \r\nContent-Type: text/html\r\n\r\nSome body",
            )

    def test_open_in_browser(self):
        """``open_in_browser`` writes a file containing a ``<base>`` tag."""
        url = "http:///www.example.com/some/page.html"
        body = b"<html> <head> <title>test page</title> </head> <body>test body</body> </html>"

        def browser_open(burl):
            # Stand-in for the real browser: inspect the file that was written.
            bbody = self._read_opened_file(burl)
            self.assertIn(b'<base href="' + to_bytes(url) + b'">', bbody)
            return True

        response = HtmlResponse(url, body=body)
        self.assertTrue(
            open_in_browser(response, _openfunc=browser_open), "Browser not called"
        )

        # A plain Response is expected to be rejected with TypeError. Only the
        # supported ``_openfunc`` keyword is passed so that the error can only
        # originate from the response-type check.
        # NOTE(review): the previous call passed ``debug=True``; if
        # open_in_browser has no such parameter, the TypeError came from
        # argument binding rather than from the code under test -- confirm.
        resp = Response(url, body=body)
        self.assertRaises(TypeError, open_in_browser, resp, _openfunc=browser_open)

    def test_get_meta_refresh(self):
        """``get_meta_refresh`` ignores tags inside noscript/script blocks."""
        # Regular meta refresh in <head>.
        r1 = HtmlResponse(
            "http://www.example.com",
            body=b"""
<html>
<head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
<body>blahablsdfsal&amp;</body>
</html>""",
        )
        # Meta refresh wrapped in a (mixed-case) <noscript> element.
        r2 = HtmlResponse(
            "http://www.example.com",
            body=b"""
<html>
<head><title>Dummy</title><noScript>
<meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
</noSCRIPT>
<body>blahablsdfsal&amp;</body>
</html>""",
        )
        # Meta refresh present only inside <noscript> and inside JavaScript.
        r3 = HtmlResponse(
            "http://www.example.com",
            body=b"""
<noscript><meta http-equiv="REFRESH" content="0;url=http://www.example.com/newpage</noscript>
<script type="text/javascript">
if(!checkCookies()){
document.write('<meta http-equiv="REFRESH" content="0;url=http://www.example.com/newpage">');
}
</script>
""",
        )
        self.assertEqual(get_meta_refresh(r1), (5.0, "http://example.org/newpage"))
        self.assertEqual(get_meta_refresh(r2), (None, None))
        self.assertEqual(get_meta_refresh(r3), (None, None))

    def test_get_base_url(self):
        """``get_base_url`` uses ``<base href>`` or falls back to the URL."""
        resp = HtmlResponse(
            "http://www.example.com",
            body=b"""
<html>
<head><base href="http://www.example.com/img/" target="_blank"></head>
<body>blahablsdfsal&amp;</body>
</html>""",
        )
        self.assertEqual(get_base_url(resp), "http://www.example.com/img/")

        # No <base> element: the response URL itself is expected.
        resp2 = HtmlResponse(
            "http://www.example.com",
            body=b"""
<html><body>blahablsdfsal&amp;</body></html>""",
        )
        self.assertEqual(get_base_url(resp2), "http://www.example.com")

    def test_response_status_message(self):
        """``response_status_message`` returns "<code> <reason>"."""
        self.assertEqual(response_status_message(200), "200 OK")
        self.assertEqual(response_status_message(404), "404 Not Found")
        self.assertEqual(response_status_message(573), "573 Unknown Status")

    def test_inject_base_url(self):
        """``open_in_browser`` injects exactly one ``<base>`` tag."""
        url = "http://www.example.com"

        def check_base_url(burl):
            # The tag must appear once, no matter how many "<head"-like
            # fragments the document contains.
            bbody = self._read_opened_file(burl)
            self.assertEqual(bbody.count(b'<base href="' + to_bytes(url) + b'">'), 1)
            return True

        # Plain document with a single <head>.
        r1 = HtmlResponse(
            url,
            body=b"""
<html>
<head><title>Dummy</title></head>
<body><p>Hello world.</p></body>
</html>""",
        )
        # <head> carrying an attribute.
        r2 = HtmlResponse(
            url,
            body=b"""
<html>
<head id="foo"><title>Dummy</title></head>
<body>Hello world.</body>
</html>""",
        )
        # <header> element whose name starts with "head".
        r3 = HtmlResponse(
            url,
            body=b"""
<html>
<head><title>Dummy</title></head>
<body>
<header>Hello header</header>
<p>Hello world.</p>
</body>
</html>""",
        )
        # A <head> inside an HTML comment precedes the real one.
        r4 = HtmlResponse(
            url,
            body=b"""
<html>
<!-- <head>Dummy comment</head> -->
<head><title>Dummy</title></head>
<body><p>Hello world.</p></body>
</html>""",
        )
        # Conditional comments containing alternative <head> elements.
        r5 = HtmlResponse(
            url,
            body=b"""
<html>
<!--[if IE]>
<head><title>IE head</title></head>
<![endif]-->
<!--[if !IE]>-->
<head><title>Standard head</title></head>
<!--<![endif]-->
<body><p>Hello world.</p></body>
</html>""",
        )

        self.assertTrue(
            open_in_browser(r1, _openfunc=check_base_url), "Inject base url"
        )
        self.assertTrue(
            open_in_browser(r2, _openfunc=check_base_url),
            "Inject base url with argumented head",
        )
        self.assertTrue(
            open_in_browser(r3, _openfunc=check_base_url),
            "Inject unique base url with misleading tag",
        )
        self.assertTrue(
            open_in_browser(r4, _openfunc=check_base_url),
            "Inject unique base url with misleading comment",
        )
        self.assertTrue(
            open_in_browser(r5, _openfunc=check_base_url),
            "Inject unique base url with conditional comment",
        )

    @pytest.mark.slow
    def test_open_in_browser_redos_comment(self):
        """Comment handling in ``open_in_browser`` stays within a CPU budget."""
        MAX_CPU_TIME = 30

        # Exploit input from
        # https://makenowjust-labs.github.io/recheck/playground/
        # for /<!--.*?-->/ (old pattern to remove comments).
        body = b"-><!--\x00" * (DOWNLOAD_MAXSIZE // 7 - 10) + b"->\n<!---->"

        response = HtmlResponse("https://example.com", body=body)

        # process_time() is used so only CPU time of this process is counted.
        start_time = process_time()
        open_in_browser(response, _openfunc=lambda url: True)
        end_time = process_time()

        self.assertLess(end_time - start_time, MAX_CPU_TIME)

    @pytest.mark.slow
    def test_open_in_browser_redos_head(self):
        """Head detection in ``open_in_browser`` stays within a CPU budget."""
        MAX_CPU_TIME = 15

        # Exploit input from
        # https://makenowjust-labs.github.io/recheck/playground/
        # for /(<head(?:>|\s.*?>))/ (old pattern to find the head element).
        body = b"<head\t" * (DOWNLOAD_MAXSIZE // 6)

        response = HtmlResponse("https://example.com", body=body)

        # process_time() is used so only CPU time of this process is counted.
        start_time = process_time()
        open_in_browser(response, _openfunc=lambda url: True)
        end_time = process_time()

        self.assertLess(end_time - start_time, MAX_CPU_TIME)