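"""Tests for the redirect downloader middlewares.

Covers RedirectMiddleware (Location-based 3xx responses) and
MetaRefreshMiddleware (HTML meta refresh tags), including how they interact
with HttpProxyMiddleware across redirects.
"""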
import unittest
from itertools import chain, product
import pytest
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
from scrapy.downloadermiddlewares.redirect import (
MetaRefreshMiddleware,
RedirectMiddleware,
)
from scrapy.exceptions import IgnoreRequest
from scrapy.http import HtmlResponse, Request, Response
from scrapy.spiders import Spider
from scrapy.utils.misc import set_environ
from scrapy.utils.test import get_crawler
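# Base.Test holds the tests shared by both middlewares. Nesting the TestCase
# inside a plain ``Base`` class keeps the test runner from collecting it
# directly; only the concrete subclasses further below, which define
# ``mwcls``, ``reason`` and ``get_response``, are run.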
class Base:
class Test(unittest.TestCase):
def test_priority_adjust(self):
req = Request("http://a.com")
rsp = self.get_response(req, "http://a.com/redirected")
req2 = self.mw.process_response(req, rsp, self.spider)
self.assertGreater(req2.priority, req.priority)
def test_dont_redirect(self):
url = "http://www.example.com/301"
url2 = "http://www.example.com/redirected"
req = Request(url, meta={"dont_redirect": True})
rsp = self.get_response(req, url2)
r = self.mw.process_response(req, rsp, self.spider)
assert isinstance(r, Response)
assert r is rsp
# Test that it redirects when dont_redirect is False
req = Request(url, meta={"dont_redirect": False})
rsp = self.get_response(req, url2)
r = self.mw.process_response(req, rsp, self.spider)
assert isinstance(r, Request)
def test_post(self):
url = "http://www.example.com/302"
url2 = "http://www.example.com/redirected2"
req = Request(
url,
method="POST",
body="test",
headers={"Content-Type": "text/plain", "Content-length": "4"},
)
rsp = self.get_response(req, url2)
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, url2)
self.assertEqual(req2.method, "GET")
assert "Content-Type" not in req2.headers, (
"Content-Type header must not be present in redirected request"
)
assert "Content-Length" not in req2.headers, (
"Content-Length header must not be present in redirected request"
)
assert not req2.body, f"Redirected body must be empty, not '{req2.body}'"
def test_max_redirect_times(self):
self.mw.max_redirect_times = 1
req = Request("http://scrapytest.org/302")
rsp = self.get_response(req, "/redirected")
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
assert "redirect_times" in req.meta
self.assertEqual(req.meta["redirect_times"], 1)
self.assertRaises(
IgnoreRequest, self.mw.process_response, req, rsp, self.spider
)
def test_ttl(self):
self.mw.max_redirect_times = 100
req = Request("http://scrapytest.org/302", meta={"redirect_ttl": 1})
rsp = self.get_response(req, "/a")
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
self.assertRaises(
IgnoreRequest, self.mw.process_response, req, rsp, self.spider
)
def test_redirect_urls(self):
req1 = Request("http://scrapytest.org/first")
rsp1 = self.get_response(req1, "/redirected")
req2 = self.mw.process_response(req1, rsp1, self.spider)
rsp2 = self.get_response(req1, "/redirected2")
req3 = self.mw.process_response(req2, rsp2, self.spider)
self.assertEqual(req2.url, "http://scrapytest.org/redirected")
self.assertEqual(
req2.meta["redirect_urls"], ["http://scrapytest.org/first"]
)
self.assertEqual(req3.url, "http://scrapytest.org/redirected2")
self.assertEqual(
req3.meta["redirect_urls"],
["http://scrapytest.org/first", "http://scrapytest.org/redirected"],
)
def test_redirect_reasons(self):
req1 = Request("http://scrapytest.org/first")
rsp1 = self.get_response(req1, "/redirected1")
req2 = self.mw.process_response(req1, rsp1, self.spider)
rsp2 = self.get_response(req2, "/redirected2")
req3 = self.mw.process_response(req2, rsp2, self.spider)
self.assertEqual(req2.meta["redirect_reasons"], [self.reason])
self.assertEqual(req3.meta["redirect_reasons"], [self.reason, self.reason])
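# The test below encodes the cross-origin header-dropping policy expected on
# redirects: custom headers are always kept, Authorization is dropped
# whenever the origin (scheme, domain or port) changes, and Cookie is dropped
# on a domain change or a scheme downgrade.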
def test_cross_origin_header_dropping(self):
safe_headers = {"A": "B"}
cookie_header = {"Cookie": "a=b"}
authorization_header = {"Authorization": "Bearer 123456"}
original_request = Request(
"https://example.com",
headers={**safe_headers, **cookie_header, **authorization_header},
)
# Redirects to the same origin (same scheme, same domain, same port)
# keep all headers.
internal_response = self.get_response(
original_request, "https://example.com/a"
)
internal_redirect_request = self.mw.process_response(
original_request, internal_response, self.spider
)
self.assertIsInstance(internal_redirect_request, Request)
self.assertEqual(
original_request.headers, internal_redirect_request.headers
)
# Redirects to the same origin (same scheme, same domain, same port)
# also keep all headers when the scheme is http.
http_request = Request(
"http://example.com",
headers={**safe_headers, **cookie_header, **authorization_header},
)
http_response = self.get_response(http_request, "http://example.com/a")
http_redirect_request = self.mw.process_response(
http_request, http_response, self.spider
)
self.assertIsInstance(http_redirect_request, Request)
self.assertEqual(http_request.headers, http_redirect_request.headers)
# For default ports, whether the port is explicit or implicit does not
# affect the outcome, it is still the same origin.
to_explicit_port_response = self.get_response(
original_request, "https://example.com:443/a"
)
to_explicit_port_redirect_request = self.mw.process_response(
original_request, to_explicit_port_response, self.spider
)
self.assertIsInstance(to_explicit_port_redirect_request, Request)
self.assertEqual(
original_request.headers, to_explicit_port_redirect_request.headers
)
# For default ports, whether the port is explicit or implicit does not
# affect the outcome, it is still the same origin.
to_implicit_port_response = self.get_response(
original_request, "https://example.com/a"
)
to_implicit_port_redirect_request = self.mw.process_response(
original_request, to_implicit_port_response, self.spider
)
self.assertIsInstance(to_implicit_port_redirect_request, Request)
self.assertEqual(
original_request.headers, to_implicit_port_redirect_request.headers
)
# A port change drops the Authorization header because the origin
# changes, but keeps the Cookie header because the domain remains the
# same.
different_port_response = self.get_response(
original_request, "https://example.com:8080/a"
)
different_port_redirect_request = self.mw.process_response(
original_request, different_port_response, self.spider
)
self.assertIsInstance(different_port_redirect_request, Request)
self.assertEqual(
{**safe_headers, **cookie_header},
different_port_redirect_request.headers.to_unicode_dict(),
)
# A domain change drops both the Authorization and the Cookie header.
external_response = self.get_response(
original_request, "https://example.org/a"
)
external_redirect_request = self.mw.process_response(
original_request, external_response, self.spider
)
self.assertIsInstance(external_redirect_request, Request)
self.assertEqual(
safe_headers, external_redirect_request.headers.to_unicode_dict()
)
# A scheme upgrade (http → https) drops the Authorization header
# because the origin changes, but keeps the Cookie header because the
# domain remains the same.
upgrade_response = self.get_response(http_request, "https://example.com/a")
upgrade_redirect_request = self.mw.process_response(
http_request, upgrade_response, self.spider
)
self.assertIsInstance(upgrade_redirect_request, Request)
self.assertEqual(
{**safe_headers, **cookie_header},
upgrade_redirect_request.headers.to_unicode_dict(),
)
# A scheme downgrade (https → http) drops the Authorization header
# because the origin changes, and the Cookie header because its value
# cannot indicate whether the cookies were secure (HTTPS-only) or not.
#
# Note: If the Cookie header is set by the cookie management
# middleware, as recommended in the docs, the dropping of Cookie on
# scheme downgrade is not an issue, because the cookie management
# middleware will re-add the Cookie header to the new request if
# appropriate.
downgrade_response = self.get_response(
original_request, "http://example.com/a"
)
downgrade_redirect_request = self.mw.process_response(
original_request, downgrade_response, self.spider
)
self.assertIsInstance(downgrade_redirect_request, Request)
self.assertEqual(
safe_headers,
downgrade_redirect_request.headers.to_unicode_dict(),
)
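# The test_meta_proxy_* tests below check that a proxy configured through
# Request.meta ("proxy") keeps working across redirects: the "proxy" and
# "_auth_proxy" meta keys and the Proxy-Authorization header are expected to
# be preserved on the redirected requests, regardless of the scheme of the
# redirect target.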
def test_meta_proxy_http_absolute(self):
crawler = get_crawler()
redirect_mw = self.mwcls.from_crawler(crawler)
proxy_mw = HttpProxyMiddleware.from_crawler(crawler)
meta = {"proxy": "https://a:@a.example"}
request1 = Request("http://example.com", meta=meta)
spider = None
proxy_mw.process_request(request1, spider)
self.assertEqual(request1.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request1.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request1.meta["proxy"], "https://a.example")
response1 = self.get_response(request1, "http://example.com")
request2 = redirect_mw.process_response(request1, response1, spider)
self.assertIsInstance(request2, Request)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
proxy_mw.process_request(request2, spider)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
response2 = self.get_response(request2, "http://example.com")
request3 = redirect_mw.process_response(request2, response2, spider)
self.assertIsInstance(request3, Request)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
proxy_mw.process_request(request3, spider)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
def test_meta_proxy_http_relative(self):
crawler = get_crawler()
redirect_mw = self.mwcls.from_crawler(crawler)
proxy_mw = HttpProxyMiddleware.from_crawler(crawler)
meta = {"proxy": "https://a:@a.example"}
request1 = Request("http://example.com", meta=meta)
spider = None
proxy_mw.process_request(request1, spider)
self.assertEqual(request1.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request1.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request1.meta["proxy"], "https://a.example")
response1 = self.get_response(request1, "/a")
request2 = redirect_mw.process_response(request1, response1, spider)
self.assertIsInstance(request2, Request)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
proxy_mw.process_request(request2, spider)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
response2 = self.get_response(request2, "/a")
request3 = redirect_mw.process_response(request2, response2, spider)
self.assertIsInstance(request3, Request)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
proxy_mw.process_request(request3, spider)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
def test_meta_proxy_https_absolute(self):
crawler = get_crawler()
redirect_mw = self.mwcls.from_crawler(crawler)
proxy_mw = HttpProxyMiddleware.from_crawler(crawler)
meta = {"proxy": "https://a:@a.example"}
request1 = Request("https://example.com", meta=meta)
spider = None
proxy_mw.process_request(request1, spider)
self.assertEqual(request1.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request1.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request1.meta["proxy"], "https://a.example")
response1 = self.get_response(request1, "https://example.com")
request2 = redirect_mw.process_response(request1, response1, spider)
self.assertIsInstance(request2, Request)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
proxy_mw.process_request(request2, spider)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
response2 = self.get_response(request2, "https://example.com")
request3 = redirect_mw.process_response(request2, response2, spider)
self.assertIsInstance(request3, Request)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
proxy_mw.process_request(request3, spider)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
def test_meta_proxy_https_relative(self):
crawler = get_crawler()
redirect_mw = self.mwcls.from_crawler(crawler)
proxy_mw = HttpProxyMiddleware.from_crawler(crawler)
meta = {"proxy": "https://a:@a.example"}
request1 = Request("https://example.com", meta=meta)
spider = None
proxy_mw.process_request(request1, spider)
self.assertEqual(request1.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request1.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request1.meta["proxy"], "https://a.example")
response1 = self.get_response(request1, "/a")
request2 = redirect_mw.process_response(request1, response1, spider)
self.assertIsInstance(request2, Request)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
proxy_mw.process_request(request2, spider)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
response2 = self.get_response(request2, "/a")
request3 = redirect_mw.process_response(request2, response2, spider)
self.assertIsInstance(request3, Request)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
proxy_mw.process_request(request3, spider)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
def test_meta_proxy_http_to_https(self):
crawler = get_crawler()
redirect_mw = self.mwcls.from_crawler(crawler)
proxy_mw = HttpProxyMiddleware.from_crawler(crawler)
meta = {"proxy": "https://a:@a.example"}
request1 = Request("http://example.com", meta=meta)
spider = None
proxy_mw.process_request(request1, spider)
self.assertEqual(request1.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request1.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request1.meta["proxy"], "https://a.example")
response1 = self.get_response(request1, "https://example.com")
request2 = redirect_mw.process_response(request1, response1, spider)
self.assertIsInstance(request2, Request)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
proxy_mw.process_request(request2, spider)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
response2 = self.get_response(request2, "http://example.com")
request3 = redirect_mw.process_response(request2, response2, spider)
self.assertIsInstance(request3, Request)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
proxy_mw.process_request(request3, spider)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
def test_meta_proxy_https_to_http(self):
crawler = get_crawler()
redirect_mw = self.mwcls.from_crawler(crawler)
proxy_mw = HttpProxyMiddleware.from_crawler(crawler)
meta = {"proxy": "https://a:@a.example"}
request1 = Request("https://example.com", meta=meta)
spider = None
proxy_mw.process_request(request1, spider)
self.assertEqual(request1.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request1.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request1.meta["proxy"], "https://a.example")
response1 = self.get_response(request1, "http://example.com")
request2 = redirect_mw.process_response(request1, response1, spider)
self.assertIsInstance(request2, Request)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
proxy_mw.process_request(request2, spider)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
response2 = self.get_response(request2, "https://example.com")
request3 = redirect_mw.process_response(request2, response2, spider)
self.assertIsInstance(request3, Request)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
proxy_mw.process_request(request3, spider)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
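# The test_system_proxy_* tests below configure proxies through the
# http_proxy/https_proxy environment variables instead. When a redirect
# changes the URL scheme, the proxy-related meta keys and the
# Proxy-Authorization header are expected to be dropped from the redirected
# request, so that HttpProxyMiddleware can re-apply the proxy matching the
# new scheme (if any).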
def test_system_proxy_http_absolute(self):
crawler = get_crawler()
redirect_mw = self.mwcls.from_crawler(crawler)
env = {
"http_proxy": "https://a:@a.example",
}
with set_environ(**env):
proxy_mw = HttpProxyMiddleware.from_crawler(crawler)
request1 = Request("http://example.com")
spider = None
proxy_mw.process_request(request1, spider)
self.assertEqual(request1.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request1.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request1.meta["proxy"], "https://a.example")
response1 = self.get_response(request1, "http://example.com")
request2 = redirect_mw.process_response(request1, response1, spider)
self.assertIsInstance(request2, Request)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
proxy_mw.process_request(request2, spider)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
response2 = self.get_response(request2, "http://example.com")
request3 = redirect_mw.process_response(request2, response2, spider)
self.assertIsInstance(request3, Request)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
proxy_mw.process_request(request3, spider)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
def test_system_proxy_http_relative(self):
crawler = get_crawler()
redirect_mw = self.mwcls.from_crawler(crawler)
env = {
"http_proxy": "https://a:@a.example",
}
with set_environ(**env):
proxy_mw = HttpProxyMiddleware.from_crawler(crawler)
request1 = Request("http://example.com")
spider = None
proxy_mw.process_request(request1, spider)
self.assertEqual(request1.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request1.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request1.meta["proxy"], "https://a.example")
response1 = self.get_response(request1, "/a")
request2 = redirect_mw.process_response(request1, response1, spider)
self.assertIsInstance(request2, Request)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
proxy_mw.process_request(request2, spider)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
response2 = self.get_response(request2, "/a")
request3 = redirect_mw.process_response(request2, response2, spider)
self.assertIsInstance(request3, Request)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
proxy_mw.process_request(request3, spider)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
def test_system_proxy_https_absolute(self):
crawler = get_crawler()
redirect_mw = self.mwcls.from_crawler(crawler)
env = {
"https_proxy": "https://a:@a.example",
}
with set_environ(**env):
proxy_mw = HttpProxyMiddleware.from_crawler(crawler)
request1 = Request("https://example.com")
spider = None
proxy_mw.process_request(request1, spider)
self.assertEqual(request1.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request1.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request1.meta["proxy"], "https://a.example")
response1 = self.get_response(request1, "https://example.com")
request2 = redirect_mw.process_response(request1, response1, spider)
self.assertIsInstance(request2, Request)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
proxy_mw.process_request(request2, spider)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
response2 = self.get_response(request2, "https://example.com")
request3 = redirect_mw.process_response(request2, response2, spider)
self.assertIsInstance(request3, Request)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
proxy_mw.process_request(request3, spider)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
def test_system_proxy_https_relative(self):
crawler = get_crawler()
redirect_mw = self.mwcls.from_crawler(crawler)
env = {
"https_proxy": "https://a:@a.example",
}
with set_environ(**env):
proxy_mw = HttpProxyMiddleware.from_crawler(crawler)
request1 = Request("https://example.com")
spider = None
proxy_mw.process_request(request1, spider)
self.assertEqual(request1.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request1.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request1.meta["proxy"], "https://a.example")
response1 = self.get_response(request1, "/a")
request2 = redirect_mw.process_response(request1, response1, spider)
self.assertIsInstance(request2, Request)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
proxy_mw.process_request(request2, spider)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
response2 = self.get_response(request2, "/a")
request3 = redirect_mw.process_response(request2, response2, spider)
self.assertIsInstance(request3, Request)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
proxy_mw.process_request(request3, spider)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
def test_system_proxy_proxied_http_to_proxied_https(self):
crawler = get_crawler()
redirect_mw = self.mwcls.from_crawler(crawler)
env = {
"http_proxy": "https://a:@a.example",
"https_proxy": "https://b:@b.example",
}
with set_environ(**env):
proxy_mw = HttpProxyMiddleware.from_crawler(crawler)
request1 = Request("http://example.com")
spider = None
proxy_mw.process_request(request1, spider)
self.assertEqual(request1.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request1.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request1.meta["proxy"], "https://a.example")
response1 = self.get_response(request1, "https://example.com")
request2 = redirect_mw.process_response(request1, response1, spider)
self.assertIsInstance(request2, Request)
self.assertNotIn("Proxy-Authorization", request2.headers)
self.assertNotIn("_auth_proxy", request2.meta)
self.assertNotIn("proxy", request2.meta)
proxy_mw.process_request(request2, spider)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic Yjo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://b.example")
self.assertEqual(request2.meta["proxy"], "https://b.example")
response2 = self.get_response(request2, "http://example.com")
request3 = redirect_mw.process_response(request2, response2, spider)
self.assertIsInstance(request3, Request)
self.assertNotIn("Proxy-Authorization", request3.headers)
self.assertNotIn("_auth_proxy", request3.meta)
self.assertNotIn("proxy", request3.meta)
proxy_mw.process_request(request3, spider)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
def test_system_proxy_proxied_http_to_unproxied_https(self):
crawler = get_crawler()
redirect_mw = self.mwcls.from_crawler(crawler)
env = {
"http_proxy": "https://a:@a.example",
}
with set_environ(**env):
proxy_mw = HttpProxyMiddleware.from_crawler(crawler)
request1 = Request("http://example.com")
spider = None
proxy_mw.process_request(request1, spider)
self.assertEqual(request1.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request1.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request1.meta["proxy"], "https://a.example")
response1 = self.get_response(request1, "https://example.com")
request2 = redirect_mw.process_response(request1, response1, spider)
self.assertIsInstance(request2, Request)
self.assertNotIn("Proxy-Authorization", request2.headers)
self.assertNotIn("_auth_proxy", request2.meta)
self.assertNotIn("proxy", request2.meta)
proxy_mw.process_request(request2, spider)
self.assertNotIn("Proxy-Authorization", request2.headers)
self.assertNotIn("_auth_proxy", request2.meta)
self.assertNotIn("proxy", request2.meta)
response2 = self.get_response(request2, "http://example.com")
request3 = redirect_mw.process_response(request2, response2, spider)
self.assertIsInstance(request3, Request)
self.assertNotIn("Proxy-Authorization", request3.headers)
self.assertNotIn("_auth_proxy", request3.meta)
self.assertNotIn("proxy", request3.meta)
proxy_mw.process_request(request3, spider)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request3.meta["proxy"], "https://a.example")
def test_system_proxy_unproxied_http_to_proxied_https(self):
crawler = get_crawler()
redirect_mw = self.mwcls.from_crawler(crawler)
env = {
"https_proxy": "https://b:@b.example",
}
with set_environ(**env):
proxy_mw = HttpProxyMiddleware.from_crawler(crawler)
request1 = Request("http://example.com")
spider = None
proxy_mw.process_request(request1, spider)
self.assertNotIn("Proxy-Authorization", request1.headers)
self.assertNotIn("_auth_proxy", request1.meta)
self.assertNotIn("proxy", request1.meta)
response1 = self.get_response(request1, "https://example.com")
request2 = redirect_mw.process_response(request1, response1, spider)
self.assertIsInstance(request2, Request)
self.assertNotIn("Proxy-Authorization", request2.headers)
self.assertNotIn("_auth_proxy", request2.meta)
self.assertNotIn("proxy", request2.meta)
proxy_mw.process_request(request2, spider)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic Yjo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://b.example")
self.assertEqual(request2.meta["proxy"], "https://b.example")
response2 = self.get_response(request2, "http://example.com")
request3 = redirect_mw.process_response(request2, response2, spider)
self.assertIsInstance(request3, Request)
self.assertNotIn("Proxy-Authorization", request3.headers)
self.assertNotIn("_auth_proxy", request3.meta)
self.assertNotIn("proxy", request3.meta)
proxy_mw.process_request(request3, spider)
self.assertNotIn("Proxy-Authorization", request3.headers)
self.assertNotIn("_auth_proxy", request3.meta)
self.assertNotIn("proxy", request3.meta)
def test_system_proxy_unproxied_http_to_unproxied_https(self):
crawler = get_crawler()
redirect_mw = self.mwcls.from_crawler(crawler)
proxy_mw = HttpProxyMiddleware.from_crawler(crawler)
request1 = Request("http://example.com")
spider = None
proxy_mw.process_request(request1, spider)
self.assertNotIn("Proxy-Authorization", request1.headers)
self.assertNotIn("_auth_proxy", request1.meta)
self.assertNotIn("proxy", request1.meta)
response1 = self.get_response(request1, "https://example.com")
request2 = redirect_mw.process_response(request1, response1, spider)
self.assertIsInstance(request2, Request)
self.assertNotIn("Proxy-Authorization", request2.headers)
self.assertNotIn("_auth_proxy", request2.meta)
self.assertNotIn("proxy", request2.meta)
proxy_mw.process_request(request2, spider)
self.assertNotIn("Proxy-Authorization", request2.headers)
self.assertNotIn("_auth_proxy", request2.meta)
self.assertNotIn("proxy", request2.meta)
response2 = self.get_response(request2, "http://example.com")
request3 = redirect_mw.process_response(request2, response2, spider)
self.assertIsInstance(request3, Request)
self.assertNotIn("Proxy-Authorization", request3.headers)
self.assertNotIn("_auth_proxy", request3.meta)
self.assertNotIn("proxy", request3.meta)
proxy_mw.process_request(request3, spider)
self.assertNotIn("Proxy-Authorization", request3.headers)
self.assertNotIn("_auth_proxy", request3.meta)
self.assertNotIn("proxy", request3.meta)
def test_system_proxy_proxied_https_to_proxied_http(self):
crawler = get_crawler()
redirect_mw = self.mwcls.from_crawler(crawler)
env = {
"http_proxy": "https://a:@a.example",
"https_proxy": "https://b:@b.example",
}
with set_environ(**env):
proxy_mw = HttpProxyMiddleware.from_crawler(crawler)
request1 = Request("https://example.com")
spider = None
proxy_mw.process_request(request1, spider)
self.assertEqual(request1.headers["Proxy-Authorization"], b"Basic Yjo=")
self.assertEqual(request1.meta["_auth_proxy"], "https://b.example")
self.assertEqual(request1.meta["proxy"], "https://b.example")
response1 = self.get_response(request1, "http://example.com")
request2 = redirect_mw.process_response(request1, response1, spider)
self.assertIsInstance(request2, Request)
self.assertNotIn("Proxy-Authorization", request2.headers)
self.assertNotIn("_auth_proxy", request2.meta)
self.assertNotIn("proxy", request2.meta)
proxy_mw.process_request(request2, spider)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
response2 = self.get_response(request2, "https://example.com")
request3 = redirect_mw.process_response(request2, response2, spider)
self.assertIsInstance(request3, Request)
self.assertNotIn("Proxy-Authorization", request3.headers)
self.assertNotIn("_auth_proxy", request3.meta)
self.assertNotIn("proxy", request3.meta)
proxy_mw.process_request(request3, spider)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic Yjo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://b.example")
self.assertEqual(request3.meta["proxy"], "https://b.example")
def test_system_proxy_proxied_https_to_unproxied_http(self):
crawler = get_crawler()
redirect_mw = self.mwcls.from_crawler(crawler)
env = {
"https_proxy": "https://b:@b.example",
}
with set_environ(**env):
proxy_mw = HttpProxyMiddleware.from_crawler(crawler)
request1 = Request("https://example.com")
spider = None
proxy_mw.process_request(request1, spider)
self.assertEqual(request1.headers["Proxy-Authorization"], b"Basic Yjo=")
self.assertEqual(request1.meta["_auth_proxy"], "https://b.example")
self.assertEqual(request1.meta["proxy"], "https://b.example")
response1 = self.get_response(request1, "http://example.com")
request2 = redirect_mw.process_response(request1, response1, spider)
self.assertIsInstance(request2, Request)
self.assertNotIn("Proxy-Authorization", request2.headers)
self.assertNotIn("_auth_proxy", request2.meta)
self.assertNotIn("proxy", request2.meta)
proxy_mw.process_request(request2, spider)
self.assertNotIn("Proxy-Authorization", request2.headers)
self.assertNotIn("_auth_proxy", request2.meta)
self.assertNotIn("proxy", request2.meta)
response2 = self.get_response(request2, "https://example.com")
request3 = redirect_mw.process_response(request2, response2, spider)
self.assertIsInstance(request3, Request)
self.assertNotIn("Proxy-Authorization", request3.headers)
self.assertNotIn("_auth_proxy", request3.meta)
self.assertNotIn("proxy", request3.meta)
proxy_mw.process_request(request3, spider)
self.assertEqual(request3.headers["Proxy-Authorization"], b"Basic Yjo=")
self.assertEqual(request3.meta["_auth_proxy"], "https://b.example")
self.assertEqual(request3.meta["proxy"], "https://b.example")
def test_system_proxy_unproxied_https_to_proxied_http(self):
crawler = get_crawler()
redirect_mw = self.mwcls.from_crawler(crawler)
env = {
"http_proxy": "https://a:@a.example",
}
with set_environ(**env):
proxy_mw = HttpProxyMiddleware.from_crawler(crawler)
request1 = Request("https://example.com")
spider = None
proxy_mw.process_request(request1, spider)
self.assertNotIn("Proxy-Authorization", request1.headers)
self.assertNotIn("_auth_proxy", request1.meta)
self.assertNotIn("proxy", request1.meta)
response1 = self.get_response(request1, "http://example.com")
request2 = redirect_mw.process_response(request1, response1, spider)
self.assertIsInstance(request2, Request)
self.assertNotIn("Proxy-Authorization", request2.headers)
self.assertNotIn("_auth_proxy", request2.meta)
self.assertNotIn("proxy", request2.meta)
proxy_mw.process_request(request2, spider)
self.assertEqual(request2.headers["Proxy-Authorization"], b"Basic YTo=")
self.assertEqual(request2.meta["_auth_proxy"], "https://a.example")
self.assertEqual(request2.meta["proxy"], "https://a.example")
response2 = self.get_response(request2, "https://example.com")
request3 = redirect_mw.process_response(request2, response2, spider)
self.assertIsInstance(request3, Request)
self.assertNotIn("Proxy-Authorization", request3.headers)
self.assertNotIn("_auth_proxy", request3.meta)
self.assertNotIn("proxy", request3.meta)
proxy_mw.process_request(request3, spider)
self.assertNotIn("Proxy-Authorization", request3.headers)
self.assertNotIn("_auth_proxy", request3.meta)
self.assertNotIn("proxy", request3.meta)
def test_system_proxy_unproxied_https_to_unproxied_http(self):
crawler = get_crawler()
redirect_mw = self.mwcls.from_crawler(crawler)
proxy_mw = HttpProxyMiddleware.from_crawler(crawler)
request1 = Request("https://example.com")
spider = None
proxy_mw.process_request(request1, spider)
self.assertNotIn("Proxy-Authorization", request1.headers)
self.assertNotIn("_auth_proxy", request1.meta)
self.assertNotIn("proxy", request1.meta)
response1 = self.get_response(request1, "http://example.com")
request2 = redirect_mw.process_response(request1, response1, spider)
self.assertIsInstance(request2, Request)
self.assertNotIn("Proxy-Authorization", request2.headers)
self.assertNotIn("_auth_proxy", request2.meta)
self.assertNotIn("proxy", request2.meta)
proxy_mw.process_request(request2, spider)
self.assertNotIn("Proxy-Authorization", request2.headers)
self.assertNotIn("_auth_proxy", request2.meta)
self.assertNotIn("proxy", request2.meta)
response2 = self.get_response(request2, "https://example.com")
request3 = redirect_mw.process_response(request2, response2, spider)
self.assertIsInstance(request3, Request)
self.assertNotIn("Proxy-Authorization", request3.headers)
self.assertNotIn("_auth_proxy", request3.meta)
self.assertNotIn("proxy", request3.meta)
proxy_mw.process_request(request3, spider)
self.assertNotIn("Proxy-Authorization", request3.headers)
self.assertNotIn("_auth_proxy", request3.meta)
self.assertNotIn("proxy", request3.meta)
class RedirectMiddlewareTest(Base.Test):
mwcls = RedirectMiddleware
reason = 302
def setUp(self):
self.crawler = get_crawler(Spider)
self.spider = self.crawler._create_spider("foo")
self.mw = self.mwcls.from_crawler(self.crawler)
def get_response(self, request, location, status=302):
headers = {"Location": location}
return Response(request.url, status=status, headers=headers)
def test_redirect_3xx_permanent(self):
def _test(method, status=301):
url = f"http://www.example.com/{status}"
url2 = "http://www.example.com/redirected"
req = Request(url, method=method)
rsp = Response(url, headers={"Location": url2}, status=status)
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, url2)
self.assertEqual(req2.method, method)
# A response with a 3xx status code but no Location header should be ignored
del rsp.headers["Location"]
assert self.mw.process_response(req, rsp, self.spider) is rsp
_test("GET")
_test("POST")
_test("HEAD")
_test("GET", status=307)
_test("POST", status=307)
_test("HEAD", status=307)
_test("GET", status=308)
_test("POST", status=308)
_test("HEAD", status=308)
def test_redirect_302_head(self):
url = "http://www.example.com/302"
url2 = "http://www.example.com/redirected2"
req = Request(url, method="HEAD")
rsp = Response(url, headers={"Location": url2}, status=302)
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, url2)
self.assertEqual(req2.method, "HEAD")
def test_redirect_302_relative(self):
url = "http://www.example.com/302"
url2 = "///i8n.example2.com/302"
url3 = "http://i8n.example2.com/302"
req = Request(url, method="HEAD")
rsp = Response(url, headers={"Location": url2}, status=302)
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, url3)
self.assertEqual(req2.method, "HEAD")
def test_spider_handling(self):
smartspider = self.crawler._create_spider("smarty")
smartspider.handle_httpstatus_list = [404, 301, 302]
url = "http://www.example.com/301"
url2 = "http://www.example.com/redirected"
req = Request(url)
rsp = Response(url, headers={"Location": url2}, status=301)
r = self.mw.process_response(req, rsp, smartspider)
self.assertIs(r, rsp)
def test_request_meta_handling(self):
url = "http://www.example.com/301"
url2 = "http://www.example.com/redirected"
def _test_passthrough(req):
rsp = Response(url, headers={"Location": url2}, status=301, request=req)
r = self.mw.process_response(req, rsp, self.spider)
self.assertIs(r, rsp)
_test_passthrough(
Request(url, meta={"handle_httpstatus_list": [404, 301, 302]})
)
_test_passthrough(Request(url, meta={"handle_httpstatus_all": True}))
def test_latin1_location(self):
req = Request("http://scrapytest.org/first")
latin1_location = "/ação".encode("latin1") # HTTP historically supports latin1
resp = Response(
"http://scrapytest.org/first",
headers={"Location": latin1_location},
status=302,
)
req_result = self.mw.process_response(req, resp, self.spider)
perc_encoded_utf8_url = "http://scrapytest.org/a%E7%E3o"
self.assertEqual(perc_encoded_utf8_url, req_result.url)
def test_utf8_location(self):
req = Request("http://scrapytest.org/first")
utf8_location = "/ação".encode() # header using UTF-8 encoding
resp = Response(
"http://scrapytest.org/first",
headers={"Location": utf8_location},
status=302,
)
req_result = self.mw.process_response(req, resp, self.spider)
perc_encoded_utf8_url = "http://scrapytest.org/a%C3%A7%C3%A3o"
self.assertEqual(perc_encoded_utf8_url, req_result.url)
def test_no_location(self):
request = Request("https://example.com")
response = Response(request.url, status=302)
assert self.mw.process_response(request, response, self.spider) is response
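# Shared parametrization for the scheme-handling tests: each case is
# (request URL, redirect target given by the response, expected URL of the
# redirected request), with a target of None meaning that no redirect is
# expected and the original response must be returned unchanged.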
SCHEME_PARAMS = ("url", "location", "target")
HTTP_SCHEMES = ("http", "https")
NON_HTTP_SCHEMES = ("data", "file", "ftp", "s3", "foo")
REDIRECT_SCHEME_CASES = (
# http/https → http/https redirects
*(
(
f"{input_scheme}://example.com/a",
f"{output_scheme}://example.com/b",
f"{output_scheme}://example.com/b",
)
for input_scheme, output_scheme in product(HTTP_SCHEMES, repeat=2)
),
# http/https → data/file/ftp/s3/foo does not redirect
*(
(
f"{input_scheme}://example.com/a",
f"{output_scheme}://example.com/b",
None,
)
for input_scheme in HTTP_SCHEMES
for output_scheme in NON_HTTP_SCHEMES
),
# http/https → relative redirects
*(
(
f"{scheme}://example.com/a",
location,
f"{scheme}://example.com/b",
)
for scheme in HTTP_SCHEMES
for location in ("//example.com/b", "/b")
),
# Note: We do not test data/file/ftp/s3 schemes for the initial URL
# because their download handlers cannot return a status code of 3xx.
)
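# The cases above are reused, and extended with non-HTTP request schemes, by
# the MetaRefreshMiddleware scheme test at the end of this module.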
@pytest.mark.parametrize(SCHEME_PARAMS, REDIRECT_SCHEME_CASES)
def test_redirect_schemes(url, location, target):
crawler = get_crawler(Spider)
spider = crawler._create_spider("foo")
mw = RedirectMiddleware.from_crawler(crawler)
request = Request(url)
response = Response(url, headers={"Location": location}, status=301)
redirect = mw.process_response(request, response, spider)
if target is None:
assert redirect == response
else:
assert isinstance(redirect, Request)
assert redirect.url == target
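# Helper building a minimal HTML body that contains a
# <meta http-equiv="refresh"> tag pointing at the given URL.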
def meta_refresh_body(url, interval=5):
html = f"""<html><head><meta http-equiv="refresh" content="{interval};url={url}"/></head></html>"""
return html.encode("utf-8")
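# Runs the shared Base.Test suite against MetaRefreshMiddleware, using HTML
# responses whose body contains a meta refresh tag.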
class MetaRefreshMiddlewareTest(Base.Test):
mwcls = MetaRefreshMiddleware
reason = "meta refresh"
def setUp(self):
crawler = get_crawler(Spider)
self.spider = crawler._create_spider("foo")
self.mw = self.mwcls.from_crawler(crawler)
def _body(self, interval=5, url="http://example.org/newpage"):
return meta_refresh_body(url, interval)
def get_response(self, request, location):
return HtmlResponse(request.url, body=self._body(url=location))
def test_meta_refresh(self):
req = Request(url="http://example.org")
rsp = HtmlResponse(req.url, body=self._body())
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, "http://example.org/newpage")
def test_meta_refresh_with_high_interval(self):
# A meta refresh with a high interval doesn't trigger a redirect
req = Request(url="http://example.org")
rsp = HtmlResponse(
url="http://example.org", body=self._body(interval=1000), encoding="utf-8"
)
rsp2 = self.mw.process_response(req, rsp, self.spider)
assert rsp is rsp2
def test_meta_refresh_through_posted_request(self):
req = Request(
url="http://example.org",
method="POST",
body="test",
headers={"Content-Type": "text/plain", "Content-length": "4"},
)
rsp = HtmlResponse(req.url, body=self._body())
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, "http://example.org/newpage")
self.assertEqual(req2.method, "GET")
assert "Content-Type" not in req2.headers, (
"Content-Type header must not be present in redirected request"
)
assert "Content-Length" not in req2.headers, (
"Content-Length header must not be present in redirected request"
)
assert not req2.body, f"Redirected body must be empty, not '{req2.body}'"
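# With the default METAREFRESH_IGNORE_TAGS setting, a meta refresh tag
# wrapped in <noscript> is ignored and no redirect request is produced.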
def test_ignore_tags_default(self):
req = Request(url="http://example.org")
body = (
"""<noscript><meta http-equiv="refresh" """
"""content="0;URL='http://example.org/newpage'"></noscript>"""
)
rsp = HtmlResponse(req.url, body=body.encode())
response = self.mw.process_response(req, rsp, self.spider)
assert isinstance(response, Response)
def test_ignore_tags_1_x_list(self):
"""Test that Scrapy 1.x behavior remains possible"""
settings = {"METAREFRESH_IGNORE_TAGS": ["script", "noscript"]}
crawler = get_crawler(Spider, settings)
mw = MetaRefreshMiddleware.from_crawler(crawler)
req = Request(url="http://example.org")
body = (
"""<noscript><meta http-equiv="refresh" """
"""content="0;URL='http://example.org/newpage'"></noscript>"""
)
rsp = HtmlResponse(req.url, body=body.encode())
response = mw.process_response(req, rsp, self.spider)
assert isinstance(response, Response)
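# Same scheme matrix as test_redirect_schemes, extended with non-HTTP input
# schemes: a meta refresh in a non-http/https response must never produce a
# redirect request.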
@pytest.mark.parametrize(
SCHEME_PARAMS,
(
*REDIRECT_SCHEME_CASES,
# data/file/ftp/s3/foo → * does not redirect
*(
(
f"{input_scheme}://example.com/a",
f"{output_scheme}://example.com/b",
None,
)
for input_scheme in NON_HTTP_SCHEMES
for output_scheme in chain(HTTP_SCHEMES, NON_HTTP_SCHEMES)
),
# data/file/ftp/s3/foo → relative does not redirect
*(
(
f"{scheme}://example.com/a",
location,
None,
)
for scheme in NON_HTTP_SCHEMES
for location in ("//example.com/b", "/b")
),
),
)
def test_meta_refresh_schemes(url, location, target):
crawler = get_crawler(Spider)
spider = crawler._create_spider("foo")
mw = MetaRefreshMiddleware.from_crawler(crawler)
request = Request(url)
response = HtmlResponse(url, body=meta_refresh_body(location))
redirect = mw.process_response(request, response, spider)
if target is None:
assert redirect == response
else:
assert isinstance(redirect, Request)
assert redirect.url == target