mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-06 11:00:46 +00:00
1342 lines
44 KiB
Python
1342 lines
44 KiB
Python
from __future__ import annotations
|
|
|
|
import warnings
|
|
from typing import Any
|
|
from unittest import TestCase
|
|
from urllib.parse import urlparse
|
|
|
|
from scrapy.downloadermiddlewares.redirect import RedirectMiddleware
|
|
from scrapy.http import Request, Response
|
|
from scrapy.settings import Settings
|
|
from scrapy.spidermiddlewares.referer import (
|
|
POLICY_NO_REFERRER,
|
|
POLICY_NO_REFERRER_WHEN_DOWNGRADE,
|
|
POLICY_ORIGIN,
|
|
POLICY_ORIGIN_WHEN_CROSS_ORIGIN,
|
|
POLICY_SAME_ORIGIN,
|
|
POLICY_SCRAPY_DEFAULT,
|
|
POLICY_STRICT_ORIGIN,
|
|
POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN,
|
|
POLICY_UNSAFE_URL,
|
|
DefaultReferrerPolicy,
|
|
NoReferrerPolicy,
|
|
NoReferrerWhenDowngradePolicy,
|
|
OriginPolicy,
|
|
OriginWhenCrossOriginPolicy,
|
|
RefererMiddleware,
|
|
ReferrerPolicy,
|
|
SameOriginPolicy,
|
|
StrictOriginPolicy,
|
|
StrictOriginWhenCrossOriginPolicy,
|
|
UnsafeUrlPolicy,
|
|
)
|
|
from scrapy.spiders import Spider
|
|
|
|
|
|
class TestRefererMiddleware(TestCase):
|
|
req_meta: dict[str, Any] = {}
|
|
resp_headers: dict[str, str] = {}
|
|
settings: dict[str, Any] = {}
|
|
scenarii: list[tuple[str, str, bytes | None]] = [
|
|
("http://scrapytest.org", "http://scrapytest.org/", b"http://scrapytest.org"),
|
|
]
|
|
|
|
def setUp(self):
|
|
self.spider = Spider("foo")
|
|
settings = Settings(self.settings)
|
|
self.mw = RefererMiddleware(settings)
|
|
|
|
def get_request(self, target):
|
|
return Request(target, meta=self.req_meta)
|
|
|
|
def get_response(self, origin):
|
|
return Response(origin, headers=self.resp_headers)
|
|
|
|
def test(self):
|
|
for origin, target, referrer in self.scenarii:
|
|
response = self.get_response(origin)
|
|
request = self.get_request(target)
|
|
out = list(self.mw.process_spider_output(response, [request], self.spider))
|
|
self.assertEqual(out[0].headers.get("Referer"), referrer)
|
|
|
|
|
|
class MixinDefault:
|
|
"""
|
|
Based on https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer-when-downgrade
|
|
|
|
with some additional filtering of s3://
|
|
"""
|
|
|
|
scenarii: list[tuple[str, str, bytes | None]] = [
|
|
("https://example.com/", "https://scrapy.org/", b"https://example.com/"),
|
|
("http://example.com/", "http://scrapy.org/", b"http://example.com/"),
|
|
("http://example.com/", "https://scrapy.org/", b"http://example.com/"),
|
|
("https://example.com/", "http://scrapy.org/", None),
|
|
# no credentials leak
|
|
(
|
|
"http://user:password@example.com/",
|
|
"https://scrapy.org/",
|
|
b"http://example.com/",
|
|
),
|
|
# no referrer leak for local schemes
|
|
("file:///home/path/to/somefile.html", "https://scrapy.org/", None),
|
|
("file:///home/path/to/somefile.html", "http://scrapy.org/", None),
|
|
# no referrer leak for s3 origins
|
|
("s3://mybucket/path/to/data.csv", "https://scrapy.org/", None),
|
|
("s3://mybucket/path/to/data.csv", "http://scrapy.org/", None),
|
|
]
|
|
|
|
|
|
class MixinNoReferrer:
|
|
scenarii: list[tuple[str, str, bytes | None]] = [
|
|
("https://example.com/page.html", "https://example.com/", None),
|
|
("http://www.example.com/", "https://scrapy.org/", None),
|
|
("http://www.example.com/", "http://scrapy.org/", None),
|
|
("https://www.example.com/", "http://scrapy.org/", None),
|
|
("file:///home/path/to/somefile.html", "http://scrapy.org/", None),
|
|
]
|
|
|
|
|
|
class MixinNoReferrerWhenDowngrade:
|
|
scenarii: list[tuple[str, str, bytes | None]] = [
|
|
# TLS to TLS: send non-empty referrer
|
|
(
|
|
"https://example.com/page.html",
|
|
"https://not.example.com/",
|
|
b"https://example.com/page.html",
|
|
),
|
|
(
|
|
"https://example.com/page.html",
|
|
"https://scrapy.org/",
|
|
b"https://example.com/page.html",
|
|
),
|
|
(
|
|
"https://example.com:443/page.html",
|
|
"https://scrapy.org/",
|
|
b"https://example.com/page.html",
|
|
),
|
|
(
|
|
"https://example.com:444/page.html",
|
|
"https://scrapy.org/",
|
|
b"https://example.com:444/page.html",
|
|
),
|
|
(
|
|
"ftps://example.com/urls.zip",
|
|
"https://scrapy.org/",
|
|
b"ftps://example.com/urls.zip",
|
|
),
|
|
# TLS to non-TLS: do not send referrer
|
|
("https://example.com/page.html", "http://not.example.com/", None),
|
|
("https://example.com/page.html", "http://scrapy.org/", None),
|
|
("ftps://example.com/urls.zip", "http://scrapy.org/", None),
|
|
# non-TLS to TLS or non-TLS: send referrer
|
|
(
|
|
"http://example.com/page.html",
|
|
"https://not.example.com/",
|
|
b"http://example.com/page.html",
|
|
),
|
|
(
|
|
"http://example.com/page.html",
|
|
"https://scrapy.org/",
|
|
b"http://example.com/page.html",
|
|
),
|
|
(
|
|
"http://example.com:8080/page.html",
|
|
"https://scrapy.org/",
|
|
b"http://example.com:8080/page.html",
|
|
),
|
|
(
|
|
"http://example.com:80/page.html",
|
|
"http://not.example.com/",
|
|
b"http://example.com/page.html",
|
|
),
|
|
(
|
|
"http://example.com/page.html",
|
|
"http://scrapy.org/",
|
|
b"http://example.com/page.html",
|
|
),
|
|
(
|
|
"http://example.com:443/page.html",
|
|
"http://scrapy.org/",
|
|
b"http://example.com:443/page.html",
|
|
),
|
|
(
|
|
"ftp://example.com/urls.zip",
|
|
"http://scrapy.org/",
|
|
b"ftp://example.com/urls.zip",
|
|
),
|
|
(
|
|
"ftp://example.com/urls.zip",
|
|
"https://scrapy.org/",
|
|
b"ftp://example.com/urls.zip",
|
|
),
|
|
# test for user/password stripping
|
|
(
|
|
"http://user:password@example.com/page.html",
|
|
"https://not.example.com/",
|
|
b"http://example.com/page.html",
|
|
),
|
|
]
|
|
|
|
|
|
class MixinSameOrigin:
|
|
scenarii: list[tuple[str, str, bytes | None]] = [
|
|
# Same origin (protocol, host, port): send referrer
|
|
(
|
|
"https://example.com/page.html",
|
|
"https://example.com/not-page.html",
|
|
b"https://example.com/page.html",
|
|
),
|
|
(
|
|
"http://example.com/page.html",
|
|
"http://example.com/not-page.html",
|
|
b"http://example.com/page.html",
|
|
),
|
|
(
|
|
"https://example.com:443/page.html",
|
|
"https://example.com/not-page.html",
|
|
b"https://example.com/page.html",
|
|
),
|
|
(
|
|
"http://example.com:80/page.html",
|
|
"http://example.com/not-page.html",
|
|
b"http://example.com/page.html",
|
|
),
|
|
(
|
|
"http://example.com/page.html",
|
|
"http://example.com:80/not-page.html",
|
|
b"http://example.com/page.html",
|
|
),
|
|
(
|
|
"http://example.com:8888/page.html",
|
|
"http://example.com:8888/not-page.html",
|
|
b"http://example.com:8888/page.html",
|
|
),
|
|
# Different host: do NOT send referrer
|
|
(
|
|
"https://example.com/page.html",
|
|
"https://not.example.com/otherpage.html",
|
|
None,
|
|
),
|
|
("http://example.com/page.html", "http://not.example.com/otherpage.html", None),
|
|
("http://example.com/page.html", "http://www.example.com/otherpage.html", None),
|
|
# Different port: do NOT send referrer
|
|
(
|
|
"https://example.com:444/page.html",
|
|
"https://example.com/not-page.html",
|
|
None,
|
|
),
|
|
("http://example.com:81/page.html", "http://example.com/not-page.html", None),
|
|
("http://example.com/page.html", "http://example.com:81/not-page.html", None),
|
|
# Different protocols: do NOT send referrer
|
|
("https://example.com/page.html", "http://example.com/not-page.html", None),
|
|
("https://example.com/page.html", "http://not.example.com/", None),
|
|
("ftps://example.com/urls.zip", "https://example.com/not-page.html", None),
|
|
("ftp://example.com/urls.zip", "http://example.com/not-page.html", None),
|
|
("ftps://example.com/urls.zip", "https://example.com/not-page.html", None),
|
|
# test for user/password stripping
|
|
(
|
|
"https://user:password@example.com/page.html",
|
|
"http://example.com/not-page.html",
|
|
None,
|
|
),
|
|
(
|
|
"https://user:password@example.com/page.html",
|
|
"https://example.com/not-page.html",
|
|
b"https://example.com/page.html",
|
|
),
|
|
]
|
|
|
|
|
|
class MixinOrigin:
|
|
scenarii: list[tuple[str, str, bytes | None]] = [
|
|
# TLS or non-TLS to TLS or non-TLS: referrer origin is sent (yes, even for downgrades)
|
|
(
|
|
"https://example.com/page.html",
|
|
"https://example.com/not-page.html",
|
|
b"https://example.com/",
|
|
),
|
|
(
|
|
"https://example.com/page.html",
|
|
"https://scrapy.org",
|
|
b"https://example.com/",
|
|
),
|
|
("https://example.com/page.html", "http://scrapy.org", b"https://example.com/"),
|
|
("http://example.com/page.html", "http://scrapy.org", b"http://example.com/"),
|
|
# test for user/password stripping
|
|
(
|
|
"https://user:password@example.com/page.html",
|
|
"http://scrapy.org",
|
|
b"https://example.com/",
|
|
),
|
|
]
|
|
|
|
|
|
class MixinStrictOrigin:
|
|
scenarii: list[tuple[str, str, bytes | None]] = [
|
|
# TLS or non-TLS to TLS or non-TLS: referrer origin is sent but not for downgrades
|
|
(
|
|
"https://example.com/page.html",
|
|
"https://example.com/not-page.html",
|
|
b"https://example.com/",
|
|
),
|
|
(
|
|
"https://example.com/page.html",
|
|
"https://scrapy.org",
|
|
b"https://example.com/",
|
|
),
|
|
("http://example.com/page.html", "http://scrapy.org", b"http://example.com/"),
|
|
# downgrade: send nothing
|
|
("https://example.com/page.html", "http://scrapy.org", None),
|
|
# upgrade: send origin
|
|
("http://example.com/page.html", "https://scrapy.org", b"http://example.com/"),
|
|
# test for user/password stripping
|
|
(
|
|
"https://user:password@example.com/page.html",
|
|
"https://scrapy.org",
|
|
b"https://example.com/",
|
|
),
|
|
("https://user:password@example.com/page.html", "http://scrapy.org", None),
|
|
]
|
|
|
|
|
|
class MixinOriginWhenCrossOrigin:
|
|
scenarii: list[tuple[str, str, bytes | None]] = [
|
|
# Same origin (protocol, host, port): send referrer
|
|
(
|
|
"https://example.com/page.html",
|
|
"https://example.com/not-page.html",
|
|
b"https://example.com/page.html",
|
|
),
|
|
(
|
|
"http://example.com/page.html",
|
|
"http://example.com/not-page.html",
|
|
b"http://example.com/page.html",
|
|
),
|
|
(
|
|
"https://example.com:443/page.html",
|
|
"https://example.com/not-page.html",
|
|
b"https://example.com/page.html",
|
|
),
|
|
(
|
|
"http://example.com:80/page.html",
|
|
"http://example.com/not-page.html",
|
|
b"http://example.com/page.html",
|
|
),
|
|
(
|
|
"http://example.com/page.html",
|
|
"http://example.com:80/not-page.html",
|
|
b"http://example.com/page.html",
|
|
),
|
|
(
|
|
"http://example.com:8888/page.html",
|
|
"http://example.com:8888/not-page.html",
|
|
b"http://example.com:8888/page.html",
|
|
),
|
|
# Different host: send origin as referrer
|
|
(
|
|
"https://example2.com/page.html",
|
|
"https://scrapy.org/otherpage.html",
|
|
b"https://example2.com/",
|
|
),
|
|
(
|
|
"https://example2.com/page.html",
|
|
"https://not.example2.com/otherpage.html",
|
|
b"https://example2.com/",
|
|
),
|
|
(
|
|
"http://example2.com/page.html",
|
|
"http://not.example2.com/otherpage.html",
|
|
b"http://example2.com/",
|
|
),
|
|
# exact match required
|
|
(
|
|
"http://example2.com/page.html",
|
|
"http://www.example2.com/otherpage.html",
|
|
b"http://example2.com/",
|
|
),
|
|
# Different port: send origin as referrer
|
|
(
|
|
"https://example3.com:444/page.html",
|
|
"https://example3.com/not-page.html",
|
|
b"https://example3.com:444/",
|
|
),
|
|
(
|
|
"http://example3.com:81/page.html",
|
|
"http://example3.com/not-page.html",
|
|
b"http://example3.com:81/",
|
|
),
|
|
# Different protocols: send origin as referrer
|
|
(
|
|
"https://example4.com/page.html",
|
|
"http://example4.com/not-page.html",
|
|
b"https://example4.com/",
|
|
),
|
|
(
|
|
"https://example4.com/page.html",
|
|
"http://not.example4.com/",
|
|
b"https://example4.com/",
|
|
),
|
|
(
|
|
"ftps://example4.com/urls.zip",
|
|
"https://example4.com/not-page.html",
|
|
b"ftps://example4.com/",
|
|
),
|
|
(
|
|
"ftp://example4.com/urls.zip",
|
|
"http://example4.com/not-page.html",
|
|
b"ftp://example4.com/",
|
|
),
|
|
(
|
|
"ftps://example4.com/urls.zip",
|
|
"https://example4.com/not-page.html",
|
|
b"ftps://example4.com/",
|
|
),
|
|
# test for user/password stripping
|
|
(
|
|
"https://user:password@example5.com/page.html",
|
|
"https://example5.com/not-page.html",
|
|
b"https://example5.com/page.html",
|
|
),
|
|
# TLS to non-TLS downgrade: send origin
|
|
(
|
|
"https://user:password@example5.com/page.html",
|
|
"http://example5.com/not-page.html",
|
|
b"https://example5.com/",
|
|
),
|
|
]
|
|
|
|
|
|
class MixinStrictOriginWhenCrossOrigin:
|
|
scenarii: list[tuple[str, str, bytes | None]] = [
|
|
# Same origin (protocol, host, port): send referrer
|
|
(
|
|
"https://example.com/page.html",
|
|
"https://example.com/not-page.html",
|
|
b"https://example.com/page.html",
|
|
),
|
|
(
|
|
"http://example.com/page.html",
|
|
"http://example.com/not-page.html",
|
|
b"http://example.com/page.html",
|
|
),
|
|
(
|
|
"https://example.com:443/page.html",
|
|
"https://example.com/not-page.html",
|
|
b"https://example.com/page.html",
|
|
),
|
|
(
|
|
"http://example.com:80/page.html",
|
|
"http://example.com/not-page.html",
|
|
b"http://example.com/page.html",
|
|
),
|
|
(
|
|
"http://example.com/page.html",
|
|
"http://example.com:80/not-page.html",
|
|
b"http://example.com/page.html",
|
|
),
|
|
(
|
|
"http://example.com:8888/page.html",
|
|
"http://example.com:8888/not-page.html",
|
|
b"http://example.com:8888/page.html",
|
|
),
|
|
# Different host: send origin as referrer
|
|
(
|
|
"https://example2.com/page.html",
|
|
"https://scrapy.org/otherpage.html",
|
|
b"https://example2.com/",
|
|
),
|
|
(
|
|
"https://example2.com/page.html",
|
|
"https://not.example2.com/otherpage.html",
|
|
b"https://example2.com/",
|
|
),
|
|
(
|
|
"http://example2.com/page.html",
|
|
"http://not.example2.com/otherpage.html",
|
|
b"http://example2.com/",
|
|
),
|
|
# exact match required
|
|
(
|
|
"http://example2.com/page.html",
|
|
"http://www.example2.com/otherpage.html",
|
|
b"http://example2.com/",
|
|
),
|
|
# Different port: send origin as referrer
|
|
(
|
|
"https://example3.com:444/page.html",
|
|
"https://example3.com/not-page.html",
|
|
b"https://example3.com:444/",
|
|
),
|
|
(
|
|
"http://example3.com:81/page.html",
|
|
"http://example3.com/not-page.html",
|
|
b"http://example3.com:81/",
|
|
),
|
|
# downgrade
|
|
("https://example4.com/page.html", "http://example4.com/not-page.html", None),
|
|
("https://example4.com/page.html", "http://not.example4.com/", None),
|
|
# non-TLS to non-TLS
|
|
(
|
|
"ftp://example4.com/urls.zip",
|
|
"http://example4.com/not-page.html",
|
|
b"ftp://example4.com/",
|
|
),
|
|
# upgrade
|
|
(
|
|
"http://example4.com/page.html",
|
|
"https://example4.com/not-page.html",
|
|
b"http://example4.com/",
|
|
),
|
|
(
|
|
"http://example4.com/page.html",
|
|
"https://not.example4.com/",
|
|
b"http://example4.com/",
|
|
),
|
|
# Different protocols: send origin as referrer
|
|
(
|
|
"ftps://example4.com/urls.zip",
|
|
"https://example4.com/not-page.html",
|
|
b"ftps://example4.com/",
|
|
),
|
|
(
|
|
"ftps://example4.com/urls.zip",
|
|
"https://example4.com/not-page.html",
|
|
b"ftps://example4.com/",
|
|
),
|
|
# test for user/password stripping
|
|
(
|
|
"https://user:password@example5.com/page.html",
|
|
"https://example5.com/not-page.html",
|
|
b"https://example5.com/page.html",
|
|
),
|
|
# TLS to non-TLS downgrade: send nothing
|
|
(
|
|
"https://user:password@example5.com/page.html",
|
|
"http://example5.com/not-page.html",
|
|
None,
|
|
),
|
|
]
|
|
|
|
|
|
class MixinUnsafeUrl:
|
|
scenarii: list[tuple[str, str, bytes | None]] = [
|
|
# TLS to TLS: send referrer
|
|
(
|
|
"https://example.com/sekrit.html",
|
|
"http://not.example.com/",
|
|
b"https://example.com/sekrit.html",
|
|
),
|
|
(
|
|
"https://example1.com/page.html",
|
|
"https://not.example1.com/",
|
|
b"https://example1.com/page.html",
|
|
),
|
|
(
|
|
"https://example1.com/page.html",
|
|
"https://scrapy.org/",
|
|
b"https://example1.com/page.html",
|
|
),
|
|
(
|
|
"https://example1.com:443/page.html",
|
|
"https://scrapy.org/",
|
|
b"https://example1.com/page.html",
|
|
),
|
|
(
|
|
"https://example1.com:444/page.html",
|
|
"https://scrapy.org/",
|
|
b"https://example1.com:444/page.html",
|
|
),
|
|
(
|
|
"ftps://example1.com/urls.zip",
|
|
"https://scrapy.org/",
|
|
b"ftps://example1.com/urls.zip",
|
|
),
|
|
# TLS to non-TLS: send referrer (yes, it's unsafe)
|
|
(
|
|
"https://example2.com/page.html",
|
|
"http://not.example2.com/",
|
|
b"https://example2.com/page.html",
|
|
),
|
|
(
|
|
"https://example2.com/page.html",
|
|
"http://scrapy.org/",
|
|
b"https://example2.com/page.html",
|
|
),
|
|
(
|
|
"ftps://example2.com/urls.zip",
|
|
"http://scrapy.org/",
|
|
b"ftps://example2.com/urls.zip",
|
|
),
|
|
# non-TLS to TLS or non-TLS: send referrer (yes, it's unsafe)
|
|
(
|
|
"http://example3.com/page.html",
|
|
"https://not.example3.com/",
|
|
b"http://example3.com/page.html",
|
|
),
|
|
(
|
|
"http://example3.com/page.html",
|
|
"https://scrapy.org/",
|
|
b"http://example3.com/page.html",
|
|
),
|
|
(
|
|
"http://example3.com:8080/page.html",
|
|
"https://scrapy.org/",
|
|
b"http://example3.com:8080/page.html",
|
|
),
|
|
(
|
|
"http://example3.com:80/page.html",
|
|
"http://not.example3.com/",
|
|
b"http://example3.com/page.html",
|
|
),
|
|
(
|
|
"http://example3.com/page.html",
|
|
"http://scrapy.org/",
|
|
b"http://example3.com/page.html",
|
|
),
|
|
(
|
|
"http://example3.com:443/page.html",
|
|
"http://scrapy.org/",
|
|
b"http://example3.com:443/page.html",
|
|
),
|
|
(
|
|
"ftp://example3.com/urls.zip",
|
|
"http://scrapy.org/",
|
|
b"ftp://example3.com/urls.zip",
|
|
),
|
|
(
|
|
"ftp://example3.com/urls.zip",
|
|
"https://scrapy.org/",
|
|
b"ftp://example3.com/urls.zip",
|
|
),
|
|
# test for user/password stripping
|
|
(
|
|
"http://user:password@example4.com/page.html",
|
|
"https://not.example4.com/",
|
|
b"http://example4.com/page.html",
|
|
),
|
|
(
|
|
"https://user:password@example4.com/page.html",
|
|
"http://scrapy.org/",
|
|
b"https://example4.com/page.html",
|
|
),
|
|
]
|
|
|
|
|
|
class TestRefererMiddlewareDefault(MixinDefault, TestRefererMiddleware):
|
|
pass
|
|
|
|
|
|
# --- Tests using settings to set policy using class path
|
|
class TestSettingsNoReferrer(MixinNoReferrer, TestRefererMiddleware):
|
|
settings = {"REFERRER_POLICY": "scrapy.spidermiddlewares.referer.NoReferrerPolicy"}
|
|
|
|
|
|
class TestSettingsNoReferrerWhenDowngrade(
|
|
MixinNoReferrerWhenDowngrade, TestRefererMiddleware
|
|
):
|
|
settings = {
|
|
"REFERRER_POLICY": "scrapy.spidermiddlewares.referer.NoReferrerWhenDowngradePolicy"
|
|
}
|
|
|
|
|
|
class TestSettingsSameOrigin(MixinSameOrigin, TestRefererMiddleware):
|
|
settings = {"REFERRER_POLICY": "scrapy.spidermiddlewares.referer.SameOriginPolicy"}
|
|
|
|
|
|
class TestSettingsOrigin(MixinOrigin, TestRefererMiddleware):
|
|
settings = {"REFERRER_POLICY": "scrapy.spidermiddlewares.referer.OriginPolicy"}
|
|
|
|
|
|
class TestSettingsStrictOrigin(MixinStrictOrigin, TestRefererMiddleware):
|
|
settings = {
|
|
"REFERRER_POLICY": "scrapy.spidermiddlewares.referer.StrictOriginPolicy"
|
|
}
|
|
|
|
|
|
class TestSettingsOriginWhenCrossOrigin(
|
|
MixinOriginWhenCrossOrigin, TestRefererMiddleware
|
|
):
|
|
settings = {
|
|
"REFERRER_POLICY": "scrapy.spidermiddlewares.referer.OriginWhenCrossOriginPolicy"
|
|
}
|
|
|
|
|
|
class TestSettingsStrictOriginWhenCrossOrigin(
|
|
MixinStrictOriginWhenCrossOrigin, TestRefererMiddleware
|
|
):
|
|
settings = {
|
|
"REFERRER_POLICY": "scrapy.spidermiddlewares.referer.StrictOriginWhenCrossOriginPolicy"
|
|
}
|
|
|
|
|
|
class TestSettingsUnsafeUrl(MixinUnsafeUrl, TestRefererMiddleware):
|
|
settings = {"REFERRER_POLICY": "scrapy.spidermiddlewares.referer.UnsafeUrlPolicy"}
|
|
|
|
|
|
class CustomPythonOrgPolicy(ReferrerPolicy):
|
|
"""
|
|
A dummy policy that returns referrer as http(s)://python.org
|
|
depending on the scheme of the target URL.
|
|
"""
|
|
|
|
def referrer(self, response, request):
|
|
scheme = urlparse(request).scheme
|
|
if scheme == "https":
|
|
return b"https://python.org/"
|
|
if scheme == "http":
|
|
return b"http://python.org/"
|
|
return None
|
|
|
|
|
|
class TestSettingsCustomPolicy(TestRefererMiddleware):
|
|
settings = {"REFERRER_POLICY": CustomPythonOrgPolicy}
|
|
scenarii = [
|
|
("https://example.com/", "https://scrapy.org/", b"https://python.org/"),
|
|
("http://example.com/", "http://scrapy.org/", b"http://python.org/"),
|
|
("http://example.com/", "https://scrapy.org/", b"https://python.org/"),
|
|
("https://example.com/", "http://scrapy.org/", b"http://python.org/"),
|
|
(
|
|
"file:///home/path/to/somefile.html",
|
|
"https://scrapy.org/",
|
|
b"https://python.org/",
|
|
),
|
|
(
|
|
"file:///home/path/to/somefile.html",
|
|
"http://scrapy.org/",
|
|
b"http://python.org/",
|
|
),
|
|
]
|
|
|
|
|
|
# --- Tests using Request meta dict to set policy
|
|
class TestRequestMetaDefault(MixinDefault, TestRefererMiddleware):
|
|
req_meta = {"referrer_policy": POLICY_SCRAPY_DEFAULT}
|
|
|
|
|
|
class TestRequestMetaNoReferrer(MixinNoReferrer, TestRefererMiddleware):
|
|
req_meta = {"referrer_policy": POLICY_NO_REFERRER}
|
|
|
|
|
|
class TestRequestMetaNoReferrerWhenDowngrade(
|
|
MixinNoReferrerWhenDowngrade, TestRefererMiddleware
|
|
):
|
|
req_meta = {"referrer_policy": POLICY_NO_REFERRER_WHEN_DOWNGRADE}
|
|
|
|
|
|
class TestRequestMetaSameOrigin(MixinSameOrigin, TestRefererMiddleware):
|
|
req_meta = {"referrer_policy": POLICY_SAME_ORIGIN}
|
|
|
|
|
|
class TestRequestMetaOrigin(MixinOrigin, TestRefererMiddleware):
|
|
req_meta = {"referrer_policy": POLICY_ORIGIN}
|
|
|
|
|
|
class TestRequestMetaSrictOrigin(MixinStrictOrigin, TestRefererMiddleware):
|
|
req_meta = {"referrer_policy": POLICY_STRICT_ORIGIN}
|
|
|
|
|
|
class TestRequestMetaOriginWhenCrossOrigin(
|
|
MixinOriginWhenCrossOrigin, TestRefererMiddleware
|
|
):
|
|
req_meta = {"referrer_policy": POLICY_ORIGIN_WHEN_CROSS_ORIGIN}
|
|
|
|
|
|
class TestRequestMetaStrictOriginWhenCrossOrigin(
|
|
MixinStrictOriginWhenCrossOrigin, TestRefererMiddleware
|
|
):
|
|
req_meta = {"referrer_policy": POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN}
|
|
|
|
|
|
class TestRequestMetaUnsafeUrl(MixinUnsafeUrl, TestRefererMiddleware):
|
|
req_meta = {"referrer_policy": POLICY_UNSAFE_URL}
|
|
|
|
|
|
class TestRequestMetaPrecedence001(MixinUnsafeUrl, TestRefererMiddleware):
|
|
settings = {"REFERRER_POLICY": "scrapy.spidermiddlewares.referer.SameOriginPolicy"}
|
|
req_meta = {"referrer_policy": POLICY_UNSAFE_URL}
|
|
|
|
|
|
class TestRequestMetaPrecedence002(MixinNoReferrer, TestRefererMiddleware):
|
|
settings = {
|
|
"REFERRER_POLICY": "scrapy.spidermiddlewares.referer.NoReferrerWhenDowngradePolicy"
|
|
}
|
|
req_meta = {"referrer_policy": POLICY_NO_REFERRER}
|
|
|
|
|
|
class TestRequestMetaPrecedence003(MixinUnsafeUrl, TestRefererMiddleware):
|
|
settings = {
|
|
"REFERRER_POLICY": "scrapy.spidermiddlewares.referer.OriginWhenCrossOriginPolicy"
|
|
}
|
|
req_meta = {"referrer_policy": POLICY_UNSAFE_URL}
|
|
|
|
|
|
class TestRequestMetaSettingFallback(TestCase):
|
|
params = [
|
|
(
|
|
# When an unknown policy is referenced in Request.meta
|
|
# (here, a typo error),
|
|
# the policy defined in settings takes precedence
|
|
{
|
|
"REFERRER_POLICY": "scrapy.spidermiddlewares.referer.OriginWhenCrossOriginPolicy"
|
|
},
|
|
{},
|
|
{"referrer_policy": "ssscrapy-default"},
|
|
OriginWhenCrossOriginPolicy,
|
|
True,
|
|
),
|
|
(
|
|
# same as above but with string value for settings policy
|
|
{"REFERRER_POLICY": "origin-when-cross-origin"},
|
|
{},
|
|
{"referrer_policy": "ssscrapy-default"},
|
|
OriginWhenCrossOriginPolicy,
|
|
True,
|
|
),
|
|
(
|
|
# request meta references a wrong policy but it is set,
|
|
# so the Referrer-Policy header in response is not used,
|
|
# and the settings' policy is applied
|
|
{"REFERRER_POLICY": "origin-when-cross-origin"},
|
|
{"Referrer-Policy": "unsafe-url"},
|
|
{"referrer_policy": "ssscrapy-default"},
|
|
OriginWhenCrossOriginPolicy,
|
|
True,
|
|
),
|
|
(
|
|
# here, request meta does not set the policy
|
|
# so response headers take precedence
|
|
{"REFERRER_POLICY": "origin-when-cross-origin"},
|
|
{"Referrer-Policy": "unsafe-url"},
|
|
{},
|
|
UnsafeUrlPolicy,
|
|
False,
|
|
),
|
|
(
|
|
# here, request meta does not set the policy,
|
|
# but response headers also use an unknown policy,
|
|
# so the settings' policy is used
|
|
{"REFERRER_POLICY": "origin-when-cross-origin"},
|
|
{"Referrer-Policy": "unknown"},
|
|
{},
|
|
OriginWhenCrossOriginPolicy,
|
|
True,
|
|
),
|
|
]
|
|
|
|
def test(self):
|
|
origin = "http://www.scrapy.org"
|
|
target = "http://www.example.com"
|
|
|
|
for (
|
|
settings,
|
|
response_headers,
|
|
request_meta,
|
|
policy_class,
|
|
check_warning,
|
|
) in self.params[3:]:
|
|
mw = RefererMiddleware(Settings(settings))
|
|
|
|
response = Response(origin, headers=response_headers)
|
|
request = Request(target, meta=request_meta)
|
|
|
|
with warnings.catch_warnings(record=True) as w:
|
|
policy = mw.policy(response, request)
|
|
self.assertIsInstance(policy, policy_class)
|
|
|
|
if check_warning:
|
|
self.assertEqual(len(w), 1)
|
|
self.assertEqual(w[0].category, RuntimeWarning, w[0].message)
|
|
|
|
|
|
class TestSettingsPolicyByName(TestCase):
|
|
def test_valid_name(self):
|
|
for s, p in [
|
|
(POLICY_SCRAPY_DEFAULT, DefaultReferrerPolicy),
|
|
(POLICY_NO_REFERRER, NoReferrerPolicy),
|
|
(POLICY_NO_REFERRER_WHEN_DOWNGRADE, NoReferrerWhenDowngradePolicy),
|
|
(POLICY_SAME_ORIGIN, SameOriginPolicy),
|
|
(POLICY_ORIGIN, OriginPolicy),
|
|
(POLICY_STRICT_ORIGIN, StrictOriginPolicy),
|
|
(POLICY_ORIGIN_WHEN_CROSS_ORIGIN, OriginWhenCrossOriginPolicy),
|
|
(POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN, StrictOriginWhenCrossOriginPolicy),
|
|
(POLICY_UNSAFE_URL, UnsafeUrlPolicy),
|
|
]:
|
|
settings = Settings({"REFERRER_POLICY": s})
|
|
mw = RefererMiddleware(settings)
|
|
self.assertEqual(mw.default_policy, p)
|
|
|
|
def test_valid_name_casevariants(self):
|
|
for s, p in [
|
|
(POLICY_SCRAPY_DEFAULT, DefaultReferrerPolicy),
|
|
(POLICY_NO_REFERRER, NoReferrerPolicy),
|
|
(POLICY_NO_REFERRER_WHEN_DOWNGRADE, NoReferrerWhenDowngradePolicy),
|
|
(POLICY_SAME_ORIGIN, SameOriginPolicy),
|
|
(POLICY_ORIGIN, OriginPolicy),
|
|
(POLICY_STRICT_ORIGIN, StrictOriginPolicy),
|
|
(POLICY_ORIGIN_WHEN_CROSS_ORIGIN, OriginWhenCrossOriginPolicy),
|
|
(POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN, StrictOriginWhenCrossOriginPolicy),
|
|
(POLICY_UNSAFE_URL, UnsafeUrlPolicy),
|
|
]:
|
|
settings = Settings({"REFERRER_POLICY": s.upper()})
|
|
mw = RefererMiddleware(settings)
|
|
self.assertEqual(mw.default_policy, p)
|
|
|
|
def test_invalid_name(self):
|
|
settings = Settings({"REFERRER_POLICY": "some-custom-unknown-policy"})
|
|
with self.assertRaises(RuntimeError):
|
|
RefererMiddleware(settings)
|
|
|
|
def test_multiple_policy_tokens(self):
|
|
# test parsing without space(s) after the comma
|
|
settings1 = Settings(
|
|
{
|
|
"REFERRER_POLICY": (
|
|
f"some-custom-unknown-policy,"
|
|
f"{POLICY_SAME_ORIGIN},"
|
|
f"{POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN},"
|
|
f"another-custom-unknown-policy"
|
|
)
|
|
}
|
|
)
|
|
mw1 = RefererMiddleware(settings1)
|
|
self.assertEqual(mw1.default_policy, StrictOriginWhenCrossOriginPolicy)
|
|
|
|
# test parsing with space(s) after the comma
|
|
settings2 = Settings(
|
|
{
|
|
"REFERRER_POLICY": (
|
|
f"{POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN},"
|
|
f" another-custom-unknown-policy,"
|
|
f" {POLICY_UNSAFE_URL}"
|
|
)
|
|
}
|
|
)
|
|
mw2 = RefererMiddleware(settings2)
|
|
self.assertEqual(mw2.default_policy, UnsafeUrlPolicy)
|
|
|
|
def test_multiple_policy_tokens_all_invalid(self):
|
|
settings = Settings(
|
|
{
|
|
"REFERRER_POLICY": (
|
|
"some-custom-unknown-policy,"
|
|
"another-custom-unknown-policy,"
|
|
"yet-another-custom-unknown-policy"
|
|
)
|
|
}
|
|
)
|
|
with self.assertRaises(RuntimeError):
|
|
RefererMiddleware(settings)
|
|
|
|
|
|
class TestPolicyHeaderPrecedence001(MixinUnsafeUrl, TestRefererMiddleware):
|
|
settings = {"REFERRER_POLICY": "scrapy.spidermiddlewares.referer.SameOriginPolicy"}
|
|
resp_headers = {"Referrer-Policy": POLICY_UNSAFE_URL.upper()}
|
|
|
|
|
|
class TestPolicyHeaderPrecedence002(MixinNoReferrer, TestRefererMiddleware):
|
|
settings = {
|
|
"REFERRER_POLICY": "scrapy.spidermiddlewares.referer.NoReferrerWhenDowngradePolicy"
|
|
}
|
|
resp_headers = {"Referrer-Policy": POLICY_NO_REFERRER.swapcase()}
|
|
|
|
|
|
class TestPolicyHeaderPrecedence003(
|
|
MixinNoReferrerWhenDowngrade, TestRefererMiddleware
|
|
):
|
|
settings = {
|
|
"REFERRER_POLICY": "scrapy.spidermiddlewares.referer.OriginWhenCrossOriginPolicy"
|
|
}
|
|
resp_headers = {"Referrer-Policy": POLICY_NO_REFERRER_WHEN_DOWNGRADE.title()}
|
|
|
|
|
|
class TestPolicyHeaderPrecedence004(
|
|
MixinNoReferrerWhenDowngrade, TestRefererMiddleware
|
|
):
|
|
"""
|
|
The empty string means "no-referrer-when-downgrade"
|
|
"""
|
|
|
|
settings = {
|
|
"REFERRER_POLICY": "scrapy.spidermiddlewares.referer.OriginWhenCrossOriginPolicy"
|
|
}
|
|
resp_headers = {"Referrer-Policy": ""}
|
|
|
|
|
|
class TestReferrerOnRedirect(TestRefererMiddleware):
|
|
settings = {"REFERRER_POLICY": "scrapy.spidermiddlewares.referer.UnsafeUrlPolicy"}
|
|
scenarii: list[
|
|
tuple[str, str, tuple[tuple[int, str], ...], bytes | None, bytes | None]
|
|
] = [ # type: ignore[assignment]
|
|
(
|
|
"http://scrapytest.org/1", # parent
|
|
"http://scrapytest.org/2", # target
|
|
(
|
|
# redirections: code, URL
|
|
(301, "http://scrapytest.org/3"),
|
|
(301, "http://scrapytest.org/4"),
|
|
),
|
|
b"http://scrapytest.org/1", # expected initial referer
|
|
b"http://scrapytest.org/1", # expected referer for the redirection request
|
|
),
|
|
(
|
|
"https://scrapytest.org/1",
|
|
"https://scrapytest.org/2",
|
|
(
|
|
# redirecting to non-secure URL
|
|
(301, "http://scrapytest.org/3"),
|
|
),
|
|
b"https://scrapytest.org/1",
|
|
b"https://scrapytest.org/1",
|
|
),
|
|
(
|
|
"https://scrapytest.org/1",
|
|
"https://scrapytest.com/2",
|
|
(
|
|
# redirecting to non-secure URL: different origin
|
|
(301, "http://scrapytest.com/3"),
|
|
),
|
|
b"https://scrapytest.org/1",
|
|
b"https://scrapytest.org/1",
|
|
),
|
|
]
|
|
|
|
def setUp(self):
|
|
self.spider = Spider("foo")
|
|
settings = Settings(self.settings)
|
|
self.referrermw = RefererMiddleware(settings)
|
|
self.redirectmw = RedirectMiddleware(settings)
|
|
|
|
def test(self):
|
|
for (
|
|
parent,
|
|
target,
|
|
redirections,
|
|
init_referrer,
|
|
final_referrer,
|
|
) in self.scenarii:
|
|
response = self.get_response(parent)
|
|
request = self.get_request(target)
|
|
|
|
out = list(
|
|
self.referrermw.process_spider_output(response, [request], self.spider)
|
|
)
|
|
self.assertEqual(out[0].headers.get("Referer"), init_referrer)
|
|
|
|
for status, url in redirections:
|
|
response = Response(
|
|
request.url, headers={"Location": url}, status=status
|
|
)
|
|
request = self.redirectmw.process_response(
|
|
request, response, self.spider
|
|
)
|
|
self.referrermw.request_scheduled(request, self.spider)
|
|
|
|
assert isinstance(request, Request)
|
|
self.assertEqual(request.headers.get("Referer"), final_referrer)
|
|
|
|
|
|
class TestReferrerOnRedirectNoReferrer(TestReferrerOnRedirect):
|
|
"""
|
|
No Referrer policy never sets the "Referer" header.
|
|
HTTP redirections should not change that.
|
|
"""
|
|
|
|
settings = {"REFERRER_POLICY": "no-referrer"}
|
|
scenarii = [
|
|
(
|
|
"http://scrapytest.org/1", # parent
|
|
"http://scrapytest.org/2", # target
|
|
(
|
|
# redirections: code, URL
|
|
(301, "http://scrapytest.org/3"),
|
|
(301, "http://scrapytest.org/4"),
|
|
),
|
|
None, # expected initial "Referer"
|
|
None, # expected "Referer" for the redirection request
|
|
),
|
|
(
|
|
"https://scrapytest.org/1",
|
|
"https://scrapytest.org/2",
|
|
((301, "http://scrapytest.org/3"),),
|
|
None,
|
|
None,
|
|
),
|
|
(
|
|
"https://scrapytest.org/1",
|
|
"https://example.com/2", # different origin
|
|
((301, "http://scrapytest.com/3"),),
|
|
None,
|
|
None,
|
|
),
|
|
]
|
|
|
|
|
|
class TestReferrerOnRedirectSameOrigin(TestReferrerOnRedirect):
|
|
"""
|
|
Same Origin policy sends the full URL as "Referer" if the target origin
|
|
is the same as the parent response (same protocol, same domain, same port).
|
|
|
|
HTTP redirections to a different domain or a lower secure level
|
|
should have the "Referer" removed.
|
|
"""
|
|
|
|
settings = {"REFERRER_POLICY": "same-origin"}
|
|
scenarii = [
|
|
(
|
|
"http://scrapytest.org/101", # origin
|
|
"http://scrapytest.org/102", # target
|
|
(
|
|
# redirections: code, URL
|
|
(301, "http://scrapytest.org/103"),
|
|
(301, "http://scrapytest.org/104"),
|
|
),
|
|
b"http://scrapytest.org/101", # expected initial "Referer"
|
|
b"http://scrapytest.org/101", # expected referer for the redirection request
|
|
),
|
|
(
|
|
"https://scrapytest.org/201",
|
|
"https://scrapytest.org/202",
|
|
(
|
|
# redirecting from secure to non-secure URL == different origin
|
|
(301, "http://scrapytest.org/203"),
|
|
),
|
|
b"https://scrapytest.org/201",
|
|
None,
|
|
),
|
|
(
|
|
"https://scrapytest.org/301",
|
|
"https://scrapytest.org/302",
|
|
(
|
|
# different domain == different origin
|
|
(301, "http://example.com/303"),
|
|
),
|
|
b"https://scrapytest.org/301",
|
|
None,
|
|
),
|
|
]
|
|
|
|
|
|
class TestReferrerOnRedirectStrictOrigin(TestReferrerOnRedirect):
|
|
"""
|
|
Strict Origin policy will always send the "origin" as referrer
|
|
(think of it as the parent URL without the path part),
|
|
unless the security level is lower and no "Referer" is sent.
|
|
|
|
Redirections from secure to non-secure URLs should have the
|
|
"Referrer" header removed if necessary.
|
|
"""
|
|
|
|
settings = {"REFERRER_POLICY": POLICY_STRICT_ORIGIN}
|
|
scenarii = [
|
|
(
|
|
"http://scrapytest.org/101",
|
|
"http://scrapytest.org/102",
|
|
(
|
|
(301, "http://scrapytest.org/103"),
|
|
(301, "http://scrapytest.org/104"),
|
|
),
|
|
b"http://scrapytest.org/", # send origin
|
|
b"http://scrapytest.org/", # redirects to same origin: send origin
|
|
),
|
|
(
|
|
"https://scrapytest.org/201",
|
|
"https://scrapytest.org/202",
|
|
(
|
|
# redirecting to non-secure URL: no referrer
|
|
(301, "http://scrapytest.org/203"),
|
|
),
|
|
b"https://scrapytest.org/",
|
|
None,
|
|
),
|
|
(
|
|
"https://scrapytest.org/301",
|
|
"https://scrapytest.org/302",
|
|
(
|
|
# redirecting to non-secure URL (different domain): no referrer
|
|
(301, "http://example.com/303"),
|
|
),
|
|
b"https://scrapytest.org/",
|
|
None,
|
|
),
|
|
(
|
|
"http://scrapy.org/401",
|
|
"http://example.com/402",
|
|
((301, "http://scrapytest.org/403"),),
|
|
b"http://scrapy.org/",
|
|
b"http://scrapy.org/",
|
|
),
|
|
(
|
|
"https://scrapy.org/501",
|
|
"https://example.com/502",
|
|
(
|
|
# HTTPS all along, so origin referrer is kept as-is
|
|
(301, "https://google.com/503"),
|
|
(301, "https://facebook.com/504"),
|
|
),
|
|
b"https://scrapy.org/",
|
|
b"https://scrapy.org/",
|
|
),
|
|
(
|
|
"https://scrapytest.org/601",
|
|
"http://scrapytest.org/602", # TLS to non-TLS: no referrer
|
|
(
|
|
(
|
|
301,
|
|
"https://scrapytest.org/603",
|
|
), # TLS URL again: (still) no referrer
|
|
),
|
|
None,
|
|
None,
|
|
),
|
|
]
|
|
|
|
|
|
class TestReferrerOnRedirectOriginWhenCrossOrigin(TestReferrerOnRedirect):
|
|
"""
|
|
Origin When Cross-Origin policy sends the full URL as "Referer",
|
|
unless the target's origin is different (different domain, different protocol)
|
|
in which case only the origin is sent.
|
|
|
|
Redirections to a different origin should strip the "Referer"
|
|
to the parent origin.
|
|
"""
|
|
|
|
settings = {"REFERRER_POLICY": POLICY_ORIGIN_WHEN_CROSS_ORIGIN}
|
|
scenarii = [
|
|
(
|
|
"http://scrapytest.org/101", # origin
|
|
"http://scrapytest.org/102", # target + redirection
|
|
(
|
|
# redirections: code, URL
|
|
(301, "http://scrapytest.org/103"),
|
|
(301, "http://scrapytest.org/104"),
|
|
),
|
|
b"http://scrapytest.org/101", # expected initial referer
|
|
b"http://scrapytest.org/101", # expected referer for the redirection request
|
|
),
|
|
(
|
|
"https://scrapytest.org/201",
|
|
"https://scrapytest.org/202",
|
|
(
|
|
# redirecting to non-secure URL: send origin
|
|
(301, "http://scrapytest.org/203"),
|
|
),
|
|
b"https://scrapytest.org/201",
|
|
b"https://scrapytest.org/",
|
|
),
|
|
(
|
|
"https://scrapytest.org/301",
|
|
"https://scrapytest.org/302",
|
|
(
|
|
# redirecting to non-secure URL (different domain): send origin
|
|
(301, "http://example.com/303"),
|
|
),
|
|
b"https://scrapytest.org/301",
|
|
b"https://scrapytest.org/",
|
|
),
|
|
(
|
|
"http://scrapy.org/401",
|
|
"http://example.com/402",
|
|
((301, "http://scrapytest.org/403"),),
|
|
b"http://scrapy.org/",
|
|
b"http://scrapy.org/",
|
|
),
|
|
(
|
|
"https://scrapy.org/501",
|
|
"https://example.com/502",
|
|
(
|
|
# all different domains: send origin
|
|
(301, "https://google.com/503"),
|
|
(301, "https://facebook.com/504"),
|
|
),
|
|
b"https://scrapy.org/",
|
|
b"https://scrapy.org/",
|
|
),
|
|
(
|
|
"https://scrapytest.org/301",
|
|
"http://scrapytest.org/302", # TLS to non-TLS: send origin
|
|
((301, "https://scrapytest.org/303"),), # TLS URL again: send origin (also)
|
|
b"https://scrapytest.org/",
|
|
b"https://scrapytest.org/",
|
|
),
|
|
]
|
|
|
|
|
|
class TestReferrerOnRedirectStrictOriginWhenCrossOrigin(TestReferrerOnRedirect):
|
|
"""
|
|
Strict Origin When Cross-Origin policy sends the full URL as "Referer",
|
|
unless the target's origin is different (different domain, different protocol)
|
|
in which case only the origin is sent...
|
|
Unless there's also a downgrade in security and then the "Referer" header
|
|
is not sent.
|
|
|
|
Redirections to a different origin should strip the "Referer" to the parent origin,
|
|
and from https:// to http:// will remove the "Referer" header.
|
|
"""
|
|
|
|
settings = {"REFERRER_POLICY": POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN}
|
|
scenarii = [
|
|
(
|
|
"http://scrapytest.org/101", # origin
|
|
"http://scrapytest.org/102", # target + redirection
|
|
(
|
|
# redirections: code, URL
|
|
(301, "http://scrapytest.org/103"),
|
|
(301, "http://scrapytest.org/104"),
|
|
),
|
|
b"http://scrapytest.org/101", # expected initial referer
|
|
b"http://scrapytest.org/101", # expected referer for the redirection request
|
|
),
|
|
(
|
|
"https://scrapytest.org/201",
|
|
"https://scrapytest.org/202",
|
|
(
|
|
# redirecting to non-secure URL: do not send the "Referer" header
|
|
(301, "http://scrapytest.org/203"),
|
|
),
|
|
b"https://scrapytest.org/201",
|
|
None,
|
|
),
|
|
(
|
|
"https://scrapytest.org/301",
|
|
"https://scrapytest.org/302",
|
|
(
|
|
# redirecting to non-secure URL (different domain): send origin
|
|
(301, "http://example.com/303"),
|
|
),
|
|
b"https://scrapytest.org/301",
|
|
None,
|
|
),
|
|
(
|
|
"http://scrapy.org/401",
|
|
"http://example.com/402",
|
|
((301, "http://scrapytest.org/403"),),
|
|
b"http://scrapy.org/",
|
|
b"http://scrapy.org/",
|
|
),
|
|
(
|
|
"https://scrapy.org/501",
|
|
"https://example.com/502",
|
|
(
|
|
# all different domains: send origin
|
|
(301, "https://google.com/503"),
|
|
(301, "https://facebook.com/504"),
|
|
),
|
|
b"https://scrapy.org/",
|
|
b"https://scrapy.org/",
|
|
),
|
|
(
|
|
"https://scrapytest.org/601",
|
|
"http://scrapytest.org/602", # TLS to non-TLS: do not send "Referer"
|
|
(
|
|
(
|
|
301,
|
|
"https://scrapytest.org/603",
|
|
), # TLS URL again: (still) send nothing
|
|
),
|
|
None,
|
|
None,
|
|
),
|
|
]
|