1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 18:44:20 +00:00
scrapy/tests/test_spidermiddleware_referer.py
2017-03-01 17:51:23 +01:00

794 lines
36 KiB
Python

from six.moves.urllib.parse import urlparse
from unittest import TestCase
from scrapy.exceptions import NotConfigured
from scrapy.http import Response, Request
from scrapy.settings import Settings
from scrapy.spiders import Spider
from scrapy.downloadermiddlewares.redirect import RedirectMiddleware
from scrapy.spidermiddlewares.referer import RefererMiddleware, \
POLICY_NO_REFERRER, POLICY_NO_REFERRER_WHEN_DOWNGRADE, \
POLICY_SAME_ORIGIN, POLICY_ORIGIN, POLICY_ORIGIN_WHEN_CROSS_ORIGIN, \
POLICY_SCRAPY_DEFAULT, POLICY_UNSAFE_URL, \
POLICY_STRICT_ORIGIN, POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN, \
DefaultReferrerPolicy, \
NoReferrerPolicy, NoReferrerWhenDowngradePolicy, \
OriginWhenCrossOriginPolicy, OriginPolicy, \
StrictOriginWhenCrossOriginPolicy, StrictOriginPolicy, \
SameOriginPolicy, UnsafeUrlPolicy, ReferrerPolicy
class TestRefererMiddleware(TestCase):
    """Scenario-driven base test case for ``RefererMiddleware``.

    Subclasses customise the class attributes below; ``test`` then feeds every
    scenario through ``process_spider_output`` and checks the "Referer"
    header of the first request that comes out.
    """

    # Passed as ``meta`` to each request built by ``get_request``.
    req_meta = {}
    # Passed as ``headers`` to each response built by ``get_response``.
    resp_headers = {}
    # Wrapped in ``Settings`` and handed to the middleware in ``setUp``.
    settings = {}
    # (response URL, request URL, expected "Referer" header value) triples.
    scenarii = [
        ('http://scrapytest.org', 'http://scrapytest.org/', b'http://scrapytest.org'),
    ]

    def setUp(self):
        """Create the spider and the middleware under test."""
        self.spider = Spider('foo')
        settings = Settings(self.settings)
        self.mw = RefererMiddleware(settings)

    def get_request(self, target):
        """Return a request for ``target`` carrying ``req_meta``."""
        return Request(target, meta=self.req_meta)

    def get_response(self, origin):
        """Return a response for ``origin`` carrying ``resp_headers``."""
        return Response(origin, headers=self.resp_headers)

    def test(self):
        """Check the "Referer" header produced for every scenario."""
        for origin, target, referrer in self.scenarii:
            response = self.get_response(origin)
            request = self.get_request(target)
            out = list(self.mw.process_spider_output(response, [request], self.spider))
            actual = out[0].headers.get('Referer')
            # The message spells out both values and the scenario so that a
            # failure is self-explanatory even when unittest replaces its
            # default message with the custom one.
            self.assertEqual(
                actual, referrer,
                'unexpected Referer %r (expected %r) for %r -> %r'
                % (actual, referrer, origin, target))
class MixinDefault(object):
    """
    Based on https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer-when-downgrade

    with some additional filtering of s3://
    """
    # Each entry is (response URL, request URL, expected "Referer" value);
    # None means that no "Referer" header is expected.
    scenarii = [
        ('https://example.com/', 'https://scrapy.org/', b'https://example.com/'),
        ('http://example.com/', 'http://scrapy.org/', b'http://example.com/'),
        ('http://example.com/', 'https://scrapy.org/', b'http://example.com/'),
        # https:// to http:// downgrade: nothing expected
        ('https://example.com/', 'http://scrapy.org/', None),
        # no credentials leak
        ('http://user:password@example.com/', 'https://scrapy.org/', b'http://example.com/'),
        # no referrer leak for local schemes
        ('file:///home/path/to/somefile.html', 'https://scrapy.org/', None),
        ('file:///home/path/to/somefile.html', 'http://scrapy.org/', None),
        # no referrer leak for s3 origins
        ('s3://mybucket/path/to/data.csv', 'https://scrapy.org/', None),
        ('s3://mybucket/path/to/data.csv', 'http://scrapy.org/', None),
    ]
class MixinNoReferrer(object):
    """Scenarios in which no "Referer" header is ever expected."""
    # Each entry is (response URL, request URL, expected "Referer" value).
    scenarii = [
        ('https://example.com/page.html', 'https://example.com/', None),
        ('http://www.example.com/', 'https://scrapy.org/', None),
        ('http://www.example.com/', 'http://scrapy.org/', None),
        ('https://www.example.com/', 'http://scrapy.org/', None),
        ('file:///home/path/to/somefile.html', 'http://scrapy.org/', None),
    ]
class MixinNoReferrerWhenDowngrade(object):
    """Scenarios expecting the full URL, except on TLS to non-TLS requests."""
    # Each entry is (response URL, request URL, expected "Referer" value);
    # None means that no "Referer" header is expected.
    scenarii = [
        # TLS to TLS: send non-empty referrer
        ('https://example.com/page.html', 'https://not.example.com/', b'https://example.com/page.html'),
        ('https://example.com/page.html', 'https://scrapy.org/', b'https://example.com/page.html'),
        # the explicit default port (443) is not expected in the referrer
        ('https://example.com:443/page.html', 'https://scrapy.org/', b'https://example.com/page.html'),
        ('https://example.com:444/page.html', 'https://scrapy.org/', b'https://example.com:444/page.html'),
        ('ftps://example.com/urls.zip', 'https://scrapy.org/', b'ftps://example.com/urls.zip'),
        # TLS to non-TLS: do not send referrer
        ('https://example.com/page.html', 'http://not.example.com/', None),
        ('https://example.com/page.html', 'http://scrapy.org/', None),
        ('ftps://example.com/urls.zip', 'http://scrapy.org/', None),
        # non-TLS to TLS or non-TLS: send referrer
        ('http://example.com/page.html', 'https://not.example.com/', b'http://example.com/page.html'),
        ('http://example.com/page.html', 'https://scrapy.org/', b'http://example.com/page.html'),
        ('http://example.com:8080/page.html', 'https://scrapy.org/', b'http://example.com:8080/page.html'),
        ('http://example.com:80/page.html', 'http://not.example.com/', b'http://example.com/page.html'),
        ('http://example.com/page.html', 'http://scrapy.org/', b'http://example.com/page.html'),
        ('http://example.com:443/page.html', 'http://scrapy.org/', b'http://example.com:443/page.html'),
        ('ftp://example.com/urls.zip', 'http://scrapy.org/', b'ftp://example.com/urls.zip'),
        ('ftp://example.com/urls.zip', 'https://scrapy.org/', b'ftp://example.com/urls.zip'),
        # test for user/password stripping
        ('http://user:password@example.com/page.html', 'https://not.example.com/', b'http://example.com/page.html'),
    ]
class MixinSameOrigin(object):
    """Scenarios expecting the full URL for same-origin requests only.

    Each entry is (response URL, request URL, expected "Referer" value);
    None means that no "Referer" header is expected.
    """
    scenarii = [
        # Same origin (protocol, host, port): send referrer
        ('https://example.com/page.html', 'https://example.com/not-page.html', b'https://example.com/page.html'),
        ('http://example.com/page.html', 'http://example.com/not-page.html', b'http://example.com/page.html'),
        ('https://example.com:443/page.html', 'https://example.com/not-page.html', b'https://example.com/page.html'),
        ('http://example.com:80/page.html', 'http://example.com/not-page.html', b'http://example.com/page.html'),
        ('http://example.com/page.html', 'http://example.com:80/not-page.html', b'http://example.com/page.html'),
        ('http://example.com:8888/page.html', 'http://example.com:8888/not-page.html', b'http://example.com:8888/page.html'),
        # Different host: do NOT send referrer
        ('https://example.com/page.html', 'https://not.example.com/otherpage.html', None),
        ('http://example.com/page.html', 'http://not.example.com/otherpage.html', None),
        ('http://example.com/page.html', 'http://www.example.com/otherpage.html', None),
        # Different port: do NOT send referrer
        ('https://example.com:444/page.html', 'https://example.com/not-page.html', None),
        ('http://example.com:81/page.html', 'http://example.com/not-page.html', None),
        ('http://example.com/page.html', 'http://example.com:81/not-page.html', None),
        # Different protocols: do NOT send referrer
        # (the ftps -> https scenario used to be listed twice; one copy is enough)
        ('https://example.com/page.html', 'http://example.com/not-page.html', None),
        ('https://example.com/page.html', 'http://not.example.com/', None),
        ('ftps://example.com/urls.zip', 'https://example.com/not-page.html', None),
        ('ftp://example.com/urls.zip', 'http://example.com/not-page.html', None),
        # test for user/password stripping
        ('https://user:password@example.com/page.html', 'https://example.com/not-page.html', b'https://example.com/page.html'),
        ('https://user:password@example.com/page.html', 'http://example.com/not-page.html', None),
    ]
class MixinOrigin(object):
    """Scenarios expecting only the origin of the response URL as referrer."""
    # Each entry is (response URL, request URL, expected "Referer" value).
    scenarii = [
        # TLS or non-TLS to TLS or non-TLS: referrer origin is sent (yes, even for downgrades)
        ('https://example.com/page.html', 'https://example.com/not-page.html', b'https://example.com/'),
        ('https://example.com/page.html', 'https://scrapy.org', b'https://example.com/'),
        ('https://example.com/page.html', 'http://scrapy.org', b'https://example.com/'),
        ('http://example.com/page.html', 'http://scrapy.org', b'http://example.com/'),
        # test for user/password stripping
        ('https://user:password@example.com/page.html', 'http://scrapy.org', b'https://example.com/'),
    ]
class MixinStrictOrigin(object):
    """Scenarios expecting the origin as referrer, and nothing on downgrades."""
    # Each entry is (response URL, request URL, expected "Referer" value);
    # None means that no "Referer" header is expected.
    scenarii = [
        # TLS or non-TLS to TLS or non-TLS: referrer origin is sent but not for downgrades
        ('https://example.com/page.html', 'https://example.com/not-page.html', b'https://example.com/'),
        ('https://example.com/page.html', 'https://scrapy.org', b'https://example.com/'),
        ('http://example.com/page.html', 'http://scrapy.org', b'http://example.com/'),
        # downgrade: send nothing
        ('https://example.com/page.html', 'http://scrapy.org', None),
        # upgrade: send origin
        ('http://example.com/page.html', 'https://scrapy.org', b'http://example.com/'),
        # test for user/password stripping
        ('https://user:password@example.com/page.html', 'https://scrapy.org', b'https://example.com/'),
        ('https://user:password@example.com/page.html', 'http://scrapy.org', None),
    ]
class MixinOriginWhenCrossOrigin(object):
    """Scenarios expecting the full URL same-origin, the origin otherwise.

    Each entry is (response URL, request URL, expected "Referer" value).
    """
    scenarii = [
        # Same origin (protocol, host, port): send referrer
        ('https://example.com/page.html', 'https://example.com/not-page.html', b'https://example.com/page.html'),
        ('http://example.com/page.html', 'http://example.com/not-page.html', b'http://example.com/page.html'),
        ('https://example.com:443/page.html', 'https://example.com/not-page.html', b'https://example.com/page.html'),
        ('http://example.com:80/page.html', 'http://example.com/not-page.html', b'http://example.com/page.html'),
        ('http://example.com/page.html', 'http://example.com:80/not-page.html', b'http://example.com/page.html'),
        ('http://example.com:8888/page.html', 'http://example.com:8888/not-page.html', b'http://example.com:8888/page.html'),
        # Different host: send origin as referrer
        ('https://example2.com/page.html', 'https://scrapy.org/otherpage.html', b'https://example2.com/'),
        ('https://example2.com/page.html', 'https://not.example2.com/otherpage.html', b'https://example2.com/'),
        ('http://example2.com/page.html', 'http://not.example2.com/otherpage.html', b'http://example2.com/'),
        # exact match required
        ('http://example2.com/page.html', 'http://www.example2.com/otherpage.html', b'http://example2.com/'),
        # Different port: send origin as referrer
        ('https://example3.com:444/page.html', 'https://example3.com/not-page.html', b'https://example3.com:444/'),
        ('http://example3.com:81/page.html', 'http://example3.com/not-page.html', b'http://example3.com:81/'),
        # Different protocols: send origin as referrer
        # (the ftps -> https scenario used to be listed twice; one copy is enough)
        ('https://example4.com/page.html', 'http://example4.com/not-page.html', b'https://example4.com/'),
        ('https://example4.com/page.html', 'http://not.example4.com/', b'https://example4.com/'),
        ('ftps://example4.com/urls.zip', 'https://example4.com/not-page.html', b'ftps://example4.com/'),
        ('ftp://example4.com/urls.zip', 'http://example4.com/not-page.html', b'ftp://example4.com/'),
        # test for user/password stripping
        ('https://user:password@example5.com/page.html', 'https://example5.com/not-page.html', b'https://example5.com/page.html'),
        # TLS to non-TLS downgrade: send origin
        ('https://user:password@example5.com/page.html', 'http://example5.com/not-page.html', b'https://example5.com/'),
    ]
class MixinStrictOriginWhenCrossOrigin(object):
    """Scenarios expecting full URL same-origin, origin cross-origin, nothing on downgrade.

    Each entry is (response URL, request URL, expected "Referer" value);
    None means that no "Referer" header is expected.
    """
    scenarii = [
        # Same origin (protocol, host, port): send referrer
        ('https://example.com/page.html', 'https://example.com/not-page.html', b'https://example.com/page.html'),
        ('http://example.com/page.html', 'http://example.com/not-page.html', b'http://example.com/page.html'),
        ('https://example.com:443/page.html', 'https://example.com/not-page.html', b'https://example.com/page.html'),
        ('http://example.com:80/page.html', 'http://example.com/not-page.html', b'http://example.com/page.html'),
        ('http://example.com/page.html', 'http://example.com:80/not-page.html', b'http://example.com/page.html'),
        ('http://example.com:8888/page.html', 'http://example.com:8888/not-page.html', b'http://example.com:8888/page.html'),
        # Different host: send origin as referrer
        ('https://example2.com/page.html', 'https://scrapy.org/otherpage.html', b'https://example2.com/'),
        ('https://example2.com/page.html', 'https://not.example2.com/otherpage.html', b'https://example2.com/'),
        ('http://example2.com/page.html', 'http://not.example2.com/otherpage.html', b'http://example2.com/'),
        # exact match required
        ('http://example2.com/page.html', 'http://www.example2.com/otherpage.html', b'http://example2.com/'),
        # Different port: send origin as referrer
        ('https://example3.com:444/page.html', 'https://example3.com/not-page.html', b'https://example3.com:444/'),
        ('http://example3.com:81/page.html', 'http://example3.com/not-page.html', b'http://example3.com:81/'),
        # downgrade
        ('https://example4.com/page.html', 'http://example4.com/not-page.html', None),
        ('https://example4.com/page.html', 'http://not.example4.com/', None),
        # non-TLS to non-TLS
        ('ftp://example4.com/urls.zip', 'http://example4.com/not-page.html', b'ftp://example4.com/'),
        # upgrade
        ('http://example4.com/page.html', 'https://example4.com/not-page.html', b'http://example4.com/'),
        ('http://example4.com/page.html', 'https://not.example4.com/', b'http://example4.com/'),
        # Different protocols: send origin as referrer
        # (this scenario used to be listed twice; one copy is enough)
        ('ftps://example4.com/urls.zip', 'https://example4.com/not-page.html', b'ftps://example4.com/'),
        # test for user/password stripping
        ('https://user:password@example5.com/page.html', 'https://example5.com/not-page.html', b'https://example5.com/page.html'),
        # TLS to non-TLS downgrade: send nothing
        ('https://user:password@example5.com/page.html', 'http://example5.com/not-page.html', None),
    ]
class MixinUnsafeUrl(object):
    """Scenarios expecting the full (credential-stripped) URL in every case."""
    # Each entry is (response URL, request URL, expected "Referer" value).
    scenarii = [
        # TLS to non-TLS: the full URL is still expected
        ('https://example.com/sekrit.html', 'http://not.example.com/', b'https://example.com/sekrit.html'),
        # TLS to TLS: send referrer
        ('https://example1.com/page.html', 'https://not.example1.com/', b'https://example1.com/page.html'),
        ('https://example1.com/page.html', 'https://scrapy.org/', b'https://example1.com/page.html'),
        ('https://example1.com:443/page.html', 'https://scrapy.org/', b'https://example1.com/page.html'),
        ('https://example1.com:444/page.html', 'https://scrapy.org/', b'https://example1.com:444/page.html'),
        ('ftps://example1.com/urls.zip', 'https://scrapy.org/', b'ftps://example1.com/urls.zip'),
        # TLS to non-TLS: send referrer (yes, it's unsafe)
        ('https://example2.com/page.html', 'http://not.example2.com/', b'https://example2.com/page.html'),
        ('https://example2.com/page.html', 'http://scrapy.org/', b'https://example2.com/page.html'),
        ('ftps://example2.com/urls.zip', 'http://scrapy.org/', b'ftps://example2.com/urls.zip'),
        # non-TLS to TLS or non-TLS: send referrer (yes, it's unsafe)
        ('http://example3.com/page.html', 'https://not.example3.com/', b'http://example3.com/page.html'),
        ('http://example3.com/page.html', 'https://scrapy.org/', b'http://example3.com/page.html'),
        ('http://example3.com:8080/page.html', 'https://scrapy.org/', b'http://example3.com:8080/page.html'),
        ('http://example3.com:80/page.html', 'http://not.example3.com/', b'http://example3.com/page.html'),
        ('http://example3.com/page.html', 'http://scrapy.org/', b'http://example3.com/page.html'),
        ('http://example3.com:443/page.html', 'http://scrapy.org/', b'http://example3.com:443/page.html'),
        ('ftp://example3.com/urls.zip', 'http://scrapy.org/', b'ftp://example3.com/urls.zip'),
        ('ftp://example3.com/urls.zip', 'https://scrapy.org/', b'ftp://example3.com/urls.zip'),
        # test for user/password stripping
        ('http://user:password@example4.com/page.html', 'https://not.example4.com/', b'http://example4.com/page.html'),
        ('https://user:password@example4.com/page.html', 'http://scrapy.org/', b'https://example4.com/page.html'),
    ]
class TestRefererMiddlewareDefault(MixinDefault, TestRefererMiddleware):
    """Run the ``MixinDefault`` scenarios with empty settings, meta and headers."""
    pass
# --- Tests using settings to set policy using class path
class TestSettingsNoReferrer(MixinNoReferrer, TestRefererMiddleware):
    """``MixinNoReferrer`` scenarios, policy given as a class path in settings."""
    settings = {'REFERRER_POLICY': 'scrapy.spidermiddlewares.referer.NoReferrerPolicy'}
class TestSettingsNoReferrerWhenDowngrade(MixinNoReferrerWhenDowngrade, TestRefererMiddleware):
    """``MixinNoReferrerWhenDowngrade`` scenarios, policy given as a class path in settings."""
    settings = {'REFERRER_POLICY': 'scrapy.spidermiddlewares.referer.NoReferrerWhenDowngradePolicy'}
class TestSettingsSameOrigin(MixinSameOrigin, TestRefererMiddleware):
    """``MixinSameOrigin`` scenarios, policy given as a class path in settings."""
    settings = {'REFERRER_POLICY': 'scrapy.spidermiddlewares.referer.SameOriginPolicy'}
class TestSettingsOrigin(MixinOrigin, TestRefererMiddleware):
    """``MixinOrigin`` scenarios, policy given as a class path in settings."""
    settings = {'REFERRER_POLICY': 'scrapy.spidermiddlewares.referer.OriginPolicy'}
class TestSettingsStrictOrigin(MixinStrictOrigin, TestRefererMiddleware):
    """``MixinStrictOrigin`` scenarios, policy given as a class path in settings."""
    settings = {'REFERRER_POLICY': 'scrapy.spidermiddlewares.referer.StrictOriginPolicy'}
class TestSettingsOriginWhenCrossOrigin(MixinOriginWhenCrossOrigin, TestRefererMiddleware):
    """``MixinOriginWhenCrossOrigin`` scenarios, policy given as a class path in settings."""
    settings = {'REFERRER_POLICY': 'scrapy.spidermiddlewares.referer.OriginWhenCrossOriginPolicy'}
class TestSettingsStrictOriginWhenCrossOrigin(MixinStrictOriginWhenCrossOrigin, TestRefererMiddleware):
    """``MixinStrictOriginWhenCrossOrigin`` scenarios, policy given as a class path in settings."""
    settings = {'REFERRER_POLICY': 'scrapy.spidermiddlewares.referer.StrictOriginWhenCrossOriginPolicy'}
class TestSettingsUnsafeUrl(MixinUnsafeUrl, TestRefererMiddleware):
    """``MixinUnsafeUrl`` scenarios, policy given as a class path in settings."""
    settings = {'REFERRER_POLICY': 'scrapy.spidermiddlewares.referer.UnsafeUrlPolicy'}
class CustomPythonOrgPolicy(ReferrerPolicy):
    """
    Dummy policy whose referrer is always a python.org URL, using the same
    scheme (http or https) as the target URL.
    """

    def referrer(self, response, request):
        """Return the python.org referrer matching the scheme of ``request``.

        ``request`` is handed directly to ``urlparse``.
        NOTE(review): it is therefore presumably a URL string rather than a
        ``Request`` object -- confirm against the middleware's call site.
        ``response`` is not used. Any scheme other than http/https yields None.
        """
        by_scheme = {
            'https': b'https://python.org/',
            'http': b'http://python.org/',
        }
        target_scheme = urlparse(request).scheme
        return by_scheme.get(target_scheme)
class TestSettingsCustomPolicy(TestRefererMiddleware):
    """Scenarios for a custom policy class referenced by its path in settings."""
    settings = {'REFERRER_POLICY': 'tests.test_spidermiddleware_referer.CustomPythonOrgPolicy'}
    # The expected referrer only follows the scheme of the request URL
    # (second item); the response URL (first item) does not influence it.
    scenarii = [
        ('https://example.com/', 'https://scrapy.org/', b'https://python.org/'),
        ('http://example.com/', 'http://scrapy.org/', b'http://python.org/'),
        ('http://example.com/', 'https://scrapy.org/', b'https://python.org/'),
        ('https://example.com/', 'http://scrapy.org/', b'http://python.org/'),
        ('file:///home/path/to/somefile.html', 'https://scrapy.org/', b'https://python.org/'),
        ('file:///home/path/to/somefile.html', 'http://scrapy.org/', b'http://python.org/'),
    ]
# --- Tests using Request meta dict to set policy
class TestRequestMetaDefault(MixinDefault, TestRefererMiddleware):
    """``MixinDefault`` scenarios, policy set through the request meta dict."""
    req_meta = {'referrer_policy': POLICY_SCRAPY_DEFAULT}
class TestRequestMetaNoReferrer(MixinNoReferrer, TestRefererMiddleware):
    """``MixinNoReferrer`` scenarios, policy set through the request meta dict."""
    req_meta = {'referrer_policy': POLICY_NO_REFERRER}
class TestRequestMetaNoReferrerWhenDowngrade(MixinNoReferrerWhenDowngrade, TestRefererMiddleware):
    """``MixinNoReferrerWhenDowngrade`` scenarios, policy set through the request meta dict."""
    req_meta = {'referrer_policy': POLICY_NO_REFERRER_WHEN_DOWNGRADE}
class TestRequestMetaSameOrigin(MixinSameOrigin, TestRefererMiddleware):
    """``MixinSameOrigin`` scenarios, policy set through the request meta dict."""
    req_meta = {'referrer_policy': POLICY_SAME_ORIGIN}
class TestRequestMetaOrigin(MixinOrigin, TestRefererMiddleware):
    """``MixinOrigin`` scenarios, policy set through the request meta dict."""
    req_meta = {'referrer_policy': POLICY_ORIGIN}
class TestRequestMetaSrictOrigin(MixinStrictOrigin, TestRefererMiddleware):
    """``MixinStrictOrigin`` scenarios, policy set through the request meta dict."""
    req_meta = {'referrer_policy': POLICY_STRICT_ORIGIN}
class TestRequestMetaOriginWhenCrossOrigin(MixinOriginWhenCrossOrigin, TestRefererMiddleware):
    """``MixinOriginWhenCrossOrigin`` scenarios, policy set through the request meta dict."""
    req_meta = {'referrer_policy': POLICY_ORIGIN_WHEN_CROSS_ORIGIN}
class TestRequestMetaStrictOriginWhenCrossOrigin(MixinStrictOriginWhenCrossOrigin, TestRefererMiddleware):
    """``MixinStrictOriginWhenCrossOrigin`` scenarios, policy set through the request meta dict."""
    req_meta = {'referrer_policy': POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN}
class TestRequestMetaUnsafeUrl(MixinUnsafeUrl, TestRefererMiddleware):
    """``MixinUnsafeUrl`` scenarios, policy set through the request meta dict."""
    req_meta = {'referrer_policy': POLICY_UNSAFE_URL}
class TestRequestMetaPredecence001(MixinUnsafeUrl, TestRefererMiddleware):
    """Settings select same-origin, request meta selects unsafe-url.

    The ``MixinUnsafeUrl`` scenarios are the expected outcome, i.e. the
    request meta value is expected to take precedence over the setting.
    """
    settings = {'REFERRER_POLICY': 'scrapy.spidermiddlewares.referer.SameOriginPolicy'}
    req_meta = {'referrer_policy': POLICY_UNSAFE_URL}
class TestRequestMetaPredecence002(MixinNoReferrer, TestRefererMiddleware):
    """Settings select no-referrer-when-downgrade, request meta selects no-referrer.

    The ``MixinNoReferrer`` scenarios are the expected outcome, i.e. the
    request meta value is expected to take precedence over the setting.
    """
    settings = {'REFERRER_POLICY': 'scrapy.spidermiddlewares.referer.NoReferrerWhenDowngradePolicy'}
    req_meta = {'referrer_policy': POLICY_NO_REFERRER}
class TestRequestMetaPredecence003(MixinUnsafeUrl, TestRefererMiddleware):
    """Settings select origin-when-cross-origin, request meta selects unsafe-url.

    The ``MixinUnsafeUrl`` scenarios are the expected outcome, i.e. the
    request meta value is expected to take precedence over the setting.
    """
    settings = {'REFERRER_POLICY': 'scrapy.spidermiddlewares.referer.OriginWhenCrossOriginPolicy'}
    req_meta = {'referrer_policy': POLICY_UNSAFE_URL}
class TestSettingsPolicyByName(TestCase):
    """Check ``REFERRER_POLICY`` when it is set to a policy name."""

    # (policy name, policy class expected as ``default_policy``) pairs,
    # shared by the tests below instead of being repeated in each of them.
    named_policies = [
        (POLICY_SCRAPY_DEFAULT, DefaultReferrerPolicy),
        (POLICY_NO_REFERRER, NoReferrerPolicy),
        (POLICY_NO_REFERRER_WHEN_DOWNGRADE, NoReferrerWhenDowngradePolicy),
        (POLICY_SAME_ORIGIN, SameOriginPolicy),
        (POLICY_ORIGIN, OriginPolicy),
        (POLICY_STRICT_ORIGIN, StrictOriginPolicy),
        (POLICY_ORIGIN_WHEN_CROSS_ORIGIN, OriginWhenCrossOriginPolicy),
        (POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN, StrictOriginWhenCrossOriginPolicy),
        (POLICY_UNSAFE_URL, UnsafeUrlPolicy),
    ]

    def test_valid_name(self):
        """Each known policy name maps to its policy class."""
        for name, policy in self.named_policies:
            settings = Settings({'REFERRER_POLICY': name})
            mw = RefererMiddleware(settings)
            self.assertEqual(mw.default_policy, policy)

    def test_valid_name_casevariants(self):
        """Upper-cased policy names map to the same policy classes."""
        for name, policy in self.named_policies:
            settings = Settings({'REFERRER_POLICY': name.upper()})
            mw = RefererMiddleware(settings)
            self.assertEqual(mw.default_policy, policy)

    def test_invalid_name(self):
        """An unknown policy name makes the middleware raise NotConfigured."""
        settings = Settings({'REFERRER_POLICY': 'some-custom-unknown-policy'})
        with self.assertRaises(NotConfigured):
            RefererMiddleware(settings)
class TestPolicyHeaderPredecence001(MixinUnsafeUrl, TestRefererMiddleware):
    """Settings select same-origin, the response header selects unsafe-url.

    The ``MixinUnsafeUrl`` scenarios are the expected outcome, i.e. the
    "Referrer-Policy" response header is expected to win over the setting.
    The header value is upper-cased on purpose.
    """
    settings = {'REFERRER_POLICY': 'scrapy.spidermiddlewares.referer.SameOriginPolicy'}
    resp_headers = {'Referrer-Policy': POLICY_UNSAFE_URL.upper()}
class TestPolicyHeaderPredecence002(MixinNoReferrer, TestRefererMiddleware):
    """Settings select no-referrer-when-downgrade, the response header selects no-referrer.

    The ``MixinNoReferrer`` scenarios are the expected outcome, i.e. the
    "Referrer-Policy" response header is expected to win over the setting.
    The header value has its letter case swapped on purpose.
    """
    settings = {'REFERRER_POLICY': 'scrapy.spidermiddlewares.referer.NoReferrerWhenDowngradePolicy'}
    resp_headers = {'Referrer-Policy': POLICY_NO_REFERRER.swapcase()}
class TestPolicyHeaderPredecence003(MixinNoReferrerWhenDowngrade, TestRefererMiddleware):
    """Settings select origin-when-cross-origin, the response header selects no-referrer-when-downgrade.

    The ``MixinNoReferrerWhenDowngrade`` scenarios are the expected outcome,
    i.e. the "Referrer-Policy" response header is expected to win over the
    setting. The header value is title-cased on purpose.
    """
    settings = {'REFERRER_POLICY': 'scrapy.spidermiddlewares.referer.OriginWhenCrossOriginPolicy'}
    resp_headers = {'Referrer-Policy': POLICY_NO_REFERRER_WHEN_DOWNGRADE.title()}
class TestReferrerOnRedirect(TestRefererMiddleware):
    """Check the "Referer" header before and after HTTP redirections.

    Each scenario is a 5-tuple: parent (response) URL, target (request) URL,
    a sequence of (status code, redirect URL) pairs, the "Referer" expected
    on the initial request, and the "Referer" expected once all the
    redirections have been followed. This base class uses the unsafe-url
    policy; subclasses override ``settings`` and ``scenarii``.
    """

    settings = {'REFERRER_POLICY': 'scrapy.spidermiddlewares.referer.UnsafeUrlPolicy'}
    scenarii = [
        (
            'http://scrapytest.org/1',  # parent
            'http://scrapytest.org/2',  # target
            (
                # redirections: code, URL
                (301, 'http://scrapytest.org/3'),
                (301, 'http://scrapytest.org/4'),
            ),
            b'http://scrapytest.org/1',  # expected initial referer
            b'http://scrapytest.org/1',  # expected referer for the redirection request
        ),
        (
            'https://scrapytest.org/1',
            'https://scrapytest.org/2',
            (
                # redirecting to non-secure URL
                (301, 'http://scrapytest.org/3'),
            ),
            b'https://scrapytest.org/1',
            b'https://scrapytest.org/1',
        ),
        (
            'https://scrapytest.org/1',
            'https://scrapytest.com/2',
            (
                # redirecting to non-secure URL: different origin
                (301, 'http://scrapytest.com/3'),
            ),
            b'https://scrapytest.org/1',
            b'https://scrapytest.org/1',
        ),
    ]

    def setUp(self):
        """Create the spider plus the referer and redirect middlewares."""
        self.spider = Spider('foo')
        settings = Settings(self.settings)
        self.referrermw = RefererMiddleware(settings)
        self.redirectmw = RedirectMiddleware(settings)

    def test(self):
        """Follow each scenario's redirections and check both referrers."""
        for parent, target, redirections, init_referrer, final_referrer in self.scenarii:
            response = self.get_response(parent)
            request = self.get_request(target)

            out = list(self.referrermw.process_spider_output(response, [request], self.spider))
            self.assertEqual(out[0].headers.get('Referer'), init_referrer)

            for status, url in redirections:
                response = Response(request.url, headers={'Location': url}, status=status)
                request = self.redirectmw.process_response(request, response, self.spider)
                # Verify that a redirect request came back *before* using it,
                # so an unexpected result fails with a clear assertion.
                self.assertIsInstance(request, Request)
                self.referrermw.request_scheduled(request, self.spider)

            self.assertEqual(request.headers.get('Referer'), final_referrer)
class TestReferrerOnRedirectNoReferrer(TestReferrerOnRedirect):
    """
    No Referrer policy never sets the "Referer" header.
    HTTP redirections should not change that.
    """
    settings = {'REFERRER_POLICY': 'no-referrer'}
    # Each scenario: (parent URL, target URL, redirections,
    # expected initial "Referer", expected "Referer" after the redirections).
    scenarii = [
        ( 'http://scrapytest.org/1', # parent
        'http://scrapytest.org/2', # target
        (
        # redirections: code, URL
        (301, 'http://scrapytest.org/3'),
        (301, 'http://scrapytest.org/4'),
        ),
        None, # expected initial "Referer"
        None, # expected "Referer" for the redirection request
        ),
        ( 'https://scrapytest.org/1',
        'https://scrapytest.org/2',
        (
        (301, 'http://scrapytest.org/3'),
        ),
        None,
        None,
        ),
        ( 'https://scrapytest.org/1',
        'https://example.com/2', # different origin
        (
        (301, 'http://scrapytest.com/3'),
        ),
        None,
        None,
        ),
    ]
class TestReferrerOnRedirectSameOrigin(TestReferrerOnRedirect):
    """
    Same Origin policy sends the full URL as "Referer" if the target origin
    is the same as the parent response (same protocol, same domain, same port).
    HTTP redirections to a different domain or a lower secure level
    should have the "Referer" removed.
    """
    settings = {'REFERRER_POLICY': 'same-origin'}
    # Each scenario: (parent URL, target URL, redirections,
    # expected initial "Referer", expected "Referer" after the redirections).
    scenarii = [
        ( 'http://scrapytest.org/101', # origin
        'http://scrapytest.org/102', # target
        (
        # redirections: code, URL
        (301, 'http://scrapytest.org/103'),
        (301, 'http://scrapytest.org/104'),
        ),
        b'http://scrapytest.org/101', # expected initial "Referer"
        b'http://scrapytest.org/101', # expected referer for the redirection request
        ),
        ( 'https://scrapytest.org/201',
        'https://scrapytest.org/202',
        (
        # redirecting from secure to non-secure URL == different origin
        (301, 'http://scrapytest.org/203'),
        ),
        b'https://scrapytest.org/201',
        None,
        ),
        ( 'https://scrapytest.org/301',
        'https://scrapytest.org/302',
        (
        # different domain == different origin
        (301, 'http://example.com/303'),
        ),
        b'https://scrapytest.org/301',
        None,
        ),
    ]
class TestReferrerOnRedirectStrictOrigin(TestReferrerOnRedirect):
    """
    Strict Origin policy will always send the "origin" as referrer
    (think of it as the parent URL without the path part),
    unless the security level is lower and no "Referer" is sent.
    Redirections from secure to non-secure URLs should have the
    "Referer" header removed if necessary.
    """
    settings = {'REFERRER_POLICY': POLICY_STRICT_ORIGIN}
    # Each scenario: (parent URL, target URL, redirections,
    # expected initial "Referer", expected "Referer" after the redirections).
    scenarii = [
        ( 'http://scrapytest.org/101',
        'http://scrapytest.org/102',
        (
        (301, 'http://scrapytest.org/103'),
        (301, 'http://scrapytest.org/104'),
        ),
        b'http://scrapytest.org/', # send origin
        b'http://scrapytest.org/', # redirects to same origin: send origin
        ),
        ( 'https://scrapytest.org/201',
        'https://scrapytest.org/202',
        (
        # redirecting to non-secure URL: no referrer
        (301, 'http://scrapytest.org/203'),
        ),
        b'https://scrapytest.org/',
        None,
        ),
        ( 'https://scrapytest.org/301',
        'https://scrapytest.org/302',
        (
        # redirecting to non-secure URL (different domain): no referrer
        (301, 'http://example.com/303'),
        ),
        b'https://scrapytest.org/',
        None,
        ),
        # non-TLS all along, across different domains: origin is kept
        ( 'http://scrapy.org/401',
        'http://example.com/402',
        (
        (301, 'http://scrapytest.org/403'),
        ),
        b'http://scrapy.org/',
        b'http://scrapy.org/',
        ),
        ( 'https://scrapy.org/501',
        'https://example.com/502',
        (
        # HTTPS all along, so origin referrer is kept as-is
        (301, 'https://google.com/503'),
        (301, 'https://facebook.com/504'),
        ),
        b'https://scrapy.org/',
        b'https://scrapy.org/',
        ),
        ( 'https://scrapytest.org/601',
        'http://scrapytest.org/602', # TLS to non-TLS: no referrer
        (
        (301, 'https://scrapytest.org/603'), # TLS URL again: (still) no referrer
        ),
        None,
        None,
        ),
    ]
class TestReferrerOnRedirectOriginWhenCrossOrigin(TestReferrerOnRedirect):
    """
    Origin When Cross-Origin policy sends the full URL as "Referer",
    unless the target's origin is different (different domain, different protocol)
    in which case only the origin is sent.
    Redirections to a different origin should strip the "Referer"
    to the parent origin.
    """
    settings = {'REFERRER_POLICY': POLICY_ORIGIN_WHEN_CROSS_ORIGIN}
    # Each scenario: (parent URL, target URL, redirections,
    # expected initial "Referer", expected "Referer" after the redirections).
    scenarii = [
        ( 'http://scrapytest.org/101', # origin
        'http://scrapytest.org/102', # target + redirection
        (
        # redirections: code, URL
        (301, 'http://scrapytest.org/103'),
        (301, 'http://scrapytest.org/104'),
        ),
        b'http://scrapytest.org/101', # expected initial referer
        b'http://scrapytest.org/101', # expected referer for the redirection request
        ),
        ( 'https://scrapytest.org/201',
        'https://scrapytest.org/202',
        (
        # redirecting to non-secure URL: send origin
        (301, 'http://scrapytest.org/203'),
        ),
        b'https://scrapytest.org/201',
        b'https://scrapytest.org/',
        ),
        ( 'https://scrapytest.org/301',
        'https://scrapytest.org/302',
        (
        # redirecting to non-secure URL (different domain): send origin
        (301, 'http://example.com/303'),
        ),
        b'https://scrapytest.org/301',
        b'https://scrapytest.org/',
        ),
        # target already on a different domain: origin only, before and after
        ( 'http://scrapy.org/401',
        'http://example.com/402',
        (
        (301, 'http://scrapytest.org/403'),
        ),
        b'http://scrapy.org/',
        b'http://scrapy.org/',
        ),
        ( 'https://scrapy.org/501',
        'https://example.com/502',
        (
        # all different domains: send origin
        (301, 'https://google.com/503'),
        (301, 'https://facebook.com/504'),
        ),
        b'https://scrapy.org/',
        b'https://scrapy.org/',
        ),
        ( 'https://scrapytest.org/301',
        'http://scrapytest.org/302', # TLS to non-TLS: send origin
        (
        (301, 'https://scrapytest.org/303'), # TLS URL again: send origin (also)
        ),
        b'https://scrapytest.org/',
        b'https://scrapytest.org/',
        ),
    ]
class TestReferrerOnRedirectStrictOriginWhenCrossOrigin(TestReferrerOnRedirect):
    """
    Strict Origin When Cross-Origin policy sends the full URL as "Referer",
    unless the target's origin is different (different domain, different protocol)
    in which case only the origin is sent...
    Unless there's also a downgrade in security and then the "Referer" header
    is not sent.
    Redirections to a different origin should strip the "Referer" to the parent origin,
    and from https:// to http:// will remove the "Referer" header.
    """
    settings = {'REFERRER_POLICY': POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN}
    # Each scenario: (parent URL, target URL, redirections,
    # expected initial "Referer", expected "Referer" after the redirections).
    scenarii = [
        ( 'http://scrapytest.org/101', # origin
        'http://scrapytest.org/102', # target + redirection
        (
        # redirections: code, URL
        (301, 'http://scrapytest.org/103'),
        (301, 'http://scrapytest.org/104'),
        ),
        b'http://scrapytest.org/101', # expected initial referer
        b'http://scrapytest.org/101', # expected referer for the redirection request
        ),
        ( 'https://scrapytest.org/201',
        'https://scrapytest.org/202',
        (
        # redirecting to non-secure URL: do not send the "Referer" header
        (301, 'http://scrapytest.org/203'),
        ),
        b'https://scrapytest.org/201',
        None,
        ),
        ( 'https://scrapytest.org/301',
        'https://scrapytest.org/302',
        (
        # redirecting to non-secure URL (different domain): do not send the "Referer" header
        (301, 'http://example.com/303'),
        ),
        b'https://scrapytest.org/301',
        None,
        ),
        # non-TLS all along, across different domains: origin only
        ( 'http://scrapy.org/401',
        'http://example.com/402',
        (
        (301, 'http://scrapytest.org/403'),
        ),
        b'http://scrapy.org/',
        b'http://scrapy.org/',
        ),
        ( 'https://scrapy.org/501',
        'https://example.com/502',
        (
        # all different domains: send origin
        (301, 'https://google.com/503'),
        (301, 'https://facebook.com/504'),
        ),
        b'https://scrapy.org/',
        b'https://scrapy.org/',
        ),
        ( 'https://scrapytest.org/601',
        'http://scrapytest.org/602', # TLS to non-TLS: do not send "Referer"
        (
        (301, 'https://scrapytest.org/603'), # TLS URL again: (still) send nothing
        ),
        None,
        None,
        ),
    ]