From 2b7fea26a5ab8f0bb080a4301d35515a17b9e072 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Gra=C3=B1a?= Date: Mon, 25 Nov 2013 15:34:13 -0200 Subject: [PATCH] Do not set Referer by default when its value is None closes #473 --- scrapy/http/headers.py | 4 +++- scrapy/tests/mockserver.py | 13 ++++++++++++- scrapy/tests/spiders.py | 19 ++++++++++++++++++ scrapy/tests/test_crawl.py | 32 ++++++++++++++++++++++++++++++- scrapy/tests/test_http_headers.py | 8 ++++++++ 5 files changed, 73 insertions(+), 3 deletions(-) diff --git a/scrapy/http/headers.py b/scrapy/http/headers.py index cffe7aab8..35ceadeeb 100644 --- a/scrapy/http/headers.py +++ b/scrapy/http/headers.py @@ -17,7 +17,9 @@ class Headers(CaselessDict): def normvalue(self, value): """Headers must not be unicode""" - if not hasattr(value, '__iter__'): + if value is None: + value = [] + elif not hasattr(value, '__iter__'): value = [value] return [x.encode(self.encoding) if isinstance(x, unicode) else x \ for x in value] diff --git a/scrapy/tests/mockserver.py b/scrapy/tests/mockserver.py index c17bc7ce2..0f26e848d 100644 --- a/scrapy/tests/mockserver.py +++ b/scrapy/tests/mockserver.py @@ -1,5 +1,5 @@ from __future__ import print_function -import sys, time, random, urllib, os +import sys, time, random, urllib, os, json from subprocess import Popen, PIPE from twisted.web.server import Site, NOT_DONE_YET from twisted.web.resource import Resource @@ -119,6 +119,16 @@ class Raw(LeafResource): request.finish() +class Echo(LeafResource): + + def render_GET(self, request): + output = { + 'headers': dict(request.requestHeaders.getAllRawHeaders()), + 'body': request.content.read(), + } + return json.dumps(output) + + class Partial(LeafResource): def render_GET(self, request): @@ -156,6 +166,7 @@ class Root(Resource): self.putChild("partial", Partial()) self.putChild("drop", Drop()) self.putChild("raw", Raw()) + self.putChild("echo", Echo()) def getChild(self, name, request): return self diff --git 
a/scrapy/tests/spiders.py b/scrapy/tests/spiders.py index 0a79c53b0..346615309 100644 --- a/scrapy/tests/spiders.py +++ b/scrapy/tests/spiders.py @@ -132,3 +132,22 @@ class BrokenStartRequestsSpider(FollowAllSpider): self.seedsseen.append(response.meta.get('seed')) for req in super(BrokenStartRequestsSpider, self).parse(response): yield req + + +class SingleRequestSpider(MetaSpider): + + seed = None + + def start_requests(self): + if isinstance(self.seed, Request): + yield self.seed.replace(callback=self.parse, errback=self.on_error) + else: + yield Request(self.seed, callback=self.parse, errback=self.on_error) + + def parse(self, response): + self.meta.setdefault('responses', []).append(response) + if 'next' in response.meta: + return response.meta['next'] + + def on_error(self, failure): + self.meta['failure'] = failure diff --git a/scrapy/tests/test_crawl.py b/scrapy/tests/test_crawl.py index fcad0ba4a..f8b54381e 100644 --- a/scrapy/tests/test_crawl.py +++ b/scrapy/tests/test_crawl.py @@ -1,9 +1,11 @@ +import json from twisted.internet import defer from twisted.trial.unittest import TestCase from scrapy.utils.test import get_crawler, get_testlog from scrapy.tests.spiders import FollowAllSpider, DelaySpider, SimpleSpider, \ - BrokenStartRequestsSpider + BrokenStartRequestsSpider, SingleRequestSpider from scrapy.tests.mockserver import MockServer +from scrapy.http import Request def docrawl(spider, settings=None): @@ -158,3 +160,31 @@ with multiples lines log = get_testlog() self.assertEqual(log.count("Retrying"), 2) self.assertEqual(log.count("Gave up retrying"), 1) + + @defer.inlineCallbacks + def test_referer_header(self): + """Referer header is set by RefererMiddleware unless it is already set""" + req0 = Request('http://localhost:8998/echo?headers=1&body=0', dont_filter=1) + req1 = req0.replace() + req2 = req0.replace(headers={'Referer': None}) + req3 = req0.replace(headers={'Referer': 'http://example.com'}) + req0.meta['next'] = req1 + req1.meta['next'] = 
req2 + req2.meta['next'] = req3 + spider = SingleRequestSpider(seed=req0) + yield docrawl(spider) + # basic asserts in case of weird communication errors + self.assertIn('responses', spider.meta) + self.assertNotIn('failure', spider.meta) + # start request doesn't set Referer header + echo0 = json.loads(spider.meta['responses'][0].body) + self.assertNotIn('Referer', echo0['headers']) + # following request sets Referer to start request url + echo1 = json.loads(spider.meta['responses'][1].body) + self.assertEqual(echo1['headers'].get('Referer'), [req0.url]) + # next request avoids Referer header + echo2 = json.loads(spider.meta['responses'][2].body) + self.assertNotIn('Referer', echo2['headers']) + # last request explicitly sets a Referer header + echo3 = json.loads(spider.meta['responses'][3].body) + self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com']) diff --git a/scrapy/tests/test_http_headers.py b/scrapy/tests/test_http_headers.py index 269a3a339..77449bfd6 100644 --- a/scrapy/tests/test_http_headers.py +++ b/scrapy/tests/test_http_headers.py @@ -119,3 +119,11 @@ class HeadersTest(unittest.TestCase): h1.setlistdefault('header2', ['value2', 'value3']) self.assertEqual(h1.getlist('header1'), ['value1']) self.assertEqual(h1.getlist('header2'), ['value2', 'value3']) + + def test_none_value(self): + h1 = Headers() + h1['foo'] = 'bar' + h1['foo'] = None + h1.setdefault('foo', 'bar') + self.assertEqual(h1.get('foo'), None) + self.assertEqual(h1.getlist('foo'), [])