1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-23 16:24:18 +00:00

Merge pull request #475 from dangra/473-do-not-send-header

[MRG] Do not set Referer by default when its value is None
This commit is contained in:
Daniel Graña 2013-12-24 05:20:36 -08:00
commit e91025536f
5 changed files with 73 additions and 3 deletions

View File

@ -17,7 +17,9 @@ class Headers(CaselessDict):
def normvalue(self, value):
    """Headers must not be unicode

    Normalize a header value into a list of values:

    * ``None`` becomes an empty list, so the header carries no values;
    * a value without ``__iter__`` is wrapped in a one-element list;
    * any other value is iterated as given.

    ``unicode`` items are encoded with ``self.encoding``; every other item
    is kept unchanged.
    """
    if value is None:
        value = []
    elif not hasattr(value, '__iter__'):
        value = [value]
    # Brackets already continue the line; no backslash is needed.
    return [x.encode(self.encoding) if isinstance(x, unicode) else x
            for x in value]

View File

@ -1,5 +1,5 @@
from __future__ import print_function
import sys, time, random, urllib, os
import sys, time, random, urllib, os, json
from subprocess import Popen, PIPE
from twisted.web.server import Site, NOT_DONE_YET
from twisted.web.resource import Resource
@ -119,6 +119,16 @@ class Raw(LeafResource):
request.finish()
class Echo(LeafResource):
    """Resource that reports the received request back as a JSON document."""

    def render_GET(self, request):
        """Return JSON with the request's raw headers and its body."""
        raw_headers = request.requestHeaders.getAllRawHeaders()
        payload = {}
        payload['headers'] = dict(raw_headers)
        payload['body'] = request.content.read()
        return json.dumps(payload)
class Partial(LeafResource):
def render_GET(self, request):
@ -156,6 +166,7 @@ class Root(Resource):
self.putChild("partial", Partial())
self.putChild("drop", Drop())
self.putChild("raw", Raw())
self.putChild("echo", Echo())
def getChild(self, name, request):
    """Return this resource itself, whatever child ``name`` is requested."""
    return self

View File

@ -132,3 +132,22 @@ class BrokenStartRequestsSpider(FollowAllSpider):
self.seedsseen.append(response.meta.get('seed'))
for req in super(BrokenStartRequestsSpider, self).parse(response):
yield req
class SingleRequestSpider(MetaSpider):
    """Spider that starts from a single seed, given as a URL or a Request.

    Responses are collected in ``self.meta['responses']``; a failure passed
    to the errback is stored in ``self.meta['failure']``.
    """

    seed = None

    def start_requests(self):
        """Yield one request for the seed, wired to ``parse``/``on_error``."""
        hooks = dict(callback=self.parse, errback=self.on_error)
        if not isinstance(self.seed, Request):
            yield Request(self.seed, **hooks)
        else:
            # Keep the seed request's own attributes; only rebind the hooks.
            yield self.seed.replace(**hooks)

    def parse(self, response):
        """Record the response and return ``response.meta['next']`` if set."""
        collected = self.meta.setdefault('responses', [])
        collected.append(response)
        if 'next' not in response.meta:
            return None
        return response.meta['next']

    def on_error(self, failure):
        """Store the failure handed to the errback."""
        self.meta['failure'] = failure

View File

@ -1,9 +1,11 @@
import json
from twisted.internet import defer
from twisted.trial.unittest import TestCase
from scrapy.utils.test import get_crawler, get_testlog
from scrapy.tests.spiders import FollowAllSpider, DelaySpider, SimpleSpider, \
BrokenStartRequestsSpider
BrokenStartRequestsSpider, SingleRequestSpider
from scrapy.tests.mockserver import MockServer
from scrapy.http import Request
def docrawl(spider, settings=None):
@ -158,3 +160,31 @@ with multiples lines
log = get_testlog()
self.assertEqual(log.count("Retrying"), 2)
self.assertEqual(log.count("Gave up retrying"), 1)
@defer.inlineCallbacks
def test_referer_header(self):
    """Referer header is set by RefererMiddleware unless it is already set"""
    # Chain of four requests to the echo resource; each response's callback
    # returns the request stored under meta['next'].
    req0 = Request('http://localhost:8998/echo?headers=1&body=0', dont_filter=1)
    req1 = req0.replace()
    req2 = req0.replace(headers={'Referer': None})
    req3 = req0.replace(headers={'Referer': 'http://example.com'})
    req0.meta['next'] = req1
    req1.meta['next'] = req2
    req2.meta['next'] = req3
    spider = SingleRequestSpider(seed=req0)
    yield docrawl(spider)
    # basic asserts in case of weird communication errors
    self.assertIn('responses', spider.meta)
    # SingleRequestSpider.on_error stores the error under 'failure'
    self.assertNotIn('failure', spider.meta)
    # start requests doesn't set Referer header
    echo0 = json.loads(spider.meta['responses'][0].body)
    self.assertNotIn('Referer', echo0['headers'])
    # following request sets Referer to start request url
    echo1 = json.loads(spider.meta['responses'][1].body)
    self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
    # next request avoids Referer header
    echo2 = json.loads(spider.meta['responses'][2].body)
    self.assertNotIn('Referer', echo2['headers'])
    # last request explicitly sets a Referer header
    echo3 = json.loads(spider.meta['responses'][3].body)
    self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])

View File

@ -119,3 +119,11 @@ class HeadersTest(unittest.TestCase):
h1.setlistdefault('header2', ['value2', 'value3'])
self.assertEqual(h1.getlist('header1'), ['value1'])
self.assertEqual(h1.getlist('header2'), ['value2', 'value3'])
def test_none_value(self):
    """Assigning None to a header leaves the key present with no values."""
    headers = Headers()
    headers['foo'] = 'bar'
    # Overwrite the existing value with None.
    headers['foo'] = None
    # The assertions below expect this default not to replace the None entry.
    headers.setdefault('foo', 'bar')
    self.assertEqual(headers.get('foo'), None)
    self.assertEqual(headers.getlist('foo'), [])