mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-23 16:24:18 +00:00
Merge pull request #475 from dangra/473-do-not-send-header
[MRG] Do not set Referer by default when its value is None
This commit is contained in:
commit
e91025536f
@ -17,7 +17,9 @@ class Headers(CaselessDict):
|
||||
|
||||
def normvalue(self, value):
    """Normalize a header value into a list; headers must not be unicode.

    * ``None`` becomes an empty list, so the header carries no values.
    * A value without ``__iter__`` is wrapped in a one-element list.
    * Any other value is iterated as given.

    Every ``unicode`` item is encoded with ``self.encoding``; items of any
    other type are passed through untouched.
    """
    if value is None:
        value = []
    elif not hasattr(value, '__iter__'):
        # NOTE(review): relies on Python 2 byte/unicode strings lacking
        # ``__iter__`` so that a plain string is treated as a single value.
        value = [value]
    return [x.encode(self.encoding) if isinstance(x, unicode) else x
            for x in value]
|
||||
|
@ -1,5 +1,5 @@
|
||||
from __future__ import print_function
|
||||
import sys, time, random, urllib, os
|
||||
import sys, time, random, urllib, os, json
|
||||
from subprocess import Popen, PIPE
|
||||
from twisted.web.server import Site, NOT_DONE_YET
|
||||
from twisted.web.resource import Resource
|
||||
@ -119,6 +119,16 @@ class Raw(LeafResource):
|
||||
request.finish()
|
||||
|
||||
|
||||
class Echo(LeafResource):
    """Resource that reports the received request back to the client."""

    def render_GET(self, request):
        """Return a JSON document with the request's raw headers and body."""
        raw_headers = dict(request.requestHeaders.getAllRawHeaders())
        payload = request.content.read()
        return json.dumps({'headers': raw_headers, 'body': payload})
|
||||
|
||||
|
||||
class Partial(LeafResource):
|
||||
|
||||
def render_GET(self, request):
|
||||
@ -156,6 +166,7 @@ class Root(Resource):
|
||||
self.putChild("partial", Partial())
|
||||
self.putChild("drop", Drop())
|
||||
self.putChild("raw", Raw())
|
||||
self.putChild("echo", Echo())
|
||||
|
||||
def getChild(self, name, request):
    """Return this same resource for any child name.

    Both ``name`` and ``request`` are ignored.
    """
    resource = self
    return resource
|
||||
|
@ -132,3 +132,22 @@ class BrokenStartRequestsSpider(FollowAllSpider):
|
||||
self.seedsseen.append(response.meta.get('seed'))
|
||||
for req in super(BrokenStartRequestsSpider, self).parse(response):
|
||||
yield req
|
||||
|
||||
|
||||
class SingleRequestSpider(MetaSpider):
    """Spider that starts from one seed and records what comes back.

    Responses are appended to ``self.meta['responses']``; a failure passed
    to the errback is stored under ``self.meta['failure']``.
    """

    # Either a URL or a ready-made Request; set by whoever builds the spider.
    seed = None

    def start_requests(self):
        """Yield the single seed request wired to this spider's handlers."""
        handlers = {'callback': self.parse, 'errback': self.on_error}
        if not isinstance(self.seed, Request):
            yield Request(self.seed, **handlers)
        else:
            # Keep the caller's request but route it through our handlers.
            yield self.seed.replace(**handlers)

    def parse(self, response):
        """Record the response and return ``response.meta['next']`` if set."""
        collected = self.meta.setdefault('responses', [])
        collected.append(response)
        return response.meta.get('next')

    def on_error(self, failure):
        """Remember the failure handed to the errback."""
        self.meta['failure'] = failure
|
||||
|
@ -1,9 +1,11 @@
|
||||
import json
|
||||
from twisted.internet import defer
|
||||
from twisted.trial.unittest import TestCase
|
||||
from scrapy.utils.test import get_crawler, get_testlog
|
||||
from scrapy.tests.spiders import FollowAllSpider, DelaySpider, SimpleSpider, \
|
||||
BrokenStartRequestsSpider
|
||||
BrokenStartRequestsSpider, SingleRequestSpider
|
||||
from scrapy.tests.mockserver import MockServer
|
||||
from scrapy.http import Request
|
||||
|
||||
|
||||
def docrawl(spider, settings=None):
|
||||
@ -158,3 +160,31 @@ with multiples lines
|
||||
log = get_testlog()
|
||||
self.assertEqual(log.count("Retrying"), 2)
|
||||
self.assertEqual(log.count("Gave up retrying"), 1)
|
||||
|
||||
@defer.inlineCallbacks
def test_referer_header(self):
    """Referer header is set by RefererMiddleware unless it is already set"""
    # Four chained requests to the echo endpoint: each response hands the
    # next request back through meta['next'], so responses arrive in order.
    req0 = Request('http://localhost:8998/echo?headers=1&body=0', dont_filter=True)
    req1 = req0.replace()
    req2 = req0.replace(headers={'Referer': None})
    req3 = req0.replace(headers={'Referer': 'http://example.com'})
    req0.meta['next'] = req1
    req1.meta['next'] = req2
    req2.meta['next'] = req3
    spider = SingleRequestSpider(seed=req0)
    yield docrawl(spider)
    # basic asserts in case of weird communication errors
    self.assertIn('responses', spider.meta)
    # SingleRequestSpider.on_error stores the errback argument under
    # 'failure' (singular); checking any other key would always pass.
    self.assertNotIn('failure', spider.meta)
    # start requests doesn't set Referer header
    echo0 = json.loads(spider.meta['responses'][0].body)
    self.assertNotIn('Referer', echo0['headers'])
    # following request sets Referer to start request url
    echo1 = json.loads(spider.meta['responses'][1].body)
    self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
    # next request avoids Referer header
    echo2 = json.loads(spider.meta['responses'][2].body)
    self.assertNotIn('Referer', echo2['headers'])
    # last request explicitly sets a Referer header
    echo3 = json.loads(spider.meta['responses'][3].body)
    self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])
|
||||
|
@ -119,3 +119,11 @@ class HeadersTest(unittest.TestCase):
|
||||
h1.setlistdefault('header2', ['value2', 'value3'])
|
||||
self.assertEqual(h1.getlist('header1'), ['value1'])
|
||||
self.assertEqual(h1.getlist('header2'), ['value2', 'value3'])
|
||||
|
||||
def test_none_value(self):
    """Assigning None to a header leaves it present but without values."""
    headers = Headers()
    headers['foo'] = 'bar'
    # Overwrite the existing value with None ...
    headers['foo'] = None
    # ... and check that setdefault does not put 'bar' back.
    headers.setdefault('foo', 'bar')
    self.assertEqual(headers.get('foo'), None)
    self.assertEqual(headers.getlist('foo'), [])
|
||||
|
Loading…
x
Reference in New Issue
Block a user