Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-24 08:43:55 +00:00)

Merge pull request #475 from dangra/473-do-not-send-header
[MRG] Do not set Referer by default when its value is None
Commit: e91025536f
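
In short: a header value of None now means "do not send this header at all", so a request can opt out of the Referer that RefererMiddleware would otherwise add, while explicit values keep working. A minimal usage sketch (URLs are illustrative only, not from the diff):

    from scrapy.http import Request

    # Default: RefererMiddleware fills in Referer from the response this
    # request was yielded from.
    follow = Request('http://example.com/next')

    # Opt out: Referer is stored as "present, no value" and never sent;
    # the middleware's setdefault() will not override it.
    anonymous = Request('http://example.com/next', headers={'Referer': None})

    # Explicit value is sent unchanged, as before.
    explicit = Request('http://example.com/next',
                       headers={'Referer': 'http://example.com/'})
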
@@ -17,7 +17,9 @@ class Headers(CaselessDict):

     def normvalue(self, value):
         """Headers must not be unicode"""
-        if not hasattr(value, '__iter__'):
+        if value is None:
+            value = []
+        elif not hasattr(value, '__iter__'):
             value = [value]
         return [x.encode(self.encoding) if isinstance(x, unicode) else x \
                 for x in value]
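
The effect of the normvalue() change on the Headers API, as a small sketch (assuming Headers is imported from scrapy.http.headers; the behaviour mirrors the test_none_value test at the bottom of this diff):

    from scrapy.http.headers import Headers

    h = Headers()
    h['Referer'] = 'http://example.com'   # stored as a one-item value list
    h['Referer'] = None                   # normvalue(None) -> []: key kept, no values

    print(h.get('Referer'))      # None
    print(h.getlist('Referer'))  # []

    # The key already exists, so setdefault() does not reintroduce a value.
    h.setdefault('Referer', 'http://example.com')
    print(h.getlist('Referer'))  # still []
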
@@ -1,5 +1,5 @@
 from __future__ import print_function
-import sys, time, random, urllib, os
+import sys, time, random, urllib, os, json
 from subprocess import Popen, PIPE
 from twisted.web.server import Site, NOT_DONE_YET
 from twisted.web.resource import Resource
@@ -119,6 +119,16 @@ class Raw(LeafResource):
         request.finish()


+class Echo(LeafResource):
+
+    def render_GET(self, request):
+        output = {
+            'headers': dict(request.requestHeaders.getAllRawHeaders()),
+            'body': request.content.read(),
+        }
+        return json.dumps(output)
+
+
 class Partial(LeafResource):

     def render_GET(self, request):
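
The new Echo resource (in scrapy/tests/mockserver.py, matching the MockServer import further down) reflects the incoming request back as JSON so tests can see exactly which headers the downloader sent. Roughly how a client can query it, assuming the mock server is running on localhost:8998 as in the tests:

    import json
    import urllib

    # Ask the echo endpoint what the server actually received.
    raw = urllib.urlopen('http://localhost:8998/echo').read()
    echo = json.loads(raw)
    print(echo['headers'])   # raw request headers, one list of values per name
    print(echo['body'])      # request body (empty for a plain GET)
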
@@ -156,6 +166,7 @@ class Root(Resource):
         self.putChild("partial", Partial())
         self.putChild("drop", Drop())
         self.putChild("raw", Raw())
+        self.putChild("echo", Echo())

     def getChild(self, name, request):
         return self
@@ -132,3 +132,22 @@ class BrokenStartRequestsSpider(FollowAllSpider):
         self.seedsseen.append(response.meta.get('seed'))
         for req in super(BrokenStartRequestsSpider, self).parse(response):
             yield req
+
+
+class SingleRequestSpider(MetaSpider):
+
+    seed = None
+
+    def start_requests(self):
+        if isinstance(self.seed, Request):
+            yield self.seed.replace(callback=self.parse, errback=self.on_error)
+        else:
+            yield Request(self.seed, callback=self.parse, errback=self.on_error)
+
+    def parse(self, response):
+        self.meta.setdefault('responses', []).append(response)
+        if 'next' in response.meta:
+            return response.meta['next']
+
+    def on_error(self, failure):
+        self.meta['failure'] = failure
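
SingleRequestSpider crawls a single seed request, records every response in self.meta['responses'], and follows whatever request the previous response carries in meta['next']. A short sketch of chaining two requests with it (docrawl and the port come from the test module below):

    from scrapy.http import Request

    first = Request('http://localhost:8998/echo', dont_filter=True)
    second = first.replace(headers={'Referer': None})
    first.meta['next'] = second   # parse() returns this after the first response

    spider = SingleRequestSpider(seed=first)
    # Inside an @defer.inlineCallbacks test: yield docrawl(spider)
    # Afterwards spider.meta['responses'] holds both responses in crawl order.
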
@@ -1,9 +1,11 @@
+import json
 from twisted.internet import defer
 from twisted.trial.unittest import TestCase
 from scrapy.utils.test import get_crawler, get_testlog
 from scrapy.tests.spiders import FollowAllSpider, DelaySpider, SimpleSpider, \
-    BrokenStartRequestsSpider
+    BrokenStartRequestsSpider, SingleRequestSpider
 from scrapy.tests.mockserver import MockServer
+from scrapy.http import Request


 def docrawl(spider, settings=None):
@@ -158,3 +160,31 @@ with multiples lines
         log = get_testlog()
         self.assertEqual(log.count("Retrying"), 2)
         self.assertEqual(log.count("Gave up retrying"), 1)
+
+    @defer.inlineCallbacks
+    def test_referer_header(self):
+        """Referer header is set by RefererMiddleware unless it is already set"""
+        req0 = Request('http://localhost:8998/echo?headers=1&body=0', dont_filter=1)
+        req1 = req0.replace()
+        req2 = req0.replace(headers={'Referer': None})
+        req3 = req0.replace(headers={'Referer': 'http://example.com'})
+        req0.meta['next'] = req1
+        req1.meta['next'] = req2
+        req2.meta['next'] = req3
+        spider = SingleRequestSpider(seed=req0)
+        yield docrawl(spider)
+        # basic asserts in case of weird communication errors
+        self.assertIn('responses', spider.meta)
+        self.assertNotIn('failures', spider.meta)
+        # start requests doesn't set Referer header
+        echo0 = json.loads(spider.meta['responses'][0].body)
+        self.assertNotIn('Referer', echo0['headers'])
+        # following request sets Referer to start request url
+        echo1 = json.loads(spider.meta['responses'][1].body)
+        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
+        # next request avoids Referer header
+        echo2 = json.loads(spider.meta['responses'][2].body)
+        self.assertNotIn('Referer', echo2['headers'])
+        # last request explicitly sets a Referer header
+        echo3 = json.loads(spider.meta['responses'][3].body)
+        self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])
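
Why the four chained requests behave differently: the referer spider middleware only setdefault()s the header, so an explicit value wins and an explicit None now blocks it as well. A simplified, illustrative sketch of that logic (not the actual RefererMiddleware implementation):

    from scrapy.http import Request

    class RefererMiddlewareSketch(object):
        """Illustration only: set Referer on outgoing requests to the URL of
        the response they were generated from."""

        def process_spider_output(self, response, result, spider):
            for out in result:
                if isinstance(out, Request):
                    # A no-op when Referer was already set -- including when it
                    # was set to None, which Headers now keeps as "no value".
                    out.headers.setdefault('Referer', response.url)
                yield out
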
@@ -119,3 +119,11 @@ class HeadersTest(unittest.TestCase):
         h1.setlistdefault('header2', ['value2', 'value3'])
         self.assertEqual(h1.getlist('header1'), ['value1'])
         self.assertEqual(h1.getlist('header2'), ['value2', 'value3'])
+
+    def test_none_value(self):
+        h1 = Headers()
+        h1['foo'] = 'bar'
+        h1['foo'] = None
+        h1.setdefault('foo', 'bar')
+        self.assertEqual(h1.get('foo'), None)
+        self.assertEqual(h1.getlist('foo'), [])