mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-24 17:44:33 +00:00
added DEFAULT_RESPONSE_ENCODING setting
This commit is contained in:
parent
3d7a4c890e
commit
7296a7b889
@ -339,6 +339,17 @@ Default::
|
|||||||
The default headers used for Scrapy HTTP Requests. They're populated in the
|
The default headers used for Scrapy HTTP Requests. They're populated in the
|
||||||
:class:`~scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware`.
|
:class:`~scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware`.
|
||||||
|
|
||||||
|
.. setting:: DEFAULT_RESPONSE_ENCODING
|
||||||
|
|
||||||
|
DEFAULT_RESPONSE_ENCODING
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
Default: ``'ascii'``
|
||||||
|
|
||||||
|
The default encoding to use for :class:`~scrapy.http.TextResponse` objects (and
|
||||||
|
subclasses) when no encoding is declared and no encoding could be inferred from
|
||||||
|
the body.
|
||||||
|
|
||||||
.. setting:: DEPTH_LIMIT
|
.. setting:: DEPTH_LIMIT
|
||||||
|
|
||||||
DEPTH_LIMIT
|
DEPTH_LIMIT
|
||||||
|
@ -37,6 +37,8 @@ DEFAULT_REQUEST_HEADERS = {
|
|||||||
'Accept-Language': 'en',
|
'Accept-Language': 'en',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
DEFAULT_RESPONSE_ENCODING = 'ascii'
|
||||||
|
|
||||||
DEPTH_LIMIT = 0
|
DEPTH_LIMIT = 0
|
||||||
DEPTH_STATS = True
|
DEPTH_STATS = True
|
||||||
|
|
||||||
|
@ -11,9 +11,11 @@ from scrapy.xlib.BeautifulSoup import UnicodeDammit
|
|||||||
|
|
||||||
from scrapy.http.response import Response
|
from scrapy.http.response import Response
|
||||||
from scrapy.utils.python import memoizemethod_noargs
|
from scrapy.utils.python import memoizemethod_noargs
|
||||||
|
from scrapy.conf import settings
|
||||||
|
|
||||||
class TextResponse(Response):
|
class TextResponse(Response):
|
||||||
|
|
||||||
|
_DEFAULT_ENCODING = settings['DEFAULT_RESPONSE_ENCODING']
|
||||||
_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)
|
_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)
|
||||||
|
|
||||||
__slots__ = ['_encoding', '_body_inferred_encoding']
|
__slots__ = ['_encoding', '_body_inferred_encoding']
|
||||||
@ -71,6 +73,8 @@ class TextResponse(Response):
|
|||||||
self._body_declared_encoding())
|
self._body_declared_encoding())
|
||||||
dammit = UnicodeDammit(self.body, possible_encodings)
|
dammit = UnicodeDammit(self.body, possible_encodings)
|
||||||
self._body_inferred_encoding = dammit.originalEncoding
|
self._body_inferred_encoding = dammit.originalEncoding
|
||||||
|
if self._body_inferred_encoding in ('ascii', None):
|
||||||
|
self._body_inferred_encoding = self._DEFAULT_ENCODING
|
||||||
return dammit.unicode
|
return dammit.unicode
|
||||||
|
|
||||||
def body_encoding(self):
|
def body_encoding(self):
|
||||||
|
@ -2,6 +2,7 @@ import unittest
|
|||||||
import weakref
|
import weakref
|
||||||
|
|
||||||
from scrapy.http import Response, TextResponse, HtmlResponse, XmlResponse, Headers
|
from scrapy.http import Response, TextResponse, HtmlResponse, XmlResponse, Headers
|
||||||
|
from scrapy.conf import settings
|
||||||
|
|
||||||
|
|
||||||
class BaseResponseTest(unittest.TestCase):
|
class BaseResponseTest(unittest.TestCase):
|
||||||
@ -138,8 +139,10 @@ class TextResponseTest(BaseResponseTest):
|
|||||||
self.assertEqual(r3.encoding, "latin1")
|
self.assertEqual(r3.encoding, "latin1")
|
||||||
|
|
||||||
def test_unicode_url(self):
|
def test_unicode_url(self):
|
||||||
# instantiate with unicode url without encoding
|
# instantiate with unicode url without encoding (should set default encoding)
|
||||||
self.assertRaises(TypeError, self.response_class, u"http://www.example.com/")
|
resp = self.response_class(u"http://www.example.com/")
|
||||||
|
self.assertEqual(resp.encoding, settings['DEFAULT_RESPONSE_ENCODING'])
|
||||||
|
|
||||||
# make sure urls are converted to str
|
# make sure urls are converted to str
|
||||||
resp = self.response_class(url=u"http://www.example.com/", encoding='utf-8')
|
resp = self.response_class(url=u"http://www.example.com/", encoding='utf-8')
|
||||||
assert isinstance(resp.url, str)
|
assert isinstance(resp.url, str)
|
||||||
@ -187,7 +190,6 @@ class TextResponseTest(BaseResponseTest):
|
|||||||
# TextResponse (and subclasses) must be passed an encoding when instantiating with unicode bodies
|
# TextResponse (and subclasses) must be passed an encoding when instantiating with unicode bodies
|
||||||
self.assertRaises(TypeError, self.response_class, "http://www.example.com", body=u"\xa3")
|
self.assertRaises(TypeError, self.response_class, "http://www.example.com", body=u"\xa3")
|
||||||
|
|
||||||
|
|
||||||
class HtmlResponseTest(TextResponseTest):
|
class HtmlResponseTest(TextResponseTest):
|
||||||
|
|
||||||
response_class = HtmlResponse
|
response_class = HtmlResponse
|
||||||
@ -229,8 +231,7 @@ class XmlResponseTest(TextResponseTest):
|
|||||||
|
|
||||||
body = "<xml></xml>"
|
body = "<xml></xml>"
|
||||||
r1 = self.response_class("http://www.example.com", body=body)
|
r1 = self.response_class("http://www.example.com", body=body)
|
||||||
# XXX: we may want to switch default XmlResponse encoding to utf-8
|
self._assert_response_values(r1, settings['DEFAULT_RESPONSE_ENCODING'], body)
|
||||||
self._assert_response_values(r1, 'ascii', body)
|
|
||||||
|
|
||||||
body = """<?xml version="1.0" encoding="iso-8859-1"?><xml></xml>"""
|
body = """<?xml version="1.0" encoding="iso-8859-1"?><xml></xml>"""
|
||||||
r2 = self.response_class("http://www.example.com", body=body)
|
r2 = self.response_class("http://www.example.com", body=body)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user