1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 12:44:06 +00:00

added DEFAULT_RESPONSE_ENCODING setting

This commit is contained in:
Pablo Hoffman 2009-10-21 16:13:41 -02:00
parent 3d7a4c890e
commit 7296a7b889
4 changed files with 23 additions and 5 deletions

View File

@ -339,6 +339,17 @@ Default::
The default headers used for Scrapy HTTP Requests. They're populated in the
:class:`~scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware`.
.. setting:: DEFAULT_RESPONSE_ENCODING
DEFAULT_RESPONSE_ENCODING
-------------------------
Default: ``'ascii'``
The default encoding to use for :class:`~scrapy.http.TextResponse` objects (and
subclasses) when no encoding is declared and none can be inferred from the
body.
.. setting:: DEPTH_LIMIT
DEPTH_LIMIT

View File

@ -37,6 +37,8 @@ DEFAULT_REQUEST_HEADERS = {
'Accept-Language': 'en',
}
DEFAULT_RESPONSE_ENCODING = 'ascii'
DEPTH_LIMIT = 0
DEPTH_STATS = True

View File

@ -11,9 +11,11 @@ from scrapy.xlib.BeautifulSoup import UnicodeDammit
from scrapy.http.response import Response
from scrapy.utils.python import memoizemethod_noargs
from scrapy.conf import settings
class TextResponse(Response):
_DEFAULT_ENCODING = settings['DEFAULT_RESPONSE_ENCODING']
_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)
__slots__ = ['_encoding', '_body_inferred_encoding']
@ -71,6 +73,8 @@ class TextResponse(Response):
self._body_declared_encoding())
dammit = UnicodeDammit(self.body, possible_encodings)
self._body_inferred_encoding = dammit.originalEncoding
if self._body_inferred_encoding in ('ascii', None):
self._body_inferred_encoding = self._DEFAULT_ENCODING
return dammit.unicode
def body_encoding(self):

View File

@ -2,6 +2,7 @@ import unittest
import weakref
from scrapy.http import Response, TextResponse, HtmlResponse, XmlResponse, Headers
from scrapy.conf import settings
class BaseResponseTest(unittest.TestCase):
@ -138,8 +139,10 @@ class TextResponseTest(BaseResponseTest):
self.assertEqual(r3.encoding, "latin1")
def test_unicode_url(self):
# instantiate with unicode url without encoding
self.assertRaises(TypeError, self.response_class, u"http://www.example.com/")
# instantiate with unicode url without encoding (should set default encoding)
resp = self.response_class(u"http://www.example.com/")
self.assertEqual(resp.encoding, settings['DEFAULT_RESPONSE_ENCODING'])
# make sure urls are converted to str
resp = self.response_class(url=u"http://www.example.com/", encoding='utf-8')
assert isinstance(resp.url, str)
@ -187,7 +190,6 @@ class TextResponseTest(BaseResponseTest):
# TextResponse (and subclasses) must be passed an encoding when instantiating with unicode bodies
self.assertRaises(TypeError, self.response_class, "http://www.example.com", body=u"\xa3")
class HtmlResponseTest(TextResponseTest):
response_class = HtmlResponse
@ -229,8 +231,7 @@ class XmlResponseTest(TextResponseTest):
body = "<xml></xml>"
r1 = self.response_class("http://www.example.com", body=body)
# XXX: we may want to switch default XmlResponse encoding to utf-8
self._assert_response_values(r1, 'ascii', body)
self._assert_response_values(r1, settings['DEFAULT_RESPONSE_ENCODING'], body)
body = """<?xml version="1.0" encoding="iso-8859-1"?><xml></xml>"""
r2 = self.response_class("http://www.example.com", body=body)