mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-23 16:44:17 +00:00
Fixed a bug in the HTTP Compression middleware, which failed to properly discover the encoding when it was declared inside the response body. Closes #239. Also changed responsetypes to return the Response class (instead of HtmlResponse) when the response has a Content-Encoding header.
This commit is contained in:
parent
9acc99e723
commit
39499a2437
@ -3,6 +3,7 @@ from gzip import GzipFile
|
||||
from cStringIO import StringIO
|
||||
|
||||
from scrapy.http import Response
|
||||
from scrapy.core.downloader.responsetypes import responsetypes
|
||||
|
||||
|
||||
class HttpCompressionMiddleware(object):
|
||||
@ -18,7 +19,9 @@ class HttpCompressionMiddleware(object):
|
||||
if content_encoding:
|
||||
encoding = content_encoding.pop()
|
||||
decoded_body = self._decode(response.body, encoding.lower())
|
||||
response = response.replace(body=decoded_body)
|
||||
respcls = responsetypes.from_args(headers=response.headers, \
|
||||
url=response.url)
|
||||
response = response.replace(cls=respcls, body=decoded_body)
|
||||
if not content_encoding:
|
||||
del response.headers['Content-Encoding']
|
||||
|
||||
|
@ -46,9 +46,11 @@ class ResponseTypes(object):
|
||||
basetype = "%s/*" % mimetype.split('/')[0]
|
||||
return self.classes.get(basetype, Response)
|
||||
|
||||
def from_content_type(self, content_type):
|
||||
def from_content_type(self, content_type, content_encoding=None):
    """Return the most appropriate Response class for an HTTP Content-Type
    header value.

    If ``content_encoding`` is truthy, the plain ``Response`` class is
    returned and ``content_type`` is not inspected. Otherwise everything
    after the first ``;`` is dropped, the remainder is stripped and
    lowercased, and the result is passed to ``from_mimetype``.
    """
    if not content_encoding:
        # Keep only the media type; drop parameters such as "; charset=...".
        media_type, _, _ = content_type.partition(';')
        return self.from_mimetype(media_type.strip().lower())
    return Response
|
||||
|
||||
@ -65,7 +67,8 @@ class ResponseTypes(object):
|
||||
headers"""
|
||||
cls = Response
|
||||
if 'Content-Type' in headers:
|
||||
cls = self.from_content_type(headers['Content-type'])
|
||||
cls = self.from_content_type(headers['Content-type'], \
|
||||
headers.get('Content-Encoding'))
|
||||
if cls is Response and 'Content-Disposition' in headers:
|
||||
cls = self.from_content_disposition(headers['Content-Disposition'])
|
||||
return cls
|
||||
|
@ -2,11 +2,14 @@ from __future__ import with_statement
|
||||
|
||||
from unittest import TestCase
|
||||
from os.path import join, abspath, dirname
|
||||
from cStringIO import StringIO
|
||||
from gzip import GzipFile
|
||||
|
||||
from scrapy.spider import BaseSpider
|
||||
from scrapy.http import Response, Request
|
||||
from scrapy.http import Response, Request, HtmlResponse
|
||||
from scrapy.contrib.downloadermiddleware.httpcompression import HttpCompressionMiddleware
|
||||
from scrapy.tests import tests_datadir
|
||||
from scrapy.utils.encoding import resolve_encoding
|
||||
|
||||
|
||||
SAMPLEDIR = join(tests_datadir, 'compressed')
|
||||
@ -96,3 +99,22 @@ class HttpCompressionTest(TestCase):
|
||||
newresponse = self.mw.process_response(request, response, self.spider)
|
||||
assert newresponse is not response
|
||||
self.assertEqual(newresponse.headers.getlist('Content-Encoding'), ['uuencode'])
|
||||
|
||||
def test_process_response_encoding_inside_body(self):
    """Check that a charset declared only inside a gzipped body is used.

    The response carries ``Content-Type: text/html`` without a charset and
    ``Content-Encoding: gzip``; the only charset declaration is the
    ``<meta http-equiv>`` tag inside the compressed markup. After the
    middleware processes it, the response must be an ``HtmlResponse`` whose
    body is the decompressed markup and whose encoding is the one resolved
    from ``gb2312``.
    """
    headers = {
        'Content-Type': 'text/html',
        'Content-Encoding': 'gzip',
    }
    plainbody = """<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">"""

    # Gzip the markup in memory to use as the compressed response body.
    f = StringIO()
    zf = GzipFile(fileobj=f, mode='wb')
    zf.write(plainbody)
    zf.close()

    # Same well-formed URL as the request (the literal used to read
    # "http;//..." with a semicolon instead of a colon).
    response = Response("http://www.example.com/", headers=headers, body=f.getvalue())
    request = Request("http://www.example.com/")

    newresponse = self.mw.process_response(request, response, self.spider)
    assert isinstance(newresponse, HtmlResponse)
    self.assertEqual(newresponse.body, plainbody)
    self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))
|
||||
|
||||
|
@ -54,6 +54,7 @@ class ResponseTypesTest(unittest.TestCase):
|
||||
mappings = [
|
||||
({'Content-Type': ['text/html; charset=utf-8']}, HtmlResponse),
|
||||
({'Content-Type': ['application/octet-stream'], 'Content-Disposition': ['attachment; filename=data.txt']}, TextResponse),
|
||||
({'Content-Type': ['text/html; charset=utf-8'], 'Content-Encoding': ['gzip']}, Response),
|
||||
]
|
||||
for source, cls in mappings:
|
||||
source = Headers(source)
|
||||
|
Loading…
x
Reference in New Issue
Block a user