1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-23 06:44:06 +00:00

Fixed a bug in the HTTP Compression middleware, which was failing to properly discover the encoding when the encoding was declared inside the response body. Closes #239. Also changed responsetypes to return the Response class (instead of HtmlResponse) when the response has a Content-Encoding header.

This commit is contained in:
Pablo Hoffman 2010-09-14 20:22:25 -03:00
parent 9acc99e723
commit 39499a2437
4 changed files with 33 additions and 4 deletions

View File

@ -3,6 +3,7 @@ from gzip import GzipFile
from cStringIO import StringIO
from scrapy.http import Response
from scrapy.core.downloader.responsetypes import responsetypes
class HttpCompressionMiddleware(object):
@ -18,7 +19,9 @@ class HttpCompressionMiddleware(object):
if content_encoding:
encoding = content_encoding.pop()
decoded_body = self._decode(response.body, encoding.lower())
response = response.replace(body=decoded_body)
respcls = responsetypes.from_args(headers=response.headers, \
url=response.url)
response = response.replace(cls=respcls, body=decoded_body)
if not content_encoding:
del response.headers['Content-Encoding']

View File

@ -46,9 +46,11 @@ class ResponseTypes(object):
basetype = "%s/*" % mimetype.split('/')[0]
return self.classes.get(basetype, Response)
def from_content_type(self, content_type):
def from_content_type(self, content_type, content_encoding=None):
    """Return the most appropriate Response class for the value of an HTTP
    Content-Type header.

    When content_encoding is given (truthy), the plain Response class is
    returned and content_type is not inspected.
    """
    if not content_encoding:
        # Drop any parameters (e.g. "; charset=...") and normalize what is
        # left before looking the mimetype up.
        bare_type = content_type.split(';')[0]
        return self.from_mimetype(bare_type.strip().lower())
    return Response
@ -65,7 +67,8 @@ class ResponseTypes(object):
headers"""
cls = Response
if 'Content-Type' in headers:
cls = self.from_content_type(headers['Content-type'])
cls = self.from_content_type(headers['Content-type'], \
headers.get('Content-Encoding'))
if cls is Response and 'Content-Disposition' in headers:
cls = self.from_content_disposition(headers['Content-Disposition'])
return cls

View File

@ -2,11 +2,14 @@ from __future__ import with_statement
from unittest import TestCase
from os.path import join, abspath, dirname
from cStringIO import StringIO
from gzip import GzipFile
from scrapy.spider import BaseSpider
from scrapy.http import Response, Request
from scrapy.http import Response, Request, HtmlResponse
from scrapy.contrib.downloadermiddleware.httpcompression import HttpCompressionMiddleware
from scrapy.tests import tests_datadir
from scrapy.utils.encoding import resolve_encoding
SAMPLEDIR = join(tests_datadir, 'compressed')
@ -96,3 +99,22 @@ class HttpCompressionTest(TestCase):
newresponse = self.mw.process_response(request, response, self.spider)
assert newresponse is not response
self.assertEqual(newresponse.headers.getlist('Content-Encoding'), ['uuencode'])
def test_process_response_encoding_inside_body(self):
    """A gzip-compressed HTML body that declares its charset in a <meta>
    tag must come out of the middleware as an HtmlResponse whose body is
    the decompressed payload and whose encoding matches that declaration.
    """
    headers = {
        'Content-Type': 'text/html',
        'Content-Encoding': 'gzip',
    }
    f = StringIO()
    plainbody = """<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">"""
    # Build the compressed payload in memory.
    zf = GzipFile(fileobj=f, mode='wb')
    zf.write(plainbody)
    zf.close()
    # The URL scheme separator was previously misspelled as "http;//".
    response = Response("http://www.example.com/", headers=headers, body=f.getvalue())
    request = Request("http://www.example.com/")

    newresponse = self.mw.process_response(request, response, self.spider)
    assert isinstance(newresponse, HtmlResponse)
    self.assertEqual(newresponse.body, plainbody)
    self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))

View File

@ -54,6 +54,7 @@ class ResponseTypesTest(unittest.TestCase):
mappings = [
({'Content-Type': ['text/html; charset=utf-8']}, HtmlResponse),
({'Content-Type': ['application/octet-stream'], 'Content-Disposition': ['attachment; filename=data.txt']}, TextResponse),
({'Content-Type': ['text/html; charset=utf-8'], 'Content-Encoding': ['gzip']}, Response),
]
for source, cls in mappings:
source = Headers(source)