1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-23 21:44:19 +00:00

improved encoding detection by adding support for HTML5 meta charset

This commit is contained in:
Pablo Hoffman 2011-07-12 09:52:50 -03:00
parent 67213ce673
commit 4bb409923c
2 changed files with 7 additions and 3 deletions

View File

@ -18,15 +18,15 @@ class HtmlResponse(TextResponse):
# Attribute-level pattern fragments, all built from _template.
# NOTE(review): _template is defined outside this excerpt; presumably it
# expands to a `name = "value"` attribute matcher -- confirm.
# http-equiv attribute whose value is Content-Type.
_httpequiv_re = _template % ('http-equiv', 'Content-Type')
# content attribute shaped like "<mime>; charset=<charset>"; both parts are
# captured as the named groups `mime` and `charset`.
_content_re = _template % ('content', r'(?P<mime>[^;]+);\s*charset=(?P<charset>[\w-]+)')
# encoding attribute; its value is captured as the named group `charset`.
_encoding_re = _template % ('encoding', r'(?P<charset>[\w-]+)')
# charset attribute (the HTML5 <meta charset=...> form); its value is
# captured as the named group `charset`.
_content2_re = _template % ('charset', r'(?P<charset>[\w-]+)')
# <meta> tag with http-equiv first and content second (case-insensitive).
METATAG_RE = re.compile(r'<meta\s+%s\s+%s' % (_httpequiv_re, _content_re), re.I)
# Same two attributes in the opposite order: content first, then http-equiv.
METATAG_RE2 = re.compile(r'<meta\s+%s\s+%s' % (_content_re, _httpequiv_re), re.I)
# <meta> tag followed directly by the charset attribute pattern.
METATAG2_RE = re.compile(r'<meta\s+%s' % _content2_re, re.I)
@memoizemethod_noargs
def _body_declared_encoding(self):
    """Return the charset declared by a <meta> tag in the body, or None.

    Only the leading ``self.body[:5000]`` slice is scanned. The patterns
    are tried in this order, and the first one that matches wins:

    1. ``METATAG_RE``  -- ``http-equiv`` followed by ``content``
    2. ``METATAG_RE2`` -- ``content`` followed by ``http-equiv``
    3. ``METATAG2_RE`` -- the ``<meta charset=...>`` form

    The value returned is the ``charset`` named group of the winning match.
    """
    chunk = self.body[:5000]
    # Keep every pattern in one chained expression so each is a real
    # fallback; assigning ``match`` twice would silently discard the result
    # of the earlier searches.
    match = (self.METATAG_RE.search(chunk)
             or self.METATAG_RE2.search(chunk)
             or self.METATAG2_RE.search(chunk))
    return match.group('charset') if match else None

View File

@ -260,6 +260,10 @@ class HtmlResponseTest(TextResponseTest):
r4 = r3.replace(body=body)
self._assert_response_values(r4, 'iso-8859-1', body)
def test_html5_meta_charset(self):
    # A document that declares its encoding only through an HTML5-style
    # <meta charset="..."> tag; the response built from it is checked
    # against 'gb2312' via _assert_response_values.
    html = """<html><head><meta charset="gb2312" /><title>Some page</title><body>bla bla</body>"""
    response = self.response_class("http://www.example.com", body=html)
    self._assert_response_values(response, 'gb2312', html)
class XmlResponseTest(TextResponseTest):