mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-23 21:44:19 +00:00
Improved encoding detection by adding support for the HTML5 `<meta charset="...">` declaration
This commit is contained in:
parent
67213ce673
commit
4bb409923c
@ -18,15 +18,15 @@ class HtmlResponse(TextResponse):
|
||||
|
||||
_httpequiv_re = _template % ('http-equiv', 'Content-Type')
|
||||
_content_re = _template % ('content', r'(?P<mime>[^;]+);\s*charset=(?P<charset>[\w-]+)')
|
||||
_encoding_re = _template % ('encoding', r'(?P<charset>[\w-]+)')
|
||||
_content2_re = _template % ('charset', r'(?P<charset>[\w-]+)')
|
||||
|
||||
METATAG_RE = re.compile(r'<meta\s+%s\s+%s' % (_httpequiv_re, _content_re), re.I)
|
||||
METATAG_RE2 = re.compile(r'<meta\s+%s\s+%s' % (_content_re, _httpequiv_re), re.I)
|
||||
METATAG2_RE = re.compile(r'<meta\s+%s' % _content2_re, re.I)
|
||||
|
||||
@memoizemethod_noargs
|
||||
def _body_declared_encoding(self):
|
||||
chunk = self.body[:5000]
|
||||
match = self.METATAG_RE.search(chunk) or self.METATAG_RE2.search(chunk)
|
||||
match = self.METATAG_RE.search(chunk) or self.METATAG2_RE.search(chunk)
|
||||
return match.group('charset') if match else None
|
||||
|
||||
|
||||
|
@ -260,6 +260,10 @@ class HtmlResponseTest(TextResponseTest):
|
||||
r4 = r3.replace(body=body)
|
||||
self._assert_response_values(r4, 'iso-8859-1', body)
|
||||
|
||||
def test_html5_meta_charset(self):
|
||||
body = """<html><head><meta charset="gb2312" /><title>Some page</title><body>bla bla</body>"""
|
||||
r1 = self.response_class("http://www.example.com", body=body)
|
||||
self._assert_response_values(r1, 'gb2312', body)
|
||||
|
||||
|
||||
class XmlResponseTest(TextResponseTest):
|
||||
|
Loading…
x
Reference in New Issue
Block a user