mirror of
https://github.com/scrapy/scrapy.git
forked UnicodeDammit from BeautifulSoup to explicitly disable usage of chardet library
commit f19442425a (parent 7d18fe18e2)
scrapy/http/response/dammit.py (new file, 269 lines)
@@ -0,0 +1,269 @@
"""
This module contains a fork of the UnicodeDammit class from BeautifulSoup
that explicitly disables any usage of the chardet library.

The UnicodeDammit class is used as a last resort for detecting the encoding
of a response.
"""

import re
import codecs

chardet = None  # we don't want to use chardet since it's very slow

class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml', isHTML=False):
        self.declaredHTMLEncoding = None
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup, isHTML)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if markup == '' or isinstance(markup, unicode):
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        u = None
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break

        self.unicode = u
        if not u: self.originalEncoding = None

    def _subMSChar(self, orig):
        """Changes a MS smart quote character to an XML or HTML
        entity."""
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            else:
                sub = '&%s;' % sub[0]
        return sub

    def _convertFrom(self, proposed):
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in ("windows-1252",
                                                       "iso-8859-1",
                                                       "iso-8859-2"):
            markup = re.compile("([\x80-\x9f])").sub(
                lambda x: self._subMSChar(x.group(1)), markup)

        try:
            # print "Trying to convert document to %s" % proposed
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception, e:
            # print "That didn't work!"
            # print e
            return None
        # print "Correct encoding: %s" % proposed
        return self.markup
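
    # Illustrative example: with smartQuotesTo='xml', the windows-1252
    # bytes '\x93hi\x94' are first rewritten to '&#x201C;hi&#x201D;' and
    # then decoded, so the smart quotes survive as XML character
    # references.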

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # strip Byte Order Mark (if present)
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data, isHTML=False):
        """Given a document, tries to detect its XML encoding."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass
        except:
            xml_encoding_match = None
        xml_encoding_match = re.compile(
            '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
        if not xml_encoding_match and isHTML:
            regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            if isHTML:
                self.declaredHTMLEncoding = xml_encoding
        if sniffed_xml_encoding and \
           (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                             'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                             'utf-16', 'utf-32', 'utf_16', 'utf_32',
                             'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding
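
    # Illustrative example: for the bytes
    # '<?xml version="1.0" encoding="iso-8859-1"?><root/>' the declaration
    # regex yields xml_encoding 'iso-8859-1', while sniffed_xml_encoding
    # stays 'ascii' (no BOM and no wide-character pattern matched).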

    def find_codec(self, charset):
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec
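
    # Illustrative example: find_codec("macintosh") resolves through
    # CHARSET_ALIASES to "mac-roman", which codecs.lookup() accepts;
    # a charset Python does not know at all falls through every branch
    # and is returned unchanged.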

    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans(
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 '\x9f' : ('Yuml', ''),}

#######################################################################
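
Because the module-level chardet is pinned to None, the detection chain in
__init__ is: override encodings, then the declared document encoding, then
the sniffed encoding, then the utf-8/windows-1252 fallback. A minimal usage
sketch (Python 2; the HTML body below is invented for illustration, while
the class, arguments, and attributes come from the file above):

from scrapy.http.response.dammit import UnicodeDammit

# invented sample body declaring windows-1252 and containing smart quotes
body = ('<html><head><meta http-equiv="Content-Type" '
        'content="text/html; charset=windows-1252"></head>'
        '<body>\x93smart quotes\x94</body></html>')
dammit = UnicodeDammit(body, overrideEncodings=[], isHTML=True)
print dammit.originalEncoding      # 'windows-1252', the codec that worked
print dammit.declaredHTMLEncoding  # charset sniffed from the meta tag
print dammit.unicode               # quotes appear as &#x201C; ... &#x201D;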
@@ -7,7 +7,7 @@ See documentation in docs/topics/request-response.rst

 import re
 import codecs
-from scrapy.xlib.BeautifulSoup import UnicodeDammit
+from scrapy.http.response.dammit import UnicodeDammit
 from scrapy.http.response import Response
 from scrapy.utils.python import memoizemethod_noargs
 from scrapy.utils.encoding import encoding_exists, resolve_encoding
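
With the forked import in place, the response machinery resolves text
encodings exactly as before, just never via chardet. A hedged sketch of the
last-resort step (hypothetical helper name, not the actual scrapy method;
only the UnicodeDammit call and its attributes come from the module above):

# hypothetical helper, not the actual scrapy code
def infer_body_encoding(body, declared_encoding=None):
    proposed = [declared_encoding] if declared_encoding else []
    dammit = UnicodeDammit(body, overrideEncodings=proposed, isHTML=True)
    # originalEncoding is None when no candidate decoded the body
    return dammit.originalEncoding, dammit.unicode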
@@ -2,10 +2,8 @@ import os
 import unittest
 import urlparse

-from scrapy.xlib.BeautifulSoup import BeautifulSoup
 from scrapy.http import Response, TextResponse, HtmlResponse
-from scrapy.utils.response import body_or_str, get_base_url, get_meta_refresh, \
-    response_httprepr, get_cached_beautifulsoup, open_in_browser
+from scrapy.utils.response import body_or_str, response_httprepr, open_in_browser

 __doctests__ = ['scrapy.utils.response']

@@ -41,31 +39,6 @@ class ResponseUtilsTest(unittest.TestCase):
         r1 = Response("http://www.example.com", status=6666, headers={"Content-type": "text/html"}, body="Some body")
         self.assertEqual(response_httprepr(r1), 'HTTP/1.1 6666 \r\nContent-Type: text/html\r\n\r\nSome body')

-    def test_get_cached_beautifulsoup(self):
-        r1 = Response('http://www.example.com', body='')
-
-        soup1 = get_cached_beautifulsoup(r1)
-        soup2 = get_cached_beautifulsoup(r1)
-
-        assert isinstance(soup1, BeautifulSoup)
-        assert isinstance(soup2, BeautifulSoup)
-        # make sure it's cached
-        assert soup1 is soup2
-
-        # when body is None, an empty soup should be returned
-        r1 = Response('http://www.example.com')
-        assert r1.body == ""
-        assert isinstance(get_cached_beautifulsoup(r1), BeautifulSoup)
-
-        r1 = Response('http://www.example.com', body='')
-        soup1 = get_cached_beautifulsoup(r1)
-        r2 = r1.copy()
-        soup2 = get_cached_beautifulsoup(r1)
-        soup3 = get_cached_beautifulsoup(r2)
-
-        assert soup1 is soup2
-        assert soup1 is not soup3
-
     def test_open_in_browser(self):
         url = "http:///www.example.com/some/page.html"
         body = "<html> <head> <title>test page</title> </head> <body>test body</body> </html>"
@@ -43,14 +43,6 @@ def get_meta_refresh(response):
                                         response.encoding)
     return _metaref_cache[response]

-_beautifulsoup_cache = weakref.WeakKeyDictionary()
-def get_cached_beautifulsoup(response):
-    """Return BeautifulSoup object of the given response, with caching
-    support"""
-    if response not in _beautifulsoup_cache:
-        _beautifulsoup_cache[response] = BeautifulSoup(response.body)
-    return _beautifulsoup_cache[response]
-
 def response_status_message(status):
     """Return status code plus status text descriptive message

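
The deleted helper is the usual weakref memoization pattern: the parsed soup
is cached per response object, entries die with the response, and a copied
response gets its own entry, which is exactly what the removed test asserted.
A generic standalone sketch of the pattern (names invented, not scrapy code):

import weakref

_cache = weakref.WeakKeyDictionary()

def cached_result(obj, compute):
    # one entry per live object; collected together with the object
    if obj not in _cache:
        _cache[obj] = compute(obj)
    return _cache[obj]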
@@ -1,4 +1,8 @@
-"""Beautiful Soup
+"""
+FIXME: this module is only needed because scrapy.xlib.ClientForm uses it. We
+should remove it after we remove the scrapy.xlib.ClientForm module.
+
+Beautiful Soup
 Elixir and Tonic
 "The Screen-Scraper's Friend"
 http://www.crummy.com/software/BeautifulSoup/