1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-26 01:23:56 +00:00

forked UnicodeDammit from BeautifulSoup to explicitly disable usage of chardet library

This commit is contained in:
Pablo Hoffman 2011-07-20 17:41:53 -03:00
parent 7d18fe18e2
commit f19442425a
5 changed files with 276 additions and 38 deletions

View File

@ -0,0 +1,269 @@
"""
This module contains a fork of the UnicodeDammit class from BeautifulSoup, that
expliclty disabled any usage of chardet library.
The UnicodeDammit class is used as a last resource for detecting the encoding
of a response.
"""
import re
import codecs
chardet = None # we don't want to use chardet since it's very slow,
class UnicodeDammit:
"""A class for detecting the encoding of a *ML document and
converting it to a Unicode string. If the source encoding is
windows-1252, can replace MS smart quotes with their HTML or XML
equivalents."""
# This dictionary maps commonly seen values for "charset" in HTML
# meta tags to the corresponding Python codec names. It only covers
# values that aren't in Python's aliases and can't be determined
# by the heuristics in find_codec.
CHARSET_ALIASES = { "macintosh" : "mac-roman",
"x-sjis" : "shift-jis" }
def __init__(self, markup, overrideEncodings=[],
smartQuotesTo='xml', isHTML=False):
self.declaredHTMLEncoding = None
self.markup, documentEncoding, sniffedEncoding = \
self._detectEncoding(markup, isHTML)
self.smartQuotesTo = smartQuotesTo
self.triedEncodings = []
if markup == '' or isinstance(markup, unicode):
self.originalEncoding = None
self.unicode = unicode(markup)
return
u = None
for proposedEncoding in overrideEncodings:
u = self._convertFrom(proposedEncoding)
if u: break
if not u:
for proposedEncoding in (documentEncoding, sniffedEncoding):
u = self._convertFrom(proposedEncoding)
if u: break
# If no luck and we have auto-detection library, try that:
if not u and chardet and not isinstance(self.markup, unicode):
u = self._convertFrom(chardet.detect(self.markup)['encoding'])
# As a last resort, try utf-8 and windows-1252:
if not u:
for proposed_encoding in ("utf-8", "windows-1252"):
u = self._convertFrom(proposed_encoding)
if u: break
self.unicode = u
if not u: self.originalEncoding = None
def _subMSChar(self, orig):
"""Changes a MS smart quote character to an XML or HTML
entity."""
sub = self.MS_CHARS.get(orig)
if isinstance(sub, tuple):
if self.smartQuotesTo == 'xml':
sub = '&#x%s;' % sub[1]
else:
sub = '&%s;' % sub[0]
return sub
def _convertFrom(self, proposed):
proposed = self.find_codec(proposed)
if not proposed or proposed in self.triedEncodings:
return None
self.triedEncodings.append(proposed)
markup = self.markup
# Convert smart quotes to HTML if coming from an encoding
# that might have them.
if self.smartQuotesTo and proposed.lower() in("windows-1252",
"iso-8859-1",
"iso-8859-2"):
markup = re.compile("([\x80-\x9f])").sub \
(lambda(x): self._subMSChar(x.group(1)),
markup)
try:
# print "Trying to convert document to %s" % proposed
u = self._toUnicode(markup, proposed)
self.markup = u
self.originalEncoding = proposed
except Exception, e:
# print "That didn't work!"
# print e
return None
#print "Correct encoding: %s" % proposed
return self.markup
def _toUnicode(self, data, encoding):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
# strip Byte Order Mark (if present)
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
and (data[2:4] != '\x00\x00'):
encoding = 'utf-16be'
data = data[2:]
elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
and (data[2:4] != '\x00\x00'):
encoding = 'utf-16le'
data = data[2:]
elif data[:3] == '\xef\xbb\xbf':
encoding = 'utf-8'
data = data[3:]
elif data[:4] == '\x00\x00\xfe\xff':
encoding = 'utf-32be'
data = data[4:]
elif data[:4] == '\xff\xfe\x00\x00':
encoding = 'utf-32le'
data = data[4:]
newdata = unicode(data, encoding)
return newdata
def _detectEncoding(self, xml_data, isHTML=False):
"""Given a document, tries to detect its XML encoding."""
xml_encoding = sniffed_xml_encoding = None
try:
if xml_data[:4] == '\x4c\x6f\xa7\x94':
# EBCDIC
xml_data = self._ebcdic_to_ascii(xml_data)
elif xml_data[:4] == '\x00\x3c\x00\x3f':
# UTF-16BE
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
and (xml_data[2:4] != '\x00\x00'):
# UTF-16BE with BOM
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
elif xml_data[:4] == '\x3c\x00\x3f\x00':
# UTF-16LE
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
(xml_data[2:4] != '\x00\x00'):
# UTF-16LE with BOM
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
elif xml_data[:4] == '\x00\x00\x00\x3c':
# UTF-32BE
sniffed_xml_encoding = 'utf-32be'
xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
elif xml_data[:4] == '\x3c\x00\x00\x00':
# UTF-32LE
sniffed_xml_encoding = 'utf-32le'
xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
elif xml_data[:4] == '\x00\x00\xfe\xff':
# UTF-32BE with BOM
sniffed_xml_encoding = 'utf-32be'
xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
elif xml_data[:4] == '\xff\xfe\x00\x00':
# UTF-32LE with BOM
sniffed_xml_encoding = 'utf-32le'
xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
elif xml_data[:3] == '\xef\xbb\xbf':
# UTF-8 with BOM
sniffed_xml_encoding = 'utf-8'
xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
else:
sniffed_xml_encoding = 'ascii'
pass
except:
xml_encoding_match = None
xml_encoding_match = re.compile(
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
if not xml_encoding_match and isHTML:
regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
xml_encoding_match = regexp.search(xml_data)
if xml_encoding_match is not None:
xml_encoding = xml_encoding_match.groups()[0].lower()
if isHTML:
self.declaredHTMLEncoding = xml_encoding
if sniffed_xml_encoding and \
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
'utf-16', 'utf-32', 'utf_16', 'utf_32',
'utf16', 'u16')):
xml_encoding = sniffed_xml_encoding
return xml_data, xml_encoding, sniffed_xml_encoding
def find_codec(self, charset):
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
or (charset and self._codec(charset.replace("-", ""))) \
or (charset and self._codec(charset.replace("-", "_"))) \
or charset
def _codec(self, charset):
if not charset: return charset
codec = None
try:
codecs.lookup(charset)
codec = charset
except (LookupError, ValueError):
pass
return codec
EBCDIC_TO_ASCII_MAP = None
def _ebcdic_to_ascii(self, s):
c = self.__class__
if not c.EBCDIC_TO_ASCII_MAP:
emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
201,202,106,107,108,109,110,111,112,113,114,203,204,205,
206,207,208,209,126,115,116,117,118,119,120,121,122,210,
211,212,213,214,215,216,217,218,219,220,221,222,223,224,
225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
250,251,252,253,254,255)
import string
c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
''.join(map(chr, range(256))), ''.join(map(chr, emap)))
return s.translate(c.EBCDIC_TO_ASCII_MAP)
MS_CHARS = { '\x80' : ('euro', '20AC'),
'\x81' : ' ',
'\x82' : ('sbquo', '201A'),
'\x83' : ('fnof', '192'),
'\x84' : ('bdquo', '201E'),
'\x85' : ('hellip', '2026'),
'\x86' : ('dagger', '2020'),
'\x87' : ('Dagger', '2021'),
'\x88' : ('circ', '2C6'),
'\x89' : ('permil', '2030'),
'\x8A' : ('Scaron', '160'),
'\x8B' : ('lsaquo', '2039'),
'\x8C' : ('OElig', '152'),
'\x8D' : '?',
'\x8E' : ('#x17D', '17D'),
'\x8F' : '?',
'\x90' : '?',
'\x91' : ('lsquo', '2018'),
'\x92' : ('rsquo', '2019'),
'\x93' : ('ldquo', '201C'),
'\x94' : ('rdquo', '201D'),
'\x95' : ('bull', '2022'),
'\x96' : ('ndash', '2013'),
'\x97' : ('mdash', '2014'),
'\x98' : ('tilde', '2DC'),
'\x99' : ('trade', '2122'),
'\x9a' : ('scaron', '161'),
'\x9b' : ('rsaquo', '203A'),
'\x9c' : ('oelig', '153'),
'\x9d' : '?',
'\x9e' : ('#x17E', '17E'),
'\x9f' : ('Yuml', ''),}
#######################################################################

View File

@ -7,7 +7,7 @@ See documentation in docs/topics/request-response.rst
import re
import codecs
from scrapy.xlib.BeautifulSoup import UnicodeDammit
from scrapy.http.response.dammit import UnicodeDammit
from scrapy.http.response import Response
from scrapy.utils.python import memoizemethod_noargs
from scrapy.utils.encoding import encoding_exists, resolve_encoding

View File

@ -2,10 +2,8 @@ import os
import unittest
import urlparse
from scrapy.xlib.BeautifulSoup import BeautifulSoup
from scrapy.http import Response, TextResponse, HtmlResponse
from scrapy.utils.response import body_or_str, get_base_url, get_meta_refresh, \
response_httprepr, get_cached_beautifulsoup, open_in_browser
from scrapy.utils.response import body_or_str, response_httprepr, open_in_browser
__doctests__ = ['scrapy.utils.response']
@ -41,31 +39,6 @@ class ResponseUtilsTest(unittest.TestCase):
r1 = Response("http://www.example.com", status=6666, headers={"Content-type": "text/html"}, body="Some body")
self.assertEqual(response_httprepr(r1), 'HTTP/1.1 6666 \r\nContent-Type: text/html\r\n\r\nSome body')
def test_get_cached_beautifulsoup(self):
r1 = Response('http://www.example.com', body='')
soup1 = get_cached_beautifulsoup(r1)
soup2 = get_cached_beautifulsoup(r1)
assert isinstance(soup1, BeautifulSoup)
assert isinstance(soup2, BeautifulSoup)
# make sure it's cached
assert soup1 is soup2
# when body is None, an empty soup should be returned
r1 = Response('http://www.example.com')
assert r1.body == ""
assert isinstance(get_cached_beautifulsoup(r1), BeautifulSoup)
r1 = Response('http://www.example.com', body='')
soup1 = get_cached_beautifulsoup(r1)
r2 = r1.copy()
soup2 = get_cached_beautifulsoup(r1)
soup3 = get_cached_beautifulsoup(r2)
assert soup1 is soup2
assert soup1 is not soup3
def test_open_in_browser(self):
url = "http:///www.example.com/some/page.html"
body = "<html> <head> <title>test page</title> </head> <body>test body</body> </html>"

View File

@ -43,14 +43,6 @@ def get_meta_refresh(response):
response.encoding)
return _metaref_cache[response]
_beautifulsoup_cache = weakref.WeakKeyDictionary()
def get_cached_beautifulsoup(response):
"""Return BeautifulSoup object of the given response, with caching
support"""
if response not in _beautifulsoup_cache:
_beautifulsoup_cache[response] = BeautifulSoup(response.body)
return _beautifulsoup_cache[response]
def response_status_message(status):
"""Return status code plus status text descriptive message

View File

@ -1,4 +1,8 @@
"""Beautiful Soup
"""
FIXME: this module is only needed beacuse scrapy.xlib.ClientForm uses it. We
should remove it after we remove the scrapy.xlib.ClientForm module.
Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/