From f19442425a25fe794d31201c7ee3eb72674cf9c3 Mon Sep 17 00:00:00 2001
From: Pablo Hoffman
Date: Wed, 20 Jul 2011 17:41:53 -0300
Subject: [PATCH] forked UnicodeDammit from BeautifulSoup to explicitly disable usage of chardet library

---
 scrapy/http/response/dammit.py      | 269 ++++++++++++++++++++++++++++
 scrapy/http/response/text.py        |   2 +-
 scrapy/tests/test_utils_response.py |  29 +--
 scrapy/utils/response.py            |   8 -
 scrapy/xlib/BeautifulSoup.py        |   6 +-
 5 files changed, 276 insertions(+), 38 deletions(-)
 create mode 100644 scrapy/http/response/dammit.py

diff --git a/scrapy/http/response/dammit.py b/scrapy/http/response/dammit.py
new file mode 100644
index 000000000..b9254904d
--- /dev/null
+++ b/scrapy/http/response/dammit.py
@@ -0,0 +1,269 @@
+"""
+This module contains a fork of the UnicodeDammit class from BeautifulSoup,
+which explicitly disables any usage of the chardet library.
+
+The UnicodeDammit class is used as a last resort for detecting the encoding
+of a response.
+"""
+
+import re
+import codecs
+
+chardet = None  # we don't want to use chardet since it's very slow
+
+class UnicodeDammit:
+    """A class for detecting the encoding of a *ML document and
+    converting it to a Unicode string. If the source encoding is
+    windows-1252, can replace MS smart quotes with their HTML or XML
+    equivalents."""
+
+    # This dictionary maps commonly seen values for "charset" in HTML
+    # meta tags to the corresponding Python codec names. It only covers
+    # values that aren't in Python's aliases and can't be determined
+    # by the heuristics in find_codec.
+    CHARSET_ALIASES = { "macintosh" : "mac-roman",
+                        "x-sjis" : "shift-jis" }
+
+    def __init__(self, markup, overrideEncodings=[],
+                 smartQuotesTo='xml', isHTML=False):
+        self.declaredHTMLEncoding = None
+        self.markup, documentEncoding, sniffedEncoding = \
+                     self._detectEncoding(markup, isHTML)
+        self.smartQuotesTo = smartQuotesTo
+        self.triedEncodings = []
+        if markup == '' or isinstance(markup, unicode):
+            self.originalEncoding = None
+            self.unicode = unicode(markup)
+            return
+
+        u = None
+        for proposedEncoding in overrideEncodings:
+            u = self._convertFrom(proposedEncoding)
+            if u: break
+        if not u:
+            for proposedEncoding in (documentEncoding, sniffedEncoding):
+                u = self._convertFrom(proposedEncoding)
+                if u: break
+
+        # If no luck and we have auto-detection library, try that:
+        if not u and chardet and not isinstance(self.markup, unicode):
+            u = self._convertFrom(chardet.detect(self.markup)['encoding'])
+
+        # As a last resort, try utf-8 and windows-1252:
+        if not u:
+            for proposed_encoding in ("utf-8", "windows-1252"):
+                u = self._convertFrom(proposed_encoding)
+                if u: break
+
+        self.unicode = u
+        if not u: self.originalEncoding = None
+
+    def _subMSChar(self, orig):
+        """Changes a MS smart quote character to an XML or HTML
+        entity."""
+        sub = self.MS_CHARS.get(orig)
+        if isinstance(sub, tuple):
+            if self.smartQuotesTo == 'xml':
+                sub = '&#x%s;' % sub[1]
+            else:
+                sub = '&%s;' % sub[0]
+        return sub
+
+    def _convertFrom(self, proposed):
+        proposed = self.find_codec(proposed)
+        if not proposed or proposed in self.triedEncodings:
+            return None
+        self.triedEncodings.append(proposed)
+        markup = self.markup
+
+        # Convert smart quotes to HTML if coming from an encoding
+        # that might have them.
+        if self.smartQuotesTo and proposed.lower() in ("windows-1252",
+                                                       "iso-8859-1",
+                                                       "iso-8859-2"):
+            markup = re.compile("([\x80-\x9f])").sub \
+                     (lambda(x): self._subMSChar(x.group(1)),
+                      markup)
+
+        try:
+            # print "Trying to convert document to %s" % proposed
+            u = self._toUnicode(markup, proposed)
+            self.markup = u
+            self.originalEncoding = proposed
+        except Exception, e:
+            # print "That didn't work!"
+            # print e
+            return None
+        #print "Correct encoding: %s" % proposed
+        return self.markup
+
+    def _toUnicode(self, data, encoding):
+        '''Given a string and its encoding, decodes the string into Unicode.
+        %encoding is a string recognized by encodings.aliases'''
+
+        # strip Byte Order Mark (if present)
+        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
+               and (data[2:4] != '\x00\x00'):
+            encoding = 'utf-16be'
+            data = data[2:]
+        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
+                 and (data[2:4] != '\x00\x00'):
+            encoding = 'utf-16le'
+            data = data[2:]
+        elif data[:3] == '\xef\xbb\xbf':
+            encoding = 'utf-8'
+            data = data[3:]
+        elif data[:4] == '\x00\x00\xfe\xff':
+            encoding = 'utf-32be'
+            data = data[4:]
+        elif data[:4] == '\xff\xfe\x00\x00':
+            encoding = 'utf-32le'
+            data = data[4:]
+        newdata = unicode(data, encoding)
+        return newdata
+
+    def _detectEncoding(self, xml_data, isHTML=False):
+        """Given a document, tries to detect its XML encoding."""
+        xml_encoding = sniffed_xml_encoding = None
+        try:
+            if xml_data[:4] == '\x4c\x6f\xa7\x94':
+                # EBCDIC
+                xml_data = self._ebcdic_to_ascii(xml_data)
+            elif xml_data[:4] == '\x00\x3c\x00\x3f':
+                # UTF-16BE
+                sniffed_xml_encoding = 'utf-16be'
+                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
+            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
+                     and (xml_data[2:4] != '\x00\x00'):
+                # UTF-16BE with BOM
+                sniffed_xml_encoding = 'utf-16be'
+                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
+            elif xml_data[:4] == '\x3c\x00\x3f\x00':
+                # UTF-16LE
+                sniffed_xml_encoding = 'utf-16le'
+                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
+            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
+                     (xml_data[2:4] != '\x00\x00'):
+                # UTF-16LE with BOM
+                sniffed_xml_encoding = 'utf-16le'
+                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
+            elif xml_data[:4] == '\x00\x00\x00\x3c':
+                # UTF-32BE
+                sniffed_xml_encoding = 'utf-32be'
+                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
+            elif xml_data[:4] == '\x3c\x00\x00\x00':
+                # UTF-32LE
+                sniffed_xml_encoding = 'utf-32le'
+                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
+            elif xml_data[:4] == '\x00\x00\xfe\xff':
+                # UTF-32BE with BOM
+                sniffed_xml_encoding = 'utf-32be'
+                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
+            elif xml_data[:4] == '\xff\xfe\x00\x00':
+                # UTF-32LE with BOM
+                sniffed_xml_encoding = 'utf-32le'
+                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
+            elif xml_data[:3] == '\xef\xbb\xbf':
+                # UTF-8 with BOM
+                sniffed_xml_encoding = 'utf-8'
+                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
+            else:
+                sniffed_xml_encoding = 'ascii'
+                pass
+        except:
+            xml_encoding_match = None
+        xml_encoding_match = re.compile(
+            '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
+        if not xml_encoding_match and isHTML:
+            regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
+            xml_encoding_match = regexp.search(xml_data)
+        if xml_encoding_match is not None:
+            xml_encoding = xml_encoding_match.groups()[0].lower()
+            if isHTML:
+                self.declaredHTMLEncoding = xml_encoding
+            if sniffed_xml_encoding and \
+               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
+                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
+                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
+                                 'utf16', 'u16')):
+                xml_encoding = sniffed_xml_encoding
+        return xml_data, xml_encoding, sniffed_xml_encoding
+
+
+    def find_codec(self, charset):
+        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
+               or (charset and self._codec(charset.replace("-", ""))) \
+               or (charset and self._codec(charset.replace("-", "_"))) \
+               or charset
+
+    def _codec(self, charset):
+        if not charset: return charset
+        codec = None
+        try:
+            codecs.lookup(charset)
+            codec = charset
+        except (LookupError, ValueError):
+            pass
+        return codec
+
+    EBCDIC_TO_ASCII_MAP = None
+    def _ebcdic_to_ascii(self, s):
+        c = self.__class__
+        if not c.EBCDIC_TO_ASCII_MAP:
+            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
+                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
+                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
+                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
+                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
+                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
+                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
+                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
+                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
+                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
+                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
+                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
+                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
+                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
+                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
+                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
+                    250,251,252,253,254,255)
+            import string
+            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
+            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
+        return s.translate(c.EBCDIC_TO_ASCII_MAP)
+
+    MS_CHARS = { '\x80' : ('euro', '20AC'),
+                 '\x81' : ' ',
+                 '\x82' : ('sbquo', '201A'),
+                 '\x83' : ('fnof', '192'),
+                 '\x84' : ('bdquo', '201E'),
+                 '\x85' : ('hellip', '2026'),
+                 '\x86' : ('dagger', '2020'),
+                 '\x87' : ('Dagger', '2021'),
+                 '\x88' : ('circ', '2C6'),
+                 '\x89' : ('permil', '2030'),
+                 '\x8A' : ('Scaron', '160'),
+                 '\x8B' : ('lsaquo', '2039'),
+                 '\x8C' : ('OElig', '152'),
+                 '\x8D' : '?',
+                 '\x8E' : ('#x17D', '17D'),
+                 '\x8F' : '?',
+                 '\x90' : '?',
+                 '\x91' : ('lsquo', '2018'),
+                 '\x92' : ('rsquo', '2019'),
+                 '\x93' : ('ldquo', '201C'),
+                 '\x94' : ('rdquo', '201D'),
+                 '\x95' : ('bull', '2022'),
+                 '\x96' : ('ndash', '2013'),
+                 '\x97' : ('mdash', '2014'),
+                 '\x98' : ('tilde', '2DC'),
+                 '\x99' : ('trade', '2122'),
+                 '\x9a' : ('scaron', '161'),
+                 '\x9b' : ('rsaquo', '203A'),
+                 '\x9c' : ('oelig', '153'),
+                 '\x9d' : '?',
+                 '\x9e' : ('#x17E', '17E'),
+                 '\x9f' : ('Yuml', ''),}
+
+#######################################################################
+
diff --git a/scrapy/http/response/text.py b/scrapy/http/response/text.py
index f707f42d2..11c52236b 100644
--- a/scrapy/http/response/text.py
+++ b/scrapy/http/response/text.py
@@ -7,7 +7,7 @@ See documentation in docs/topics/request-response.rst

 import re
 import codecs
-from scrapy.xlib.BeautifulSoup import UnicodeDammit
+from scrapy.http.response.dammit import UnicodeDammit
 from scrapy.http.response import Response
 from scrapy.utils.python import memoizemethod_noargs
 from scrapy.utils.encoding import encoding_exists, resolve_encoding
diff --git a/scrapy/tests/test_utils_response.py b/scrapy/tests/test_utils_response.py
index 92acb4d1b..6cde6332c 100644
--- a/scrapy/tests/test_utils_response.py
+++ b/scrapy/tests/test_utils_response.py
@@ -2,10 +2,8 @@ import os
 import unittest
 import urlparse

-from scrapy.xlib.BeautifulSoup import BeautifulSoup
 from scrapy.http import Response, TextResponse, HtmlResponse
-from scrapy.utils.response import body_or_str, get_base_url, get_meta_refresh, \
-    response_httprepr, get_cached_beautifulsoup, open_in_browser
+from scrapy.utils.response import body_or_str, response_httprepr, open_in_browser

 __doctests__ = ['scrapy.utils.response']

@@ -41,31 +39,6 @@ class ResponseUtilsTest(unittest.TestCase):
         r1 = Response("http://www.example.com", status=6666, headers={"Content-type": "text/html"}, body="Some body")
         self.assertEqual(response_httprepr(r1), 'HTTP/1.1 6666 \r\nContent-Type: text/html\r\n\r\nSome body')

-    def test_get_cached_beautifulsoup(self):
-        r1 = Response('http://www.example.com', body='')
-
-        soup1 = get_cached_beautifulsoup(r1)
-        soup2 = get_cached_beautifulsoup(r1)
-
-        assert isinstance(soup1, BeautifulSoup)
-        assert isinstance(soup2, BeautifulSoup)
-        # make sure it's cached
-        assert soup1 is soup2
-
-        # when body is None, an empty soup should be returned
-        r1 = Response('http://www.example.com')
-        assert r1.body == ""
-        assert isinstance(get_cached_beautifulsoup(r1), BeautifulSoup)
-
-        r1 = Response('http://www.example.com', body='')
-        soup1 = get_cached_beautifulsoup(r1)
-        r2 = r1.copy()
-        soup2 = get_cached_beautifulsoup(r1)
-        soup3 = get_cached_beautifulsoup(r2)
-
-        assert soup1 is soup2
-        assert soup1 is not soup3
-
     def test_open_in_browser(self):
         url = "http:///www.example.com/some/page.html"
         body = " test page test body "
diff --git a/scrapy/utils/response.py b/scrapy/utils/response.py
index cd7807b23..668e344f0 100644
--- a/scrapy/utils/response.py
+++ b/scrapy/utils/response.py
@@ -43,14 +43,6 @@ def get_meta_refresh(response):
                                        response.encoding)
     return _metaref_cache[response]

-_beautifulsoup_cache = weakref.WeakKeyDictionary()
-def get_cached_beautifulsoup(response):
-    """Return BeautifulSoup object of the given response, with caching
-    support"""
-    if response not in _beautifulsoup_cache:
-        _beautifulsoup_cache[response] = BeautifulSoup(response.body)
-    return _beautifulsoup_cache[response]
-
 def response_status_message(status):
     """Return status code plus status text descriptive message

diff --git a/scrapy/xlib/BeautifulSoup.py b/scrapy/xlib/BeautifulSoup.py
index 748e6fe4b..52672525d 100644
--- a/scrapy/xlib/BeautifulSoup.py
+++ b/scrapy/xlib/BeautifulSoup.py
@@ -1,4 +1,8 @@
-"""Beautiful Soup
+"""
+FIXME: this module is only needed because scrapy.xlib.ClientForm uses it. We
+should remove it after we remove the scrapy.xlib.ClientForm module.
+
+Beautiful Soup
 Elixir and Tonic
 "The Screen-Scraper's Friend"
 http://www.crummy.com/software/BeautifulSoup/
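
For reference, here is a minimal usage sketch (Python 2, matching the codebase) of how the forked class is meant to be driven during response decoding. Only the UnicodeDammit API visible in the diff above (the constructor arguments, .unicode and .originalEncoding) is taken from the patch; the to_unicode() helper, the sample bytes and the error handling are illustrative assumptions, not part of the change.

# Illustrative sketch only -- not part of the patch.
from scrapy.http.response.dammit import UnicodeDammit

def to_unicode(body, declared_encoding=None):
    """Decode a raw response body: try the declared encoding first, then fall
    back to UnicodeDammit's own heuristics (BOM and declaration sniffing, then
    utf-8 and windows-1252). chardet is never consulted because the fork pins
    chardet = None."""
    candidates = [declared_encoding] if declared_encoding else []
    converted = UnicodeDammit(body, overrideEncodings=candidates, isHTML=True)
    if converted.unicode is None:
        # every candidate encoding failed to decode the body
        raise ValueError("could not detect response encoding")
    return converted.unicode, converted.originalEncoding

# '\xe9' is not valid utf-8, so the windows-1252 fallback wins here:
text, encoding = to_unicode('caf\xe9')
assert text == u'caf\xe9' and encoding == 'windows-1252'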