diff --git a/scrapy/trunk/scrapy/tests/test_utils_markup.py b/scrapy/trunk/scrapy/tests/test_utils_markup.py index 01121133e..b466dce88 100644 --- a/scrapy/trunk/scrapy/tests/test_utils_markup.py +++ b/scrapy/trunk/scrapy/tests/test_utils_markup.py @@ -7,6 +7,10 @@ from scrapy.utils.markup import unquote_markup class UtilsMarkupTest(unittest.TestCase): def test_remove_entities(self): + # make sure it always return unicode + assert isinstance(remove_entities('no entities'), unicode) + assert isinstance(remove_entities('Price: £100!'), unicode) + # regular conversions self.assertEqual(remove_entities(u'As low as £100!'), u'As low as \xa3100!') @@ -27,6 +31,9 @@ class UtilsMarkupTest(unittest.TestCase): def test_replace_tags(self): + # make sure it always return unicode + assert isinstance(replace_tags('no entities'), unicode) + self.assertEqual(replace_tags(u'This text contains some tag'), u'This text contains some tag') @@ -38,6 +45,10 @@ class UtilsMarkupTest(unittest.TestCase): u'Click here') def test_remove_comments(self): + # make sure it always return unicode + assert isinstance(remove_comments('without comments'), unicode) + assert isinstance(remove_comments(''), unicode) + # text without comments self.assertEqual(remove_comments(u'text without comments'), u'text without comments') @@ -46,6 +57,13 @@ class UtilsMarkupTest(unittest.TestCase): self.assertEqual(remove_comments(u'Hello'),u'Hello') def test_remove_tags(self): + # make sure it always return unicode + assert isinstance(remove_tags('no tags'), unicode) + assert isinstance(remove_tags('no tags', which_ones=('p',)), unicode) + assert isinstance(remove_tags('

one tag

'), unicode) + assert isinstance(remove_tags('

one tag

', which_ones=('p')), unicode) + assert isinstance(remove_tags('link', which_ones=('b',)), unicode) + # text without tags self.assertEqual(remove_tags(u'no tags'), u'no tags') self.assertEqual(remove_tags(u'no tags', which_ones=('p','b',)), u'no tags') @@ -63,6 +81,12 @@ class UtilsMarkupTest(unittest.TestCase): u'

texty

') def test_remove_tags_with_content(self): + # make sure it always return unicode + assert isinstance(remove_tags_with_content('no tags'), unicode) + assert isinstance(remove_tags_with_content('no tags', which_ones=('p',)), unicode) + assert isinstance(remove_tags_with_content('

one tag

', which_ones=('p',)), unicode) + assert isinstance(remove_tags_with_content('link', which_ones=('b',)), unicode) + # text without tags self.assertEqual(remove_tags_with_content(u'no tags'), u'no tags') self.assertEqual(remove_tags_with_content(u'no tags', which_ones=('p','b',)), u'no tags') @@ -75,6 +99,10 @@ class UtilsMarkupTest(unittest.TestCase): u'not will removed') def test_remove_escape_chars(self): + # make sure it always return unicode + assert isinstance(remove_escape_chars('no ec'), unicode) + assert isinstance(remove_escape_chars('no ec', which_ones=('\n','\t',)), unicode) + # text without escape chars self.assertEqual(remove_escape_chars(u'no ec'), u'no ec') self.assertEqual(remove_escape_chars(u'no ec', which_ones=('\n',)), u'no ec') @@ -90,6 +118,10 @@ class UtilsMarkupTest(unittest.TestCase): sample_txt2 = u'blah&blahmoreblah<>' sample_txt3 = u'something£&morewhat"everhi, this is sample text with entities: & \xa9 although this is inside a cdata! & """") diff --git a/scrapy/trunk/scrapy/utils/markup.py b/scrapy/trunk/scrapy/utils/markup.py index a27a72794..f115ed13f 100644 --- a/scrapy/trunk/scrapy/utils/markup.py +++ b/scrapy/trunk/scrapy/utils/markup.py @@ -11,7 +11,7 @@ _tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL) def remove_entities(text, keep=(), remove_illegal=True): """Remove entities from the given text. - 'text' must be a unicode string. + 'text' can be a unicode string or a regular string encoded as 'utf-8' If 'keep' is passed (with a list of entity names) those entities will be kept (they won't be removed). 
@@ -45,24 +45,36 @@ def remove_entities(text, keep=(), remove_illegal=True): else: return u'&%s;' % m.group(2) + if not isinstance(text, unicode): + text = text.decode('utf-8') + return _ent_re.sub(convert_entity, text) def has_entities(text): + if not isinstance(text, unicode): + text = text.decode('utf-8') + return bool(_ent_re.search(text)) def replace_tags(text, token=''): """Replace all markup tags found in the given text by the given token. By default token is a null string so it just remove all tags. - 'text' must be a unicode string. + 'text' can be a unicode string or a regular string encoded as 'utf-8' Always returns a unicode string. """ + if not isinstance(text, unicode): + text = text.decode('utf-8') + return _tag_re.sub(token, text) def remove_comments(text): """ Remove HTML Comments. """ + if not isinstance(text, unicode): + text = text.decode('utf-8') + return re.sub('', u'', text, re.DOTALL) def remove_tags(text, which_ones=()): @@ -71,6 +83,9 @@ def remove_tags(text, which_ones=()): which_ones -- is a tuple of which tags we want to remove. if is empty remove all tags. """ + if not isinstance(text, unicode): + text = text.decode('utf-8') + if len(which_ones) > 0: tags = [ '<%s>|<%s .*?>|' % (tag,tag,tag) for tag in which_ones ] reg_exp_remove_tags = '|'.join(tags) @@ -85,6 +100,9 @@ def remove_tags_with_content(text, which_ones=()): which_ones -- is a tuple of which tags with its content we want to remove. if is empty do nothing. """ + if not isinstance(text, unicode): + text = text.decode('utf-8') + tags = [ '<%s.*?' % (tag,tag) for tag in which_ones ] re_tags_remove = re.compile('|'.join(tags), re.DOTALL) return re_tags_remove.sub(u'', text) @@ -95,12 +113,15 @@ def remove_escape_chars(text, which_ones=('\n','\t','\r')): which_ones -- is a tuple of which escape chars we want to remove. By default removes \n, \t, \r. 
""" + if not isinstance(text, unicode): + text = text.decode('utf-8') + re_escape_chars = re.compile('[%s]' % ''.join(which_ones)) return re_escape_chars.sub(u'', text) def unquote_markup(text, keep=(), remove_illegal=True): """ - This function receives markup as a text (always a unicode string) and does the following: + This function receives markup as a text (always a unicode string or a utf-8 encoded string) and does the following: - removes entities (except the ones in 'keep') from any part of it that it's not inside a CDATA - searches for CDATAs and extracts their text (if any) without modifying it. - removes the found CDATAs @@ -118,6 +139,9 @@ def unquote_markup(text, keep=(), remove_illegal=True): fragments.append(txt[offset:]) return fragments + if not isinstance(text, unicode): + text = text.decode('utf-8') + ret_text = u'' for fragment in _get_fragments(text, _cdata_re): if isinstance(fragment, basestring):