
- Modified markup functions again; this time, in order to support utf-8 encoded strings (without interfering with unicode objects)

- Reverted changes in tests

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40545
elpolilla 2008-12-24 00:00:14 +00:00
parent e2b0870957
commit 95ed5d86a2
2 changed files with 59 additions and 3 deletions
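For context, the pattern this commit applies across scrapy.utils.markup is a small normalisation step at the top of each helper: byte strings are assumed to be utf-8 and are decoded to unicode before any processing, while unicode objects pass through untouched, so every function keeps returning unicode. A minimal, hypothetical sketch of that pattern (Python 2; the helper name _to_unicode is for illustration only and is not part of the patch):

# Illustrative sketch only, not part of this commit (Python 2).
def _to_unicode(text):
    # Byte strings are assumed to be utf-8; unicode objects pass through as-is.
    if not isinstance(text, unicode):
        text = text.decode('utf-8')
    return text

# Both call styles end up operating on (and returning) unicode.
assert isinstance(_to_unicode('Price: 100'), unicode)   # utf-8 str input
assert isinstance(_to_unicode(u'Price: 100'), unicode)  # unicode input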


@@ -7,6 +7,10 @@ from scrapy.utils.markup import unquote_markup

class UtilsMarkupTest(unittest.TestCase):

    def test_remove_entities(self):
        # make sure it always returns unicode
        assert isinstance(remove_entities('no entities'), unicode)
        assert isinstance(remove_entities('Price: £100!'), unicode)
        # regular conversions
        self.assertEqual(remove_entities(u'As low as £100!'),
                         u'As low as \xa3100!')
@@ -27,6 +31,9 @@ class UtilsMarkupTest(unittest.TestCase):

    def test_replace_tags(self):
        # make sure it always returns unicode
        assert isinstance(replace_tags('no entities'), unicode)
        self.assertEqual(replace_tags(u'This text contains <a>some tag</a>'),
                         u'This text contains some tag')
@@ -38,6 +45,10 @@ class UtilsMarkupTest(unittest.TestCase):
                         u'Click here')

    def test_remove_comments(self):
        # make sure it always returns unicode
        assert isinstance(remove_comments('without comments'), unicode)
        assert isinstance(remove_comments('<!-- with comments -->'), unicode)
        # text without comments
        self.assertEqual(remove_comments(u'text without comments'), u'text without comments')
@@ -46,6 +57,13 @@ class UtilsMarkupTest(unittest.TestCase):
        self.assertEqual(remove_comments(u'Hello<!--World-->'),u'Hello')

    def test_remove_tags(self):
        # make sure it always returns unicode
        assert isinstance(remove_tags('no tags'), unicode)
        assert isinstance(remove_tags('no tags', which_ones=('p',)), unicode)
        assert isinstance(remove_tags('<p>one tag</p>'), unicode)
        assert isinstance(remove_tags('<p>one tag</p>', which_ones=('p')), unicode)
        assert isinstance(remove_tags('<a>link</a>', which_ones=('b',)), unicode)
        # text without tags
        self.assertEqual(remove_tags(u'no tags'), u'no tags')
        self.assertEqual(remove_tags(u'no tags', which_ones=('p','b',)), u'no tags')
@@ -63,6 +81,12 @@ class UtilsMarkupTest(unittest.TestCase):
                         u'<p align="center" class="one">texty</p>')

    def test_remove_tags_with_content(self):
        # make sure it always returns unicode
        assert isinstance(remove_tags_with_content('no tags'), unicode)
        assert isinstance(remove_tags_with_content('no tags', which_ones=('p',)), unicode)
        assert isinstance(remove_tags_with_content('<p>one tag</p>', which_ones=('p',)), unicode)
        assert isinstance(remove_tags_with_content('<a>link</a>', which_ones=('b',)), unicode)
        # text without tags
        self.assertEqual(remove_tags_with_content(u'no tags'), u'no tags')
        self.assertEqual(remove_tags_with_content(u'no tags', which_ones=('p','b',)), u'no tags')
@@ -75,6 +99,10 @@ class UtilsMarkupTest(unittest.TestCase):
                         u'<b>not will removed</b>')

    def test_remove_escape_chars(self):
        # make sure it always returns unicode
        assert isinstance(remove_escape_chars('no ec'), unicode)
        assert isinstance(remove_escape_chars('no ec', which_ones=('\n','\t',)), unicode)
        # text without escape chars
        self.assertEqual(remove_escape_chars(u'no ec'), u'no ec')
        self.assertEqual(remove_escape_chars(u'no ec', which_ones=('\n',)), u'no ec')
@@ -90,6 +118,10 @@ class UtilsMarkupTest(unittest.TestCase):
        sample_txt2 = u'<node2>blah&amp;blah<![CDATA[blahblahblah!&pound;]]>moreblah&lt;&gt;</node2>'
        sample_txt3 = u'something&pound;&amp;more<node3><![CDATA[things, stuff, and such]]>what&quot;ever</node3><node4'
        # make sure it always returns unicode
        assert isinstance(unquote_markup(sample_txt1.encode('latin-1')), unicode)
        assert isinstance(unquote_markup(sample_txt2), unicode)
        self.assertEqual(unquote_markup(sample_txt1), u"""<node1>hi, this is sample text with entities: & \xa9
although this is inside a cdata! &amp; &quot;</node1>""")
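As the added assertions above illustrate, every helper is now expected to accept either a utf-8 byte string or a unicode object and to keep returning unicode. A hypothetical interactive session showing the intended behaviour (the results are expectations written from the patch, not captured output):

>>> from scrapy.utils.markup import remove_tags, remove_entities
>>> remove_tags('<p>caf\xc3\xa9</p>')   # utf-8 byte string in ...
u'caf\xe9'                              # ... unicode out
>>> remove_entities(u'3 &lt; 5')        # unicode in, entity converted
u'3 < 5'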


@@ -11,7 +11,7 @@ _tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL)

def remove_entities(text, keep=(), remove_illegal=True):
    """Remove entities from the given text.
    'text' must be a unicode string.
    'text' can be a unicode string or a regular string encoded as 'utf-8'
    If 'keep' is passed (with a list of entity names) those entities will
    be kept (they won't be removed).
@@ -45,24 +45,36 @@ def remove_entities(text, keep=(), remove_illegal=True):
        else:
            return u'&%s;' % m.group(2)

    if not isinstance(text, unicode):
        text = text.decode('utf-8')
    return _ent_re.sub(convert_entity, text)

def has_entities(text):
    if not isinstance(text, unicode):
        text = text.decode('utf-8')
    return bool(_ent_re.search(text))

def replace_tags(text, token=''):
    """Replace all markup tags found in the given text by the given token. By
    default token is a null string so it just removes all tags.
    'text' must be a unicode string.
    'text' can be a unicode string or a regular string encoded as 'utf-8'
    Always returns a unicode string.
    """
    if not isinstance(text, unicode):
        text = text.decode('utf-8')
    return _tag_re.sub(token, text)

def remove_comments(text):
    """ Remove HTML Comments. """
    if not isinstance(text, unicode):
        text = text.decode('utf-8')
    return re.sub('<!--.*?-->', u'', text, re.DOTALL)

def remove_tags(text, which_ones=()):
@@ -71,6 +83,9 @@ def remove_tags(text, which_ones=()):
    which_ones -- is a tuple of which tags we want to remove.
    if it is empty, remove all tags.
    """
    if not isinstance(text, unicode):
        text = text.decode('utf-8')

    if len(which_ones) > 0:
        tags = [ '<%s>|<%s .*?>|</%s>' % (tag,tag,tag) for tag in which_ones ]
        reg_exp_remove_tags = '|'.join(tags)
@@ -85,6 +100,9 @@ def remove_tags_with_content(text, which_ones=()):
    which_ones -- is a tuple of which tags, along with their content, we want to remove.
    if it is empty, do nothing.
    """
    if not isinstance(text, unicode):
        text = text.decode('utf-8')

    tags = [ '<%s.*?</%s>' % (tag,tag) for tag in which_ones ]
    re_tags_remove = re.compile('|'.join(tags), re.DOTALL)
    return re_tags_remove.sub(u'', text)
@@ -95,12 +113,15 @@ def remove_escape_chars(text, which_ones=('\n','\t','\r')):
    which_ones -- is a tuple of which escape chars we want to remove.
    By default removes \n, \t, \r.
    """
    if not isinstance(text, unicode):
        text = text.decode('utf-8')

    re_escape_chars = re.compile('[%s]' % ''.join(which_ones))
    return re_escape_chars.sub(u'', text)

def unquote_markup(text, keep=(), remove_illegal=True):
    """
    This function receives markup as a text (always a unicode string) and does the following:
    This function receives markup as a text (always a unicode string or a utf-8 encoded string) and does the following:
    - removes entities (except the ones in 'keep') from any part of it that is not inside a CDATA
    - searches for CDATAs and extracts their text (if any) without modifying it.
    - removes the found CDATAs
@ -118,6 +139,9 @@ def unquote_markup(text, keep=(), remove_illegal=True):
        fragments.append(txt[offset:])
        return fragments

    if not isinstance(text, unicode):
        text = text.decode('utf-8')

    ret_text = u''
    for fragment in _get_fragments(text, _cdata_re):
        if isinstance(fragment, basestring):
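For reference, the behaviour the unquote_markup docstring above describes can be pictured with a tiny, hypothetical example (results written out by hand from the described algorithm, not captured from a real session): entities outside a CDATA section are converted, the CDATA wrapper is dropped, and the text inside it is kept verbatim.

>>> from scrapy.utils.markup import unquote_markup
>>> unquote_markup(u'<a>&amp;<![CDATA[&amp;]]></a>')   # entity outside CDATA converted,
u'<a>&&amp;</a>'                                        # entity inside CDATA left as-is
>>> unquote_markup('<a>&pound;</a>')                    # utf-8 byte string input also accepted
u'<a>\xa3</a>'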