mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-25 13:23:54 +00:00
Fixed a bug in unicode support. On some platforms the empty string ('') is decoded as ASCII, independently of Python's default encoding, so it was changed to u''.
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40309
This commit is contained in:
parent
a309dfeb7d
commit
05f4a26cca
@ -63,7 +63,7 @@ def replace_tags(text, token=''):
|
||||
|
||||
def remove_comments(text):
|
||||
""" Remove HTML Comments. """
|
||||
return re.sub('<!--.*?-->', '', text.decode('utf-8'), re.DOTALL)
|
||||
return re.sub('<!--.*?-->', u'', text.decode('utf-8'), re.DOTALL)
|
||||
|
||||
def remove_tags(text, which_ones=()):
|
||||
""" Remove HTML Tags only.
|
||||
@ -77,7 +77,7 @@ def remove_tags(text, which_ones=()):
|
||||
else:
|
||||
reg_exp_remove_tags = '<.*?>'
|
||||
re_tags = re.compile(reg_exp_remove_tags, re.DOTALL)
|
||||
return re_tags.sub('', text.decode('utf-8'))
|
||||
return re_tags.sub(u'', text.decode('utf-8'))
|
||||
|
||||
def remove_tags_with_content(text, which_ones=()):
|
||||
""" Remove tags and its content.
|
||||
@ -87,7 +87,7 @@ def remove_tags_with_content(text, which_ones=()):
|
||||
"""
|
||||
tags = [ '<%s.*?</%s>' % (tag,tag) for tag in which_ones ]
|
||||
re_tags_remove = re.compile('|'.join(tags), re.DOTALL)
|
||||
return re_tags_remove.sub('', text.decode('utf-8'))
|
||||
return re_tags_remove.sub(u'', text.decode('utf-8'))
|
||||
|
||||
def remove_escape_chars(text, which_ones=('\n','\t','\r')):
|
||||
""" Remove escape chars. Default : \\n, \\t, \\r
|
||||
@ -96,5 +96,5 @@ def remove_escape_chars(text, which_ones=('\n','\t','\r')):
|
||||
By default removes \n, \t, \r.
|
||||
"""
|
||||
re_escape_chars = re.compile('[%s]' % ''.join(which_ones))
|
||||
return re_escape_chars.sub('', text.decode('utf-8'))
|
||||
return re_escape_chars.sub(u'', text.decode('utf-8'))
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user