1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-26 17:24:38 +00:00

. Updated some adaptors docstrings

. Turned Delist and Unquote adaptors into factory functions instead of classes
. Updated tests

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40765
This commit is contained in:
elpolilla 2009-01-26 01:15:05 +00:00
parent 33df3c81c2
commit dbaf602730
6 changed files with 61 additions and 56 deletions

View File

@ -119,15 +119,6 @@ class RobustScrapedItem(ScrapedItem):
else:
raise NotImplementedError('You must override _add_single_attributes method in order to join %s values into a single value.' % attrtype.__name__)
def _clean_values(self, values):
ret = []
for val in values:
if isinstance(val, tuple):
ret.extend(val)
elif val:
ret.append(val)
return ret
def attribute(self, attrname, *values, **kwargs):
"""
Set the provided values to the provided attribute (`attrname`) by filtering them
@ -146,6 +137,15 @@ class RobustScrapedItem(ScrapedItem):
The kwargs parameter is passed to the adaptors pipeline, which manages to transmit
it to the adaptors themselves.
"""
def _clean_values(values):
ret = []
for val in values:
if isinstance(val, tuple):
ret.extend(val)
elif val:
ret.append(val)
return ret
if not values:
raise UsageError("You must specify at least one value when setting an attribute")
if attrname not in self.ATTRIBUTES:
@ -162,7 +162,7 @@ class RobustScrapedItem(ScrapedItem):
multivalued = isinstance(attrtype, list)
adaptors_pipe = self._adaptors_dict.get(attrname)
new_values = [adaptors_pipe(value, kwargs) for value in values] if adaptors_pipe else [values]
new_values = self._clean_values(new_values)
new_values = _clean_values(new_values)
if old_value and not override:
if multivalued:

View File

@ -1,4 +1,4 @@
from scrapy.contrib_exp.adaptors.extraction import extract, ExtractImageLinks
from scrapy.contrib_exp.adaptors.markup import remove_tags, remove_root, Unquote
from scrapy.contrib_exp.adaptors.misc import to_unicode, clean_spaces, strip, drop_empty, Delist, Regex
from scrapy.utils.python import flatten
from scrapy.contrib_exp.adaptors.markup import remove_tags, remove_root, unquote
from scrapy.contrib_exp.adaptors.misc import to_unicode, clean_spaces, strip, drop_empty, delist, Regex

View File

@ -3,20 +3,25 @@ from scrapy.utils.markup import replace_tags, unquote_markup
from scrapy.utils.python import str_to_unicode
from scrapy.item.adaptors import adaptize
def remove_tags(value):
def remove_tags(tags=()):
"""
Removes any tags found in each of the provided list's string.
E.g:
>> remove_tags('<head>my header</head>', '<body>my <b>body</b></body>')
u'my header', u'my body'
Factory that returns an adaptor for removing each
tag in the `tags` parameter found in the given value.
If no `tags` are specified, all of them are removed.
Input: string/unicode
Output: unicode
"""
return replace_tags(value)
def _remove_tags(value):
return replace_tags(value, tags)
return _remove_tags
_remove_root_re = re.compile(r'^\s*<.*?>(.*)</.*>\s*$', re.DOTALL)
def remove_root(value):
"""
This adaptor removes the root tag of the given string/unicode,
if it's found.
Input: string/unicode
Output: unicode
"""
@ -25,18 +30,20 @@ def remove_root(value):
value = m.group(1)
return str_to_unicode(value)
class Unquote(object):
def unquote(keep=None):
"""
Receives a list of strings, removes all of the
CDATAs and entities (except the ones in CDATAs) the strings
may have, and returns a new list.
This factory returns an adaptor that
receives a string or unicode, removes all of the
CDATAs and entities (except the ones in CDATAs, and the ones
you specify in the `keep` parameter), and then returns a new
string or unicode.
Input: string/unicode
Output: string/unicode
"""
def __init__(self, keep=None):
self.keep = [] if keep is None else keep
default_keep = [] if keep is None else keep
def __call__(self, value, adaptor_args):
keep = adaptor_args.get('keep_entities', self.keep)
def unquote(value, adaptor_args):
keep = adaptor_args.get('keep_entities', default_keep)
return unquote_markup(value, keep=keep)
return unquote

View File

@ -8,14 +8,15 @@ from scrapy.item.adaptors import adaptize
def to_unicode(value, adaptor_args):
"""
Receives a list of strings, converts
it to unicode, and returns a new list.
Receives a string and converts it to unicode
using the given encoding (if specified, else utf-8 is used)
and returns a new unicode object.
E.g:
>> to_unicode(['it costs 20€, or 30£'])
>> to_unicode('it costs 20\xe2\x82\xac, or 30\xc2\xa3')
u'it costs 20\u20ac, or 30\xa3'
Input: iterable of strings
Output: list of unicodes
Input: string
Output: unicode
"""
if not isinstance(value, basestring):
value = str(value)
@ -24,14 +25,13 @@ def to_unicode(value, adaptor_args):
_clean_spaces_re = re.compile("\s+", re.U)
def clean_spaces(value):
"""
Converts multispaces into single spaces for each string
in the provided iterable.
Converts multispaces into single spaces for the given string.
E.g:
>> clean_spaces(['Hello sir'])
[u'Hello sir']
>> clean_spaces(u'Hello sir')
u'Hello sir'
Input: iterable of unicodes
Output: list of unicodes
Input: string/unicode
Output: unicode
"""
return _clean_spaces_re.sub(' ', str_to_unicode(value))
@ -61,20 +61,18 @@ def drop_empty(value):
"""
return [ v for v in value if v ]
class Delist(object):
def delist(delimiter=''):
"""
Joins a list with the specified delimiter
in the adaptor's constructor.
This factory returns an adaptor that joins
an iterable with the specified delimiter.
Input: iterable of strings/unicodes
Output: string/unicode
"""
def __init__(self, delimiter=''):
self.delimiter = delimiter
def __call__(self, value, adaptor_args):
delimiter = adaptor_args.get('join_delimiter', self.delimiter)
return delimiter.join(value)
def delist(value, adaptor_args):
delim = adaptor_args.get('join_delimiter', delimiter)
return delim.join(value)
return delist
class Regex(object):
"""

View File

@ -162,18 +162,18 @@ class AdaptorsTestCase(unittest.TestCase):
['lala.com', 'pepe.co.uk', 'das.biz', 'lelelel.net'])
def test_unquote_all(self):
unquote = adaptors.Unquote()
unquote = adaptors.unquote()
self.assertEqual(unquote(u'hello&copy;&amp;welcome&lt;br /&gt;&amp;', {}),
u'hello\xa9&welcome<br />&')
def test_unquote(self):
unquote = adaptors.Unquote(keep=['amp', 'lt'])
unquote = adaptors.unquote(keep=['amp', 'lt'])
self.assertEqual(unquote(u'hello&copy;&amp;welcome&lt;br /&gt;&amp;', {}),
u'hello\xa9&amp;welcome&lt;br />&amp;')
def test_remove_tags(self):
self.assertEqual(adaptors.remove_tags('<a href="lala">adsaas<br /></a>'), 'adsaas')
self.assertEqual(adaptors.remove_tags('<div id="1"><table>dsadasf</table></div>'), 'dsadasf')
self.assertEqual(adaptors.remove_tags()('<a href="lala">adsaas<br /></a>'), 'adsaas')
self.assertEqual(adaptors.remove_tags()('<div id="1"><table>dsadasf</table></div>'), 'dsadasf')
def test_remove_root(self):
self.assertEqual(adaptors.remove_root('<div>lallaa<a href="coso">dsfsdfds</a>pepepep<br /></div>'),
@ -191,7 +191,7 @@ class AdaptorsTestCase(unittest.TestCase):
[1, 2, 5, 6, 'hi'])
def test_delist(self):
delist = adaptors.Delist(' ')
delist = adaptors.delist(' ')
self.assertEqual(delist(['hi', 'there', 'fellas.', 'this', 'is', 'my', 'test.'], {}),
'hi there fellas. this is my test.')
self.assertEqual(delist(['hi', 'there', 'fellas,', 'this', 'is', 'my', 'test.'], {'join_delimiter': '.'}),

View File

@ -61,11 +61,11 @@ class RobustScrapedItemTestCase(unittest.TestCase):
def test_set_adaptors(self):
self.assertEqual(self.item._adaptors_dict, {})
delist = adaptors.Delist()
delist = adaptors.delist()
self.item.set_adaptors({'name': [adaptors.extract, delist]})
self.assertTrue(isinstance(self.item._adaptors_dict['name'], AdaptorPipe))
self.assertEqual(self.item._adaptors_dict['name'][0].name, "extract")
self.assertEqual(self.item._adaptors_dict['name'][1].name, "Delist")
self.assertEqual(self.item._adaptors_dict['name'][1].name, "delist")
self.item.set_adaptors({'description': [adaptors.extract]})
self.assertEqual(self.item._adaptors_dict['description'][0].name, "extract")
@ -78,11 +78,11 @@ class RobustScrapedItemTestCase(unittest.TestCase):
self.assertEqual(self.item._adaptors_dict['name'][0].name, "extract")
self.assertEqual(self.item._adaptors_dict['name'][1].name, "strip")
unquote = adaptors.Unquote()
unquote = adaptors.unquote()
self.item.set_attrib_adaptors('name', [adaptors.extract, unquote])
self.assertTrue(isinstance(self.item._adaptors_dict['name'], AdaptorPipe))
self.assertEqual(self.item._adaptors_dict['name'][0].name, "extract")
self.assertEqual(self.item._adaptors_dict['name'][1].name, "Unquote")
self.assertEqual(self.item._adaptors_dict['name'][1].name, "unquote")
def test_add_adaptor(self):
self.assertEqual(self.item._adaptors_dict, {})