mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-26 17:24:38 +00:00
- Updated some adaptors docstrings
- Turned Delist and Unquote adaptors into factory functions instead of classes
- Updated tests
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40765
This commit is contained in:
parent
33df3c81c2
commit
dbaf602730
@ -119,15 +119,6 @@ class RobustScrapedItem(ScrapedItem):
|
||||
else:
|
||||
raise NotImplementedError('You must override _add_single_attributes method in order to join %s values into a single value.' % attrtype.__name__)
|
||||
|
||||
def _clean_values(self, values):
|
||||
ret = []
|
||||
for val in values:
|
||||
if isinstance(val, tuple):
|
||||
ret.extend(val)
|
||||
elif val:
|
||||
ret.append(val)
|
||||
return ret
|
||||
|
||||
def attribute(self, attrname, *values, **kwargs):
|
||||
"""
|
||||
Set the provided values to the provided attribute (`attrname`) by filtering them
|
||||
@ -146,6 +137,15 @@ class RobustScrapedItem(ScrapedItem):
|
||||
The kwargs parameter is passed to the adaptors pipeline, which manages to transmit
|
||||
it to the adaptors themselves.
|
||||
"""
|
||||
def _clean_values(values):
|
||||
ret = []
|
||||
for val in values:
|
||||
if isinstance(val, tuple):
|
||||
ret.extend(val)
|
||||
elif val:
|
||||
ret.append(val)
|
||||
return ret
|
||||
|
||||
if not values:
|
||||
raise UsageError("You must specify at least one value when setting an attribute")
|
||||
if attrname not in self.ATTRIBUTES:
|
||||
@ -162,7 +162,7 @@ class RobustScrapedItem(ScrapedItem):
|
||||
multivalued = isinstance(attrtype, list)
|
||||
adaptors_pipe = self._adaptors_dict.get(attrname)
|
||||
new_values = [adaptors_pipe(value, kwargs) for value in values] if adaptors_pipe else [values]
|
||||
new_values = self._clean_values(new_values)
|
||||
new_values = _clean_values(new_values)
|
||||
|
||||
if old_value and not override:
|
||||
if multivalued:
|
||||
|
@ -1,4 +1,4 @@
|
||||
from scrapy.contrib_exp.adaptors.extraction import extract, ExtractImageLinks
|
||||
from scrapy.contrib_exp.adaptors.markup import remove_tags, remove_root, Unquote
|
||||
from scrapy.contrib_exp.adaptors.misc import to_unicode, clean_spaces, strip, drop_empty, Delist, Regex
|
||||
from scrapy.utils.python import flatten
|
||||
from scrapy.contrib_exp.adaptors.markup import remove_tags, remove_root, unquote
|
||||
from scrapy.contrib_exp.adaptors.misc import to_unicode, clean_spaces, strip, drop_empty, delist, Regex
|
||||
|
||||
|
@ -3,20 +3,25 @@ from scrapy.utils.markup import replace_tags, unquote_markup
|
||||
from scrapy.utils.python import str_to_unicode
|
||||
from scrapy.item.adaptors import adaptize
|
||||
|
||||
def remove_tags(value):
|
||||
def remove_tags(tags=()):
|
||||
"""
|
||||
Removes any tags found in each of the provided list's string.
|
||||
E.g:
|
||||
>> remove_tags('<head>my header</head>', '<body>my <b>body</b></body>')
|
||||
u'my header', u'my body'
|
||||
Factory that returns an adaptor for removing each
|
||||
tag in the `tags` parameter found in the given value.
|
||||
If no `tags` are specified, all of them are removed.
|
||||
|
||||
Input: string/unicode
|
||||
Output: unicode
|
||||
"""
|
||||
return replace_tags(value)
|
||||
def _remove_tags(value):
|
||||
return replace_tags(value, tags)
|
||||
return _remove_tags
|
||||
|
||||
_remove_root_re = re.compile(r'^\s*<.*?>(.*)</.*>\s*$', re.DOTALL)
|
||||
def remove_root(value):
|
||||
"""
|
||||
This adaptor removes the root tag of the given string/unicode,
|
||||
if it's found.
|
||||
|
||||
Input: string/unicode
|
||||
Output: unicode
|
||||
"""
|
||||
@ -25,18 +30,20 @@ def remove_root(value):
|
||||
value = m.group(1)
|
||||
return str_to_unicode(value)
|
||||
|
||||
class Unquote(object):
|
||||
def unquote(keep=None):
|
||||
"""
|
||||
Receives a list of strings, removes all of the
|
||||
CDATAs and entities (except the ones in CDATAs) the strings
|
||||
may have, and returns a new list.
|
||||
This factory returns an adaptor that
|
||||
receives a string or unicode, removes all of the
|
||||
CDATAs and entities (except the ones in CDATAs, and the ones
|
||||
you specify in the `keep` parameter) and then, returns a new
|
||||
string or unicode.
|
||||
|
||||
Input: string/unicode
|
||||
Output: string/unicode
|
||||
"""
|
||||
def __init__(self, keep=None):
|
||||
self.keep = [] if keep is None else keep
|
||||
default_keep = [] if keep is None else keep
|
||||
|
||||
def __call__(self, value, adaptor_args):
|
||||
keep = adaptor_args.get('keep_entities', self.keep)
|
||||
def unquote(value, adaptor_args):
|
||||
keep = adaptor_args.get('keep_entities', default_keep)
|
||||
return unquote_markup(value, keep=keep)
|
||||
return unquote
|
||||
|
@ -8,14 +8,15 @@ from scrapy.item.adaptors import adaptize
|
||||
|
||||
def to_unicode(value, adaptor_args):
|
||||
"""
|
||||
Receives a list of strings, converts
|
||||
it to unicode, and returns a new list.
|
||||
Receives a string and converts it to unicode
|
||||
using the given encoding (if specified, else utf-8 is used)
|
||||
and returns a new unicode object.
|
||||
E.g:
|
||||
>> to_unicode(['it costs 20€, or 30£'])
|
||||
>> to_unicode('it costs 20\xe2\x82\xac, or 30\xc2\xa3')
|
||||
u'it costs 20\u20ac, or 30\xa3'
|
||||
|
||||
Input: iterable of strings
|
||||
Output: list of unicodes
|
||||
Input: string
|
||||
Output: unicode
|
||||
"""
|
||||
if not isinstance(value, basestring):
|
||||
value = str(value)
|
||||
@ -24,14 +25,13 @@ def to_unicode(value, adaptor_args):
|
||||
_clean_spaces_re = re.compile("\s+", re.U)
|
||||
def clean_spaces(value):
|
||||
"""
|
||||
Converts multispaces into single spaces for each string
|
||||
in the provided iterable.
|
||||
Converts multispaces into single spaces for the given string.
|
||||
E.g:
|
||||
>> clean_spaces(['Hello sir'])
|
||||
[u'Hello sir']
|
||||
>> clean_spaces(u'Hello sir')
|
||||
u'Hello sir'
|
||||
|
||||
Input: iterable of unicodes
|
||||
Output: list of unicodes
|
||||
Input: string/unicode
|
||||
Output: unicode
|
||||
"""
|
||||
return _clean_spaces_re.sub(' ', str_to_unicode(value))
|
||||
|
||||
@ -61,20 +61,18 @@ def drop_empty(value):
|
||||
"""
|
||||
return [ v for v in value if v ]
|
||||
|
||||
class Delist(object):
|
||||
def delist(delimiter=''):
|
||||
"""
|
||||
Joins a list with the specified delimiter
|
||||
in the adaptor's constructor.
|
||||
This factory returns an adaptor that joins
|
||||
an iterable with the specified delimiter.
|
||||
|
||||
Input: iterable of strings/unicodes
|
||||
Output: string/unicode
|
||||
"""
|
||||
def __init__(self, delimiter=''):
|
||||
self.delimiter = delimiter
|
||||
|
||||
def __call__(self, value, adaptor_args):
|
||||
delimiter = adaptor_args.get('join_delimiter', self.delimiter)
|
||||
return delimiter.join(value)
|
||||
def delist(value, adaptor_args):
|
||||
delim = adaptor_args.get('join_delimiter', delimiter)
|
||||
return delim.join(value)
|
||||
return delist
|
||||
|
||||
class Regex(object):
|
||||
"""
|
||||
|
@ -162,18 +162,18 @@ class AdaptorsTestCase(unittest.TestCase):
|
||||
['lala.com', 'pepe.co.uk', 'das.biz', 'lelelel.net'])
|
||||
|
||||
def test_unquote_all(self):
|
||||
unquote = adaptors.Unquote()
|
||||
unquote = adaptors.unquote()
|
||||
self.assertEqual(unquote(u'hello©&welcome<br />&', {}),
|
||||
u'hello\xa9&welcome<br />&')
|
||||
|
||||
def test_unquote(self):
|
||||
unquote = adaptors.Unquote(keep=['amp', 'lt'])
|
||||
unquote = adaptors.unquote(keep=['amp', 'lt'])
|
||||
self.assertEqual(unquote(u'hello©&welcome<br />&', {}),
|
||||
u'hello\xa9&welcome<br />&')
|
||||
|
||||
def test_remove_tags(self):
|
||||
self.assertEqual(adaptors.remove_tags('<a href="lala">adsaas<br /></a>'), 'adsaas')
|
||||
self.assertEqual(adaptors.remove_tags('<div id="1"><table>dsadasf</table></div>'), 'dsadasf')
|
||||
self.assertEqual(adaptors.remove_tags()('<a href="lala">adsaas<br /></a>'), 'adsaas')
|
||||
self.assertEqual(adaptors.remove_tags()('<div id="1"><table>dsadasf</table></div>'), 'dsadasf')
|
||||
|
||||
def test_remove_root(self):
|
||||
self.assertEqual(adaptors.remove_root('<div>lallaa<a href="coso">dsfsdfds</a>pepepep<br /></div>'),
|
||||
@ -191,7 +191,7 @@ class AdaptorsTestCase(unittest.TestCase):
|
||||
[1, 2, 5, 6, 'hi'])
|
||||
|
||||
def test_delist(self):
|
||||
delist = adaptors.Delist(' ')
|
||||
delist = adaptors.delist(' ')
|
||||
self.assertEqual(delist(['hi', 'there', 'fellas.', 'this', 'is', 'my', 'test.'], {}),
|
||||
'hi there fellas. this is my test.')
|
||||
self.assertEqual(delist(['hi', 'there', 'fellas,', 'this', 'is', 'my', 'test.'], {'join_delimiter': '.'}),
|
||||
|
@ -61,11 +61,11 @@ class RobustScrapedItemTestCase(unittest.TestCase):
|
||||
def test_set_adaptors(self):
|
||||
self.assertEqual(self.item._adaptors_dict, {})
|
||||
|
||||
delist = adaptors.Delist()
|
||||
delist = adaptors.delist()
|
||||
self.item.set_adaptors({'name': [adaptors.extract, delist]})
|
||||
self.assertTrue(isinstance(self.item._adaptors_dict['name'], AdaptorPipe))
|
||||
self.assertEqual(self.item._adaptors_dict['name'][0].name, "extract")
|
||||
self.assertEqual(self.item._adaptors_dict['name'][1].name, "Delist")
|
||||
self.assertEqual(self.item._adaptors_dict['name'][1].name, "delist")
|
||||
|
||||
self.item.set_adaptors({'description': [adaptors.extract]})
|
||||
self.assertEqual(self.item._adaptors_dict['description'][0].name, "extract")
|
||||
@ -78,11 +78,11 @@ class RobustScrapedItemTestCase(unittest.TestCase):
|
||||
self.assertEqual(self.item._adaptors_dict['name'][0].name, "extract")
|
||||
self.assertEqual(self.item._adaptors_dict['name'][1].name, "strip")
|
||||
|
||||
unquote = adaptors.Unquote()
|
||||
unquote = adaptors.unquote()
|
||||
self.item.set_attrib_adaptors('name', [adaptors.extract, unquote])
|
||||
self.assertTrue(isinstance(self.item._adaptors_dict['name'], AdaptorPipe))
|
||||
self.assertEqual(self.item._adaptors_dict['name'][0].name, "extract")
|
||||
self.assertEqual(self.item._adaptors_dict['name'][1].name, "Unquote")
|
||||
self.assertEqual(self.item._adaptors_dict['name'][1].name, "unquote")
|
||||
|
||||
def test_add_adaptor(self):
|
||||
self.assertEqual(self.item._adaptors_dict, {})
|
||||
|
Loading…
x
Reference in New Issue
Block a user