. Updated some adaptors docstrings

. Turned Delist and Unquote adaptors into factory functions instead of classes . Updated tests --HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40765
2025-02-26 17:24:38 +00:00 · 2009-01-26 01:15:05 +00:00 · 2009-01-26 01:15:05 +00:00 · dbaf602730
commit dbaf602730
parent 33df3c81c2
6 changed files with 61 additions and 56 deletions
--- a/scrapy/trunk/scrapy/contrib/item/models.py
+++ b/scrapy/trunk/scrapy/contrib/item/models.py
@ -119,15 +119,6 @@ class RobustScrapedItem(ScrapedItem):
        else:
           raise NotImplementedError('You must override _add_single_attributes method in order to join %s values into a single value.' % attrtype.__name__)

-    def _clean_values(self, values):
-        ret = []
-        for val in values:
-            if isinstance(val, tuple):
-                ret.extend(val)
-            elif val:
-                ret.append(val)
-        return ret
-
    def attribute(self, attrname, *values, **kwargs):
        """
        Set the provided values to the provided attribute (`attrname`) by filtering them
@ -146,6 +137,15 @@ class RobustScrapedItem(ScrapedItem):
        The kwargs parameter is passed to the adaptors pipeline, which manages to transmit
        it to the adaptors themselves.
        """
+        def _clean_values(values):
+            ret = []
+            for val in values:
+                if isinstance(val, tuple):
+                    ret.extend(val)
+                elif val:
+                    ret.append(val)
+            return ret
+
        if not values:
            raise UsageError("You must specify at least one value when setting an attribute")
        if attrname not in self.ATTRIBUTES:
@ -162,7 +162,7 @@ class RobustScrapedItem(ScrapedItem):
        multivalued = isinstance(attrtype, list)
        adaptors_pipe = self._adaptors_dict.get(attrname)
        new_values = [adaptors_pipe(value, kwargs) for value in values] if adaptors_pipe else [values]
-        new_values = self._clean_values(new_values)
+        new_values = _clean_values(new_values)

        if old_value and not override:
            if multivalued:
--- a/scrapy/trunk/scrapy/contrib_exp/adaptors/init.py
+++ b/scrapy/trunk/scrapy/contrib_exp/adaptors/init.py
@ -1,4 +1,4 @@
 from scrapy.contrib_exp.adaptors.extraction import extract, ExtractImageLinks
-from scrapy.contrib_exp.adaptors.markup import remove_tags, remove_root, Unquote
-from scrapy.contrib_exp.adaptors.misc import to_unicode, clean_spaces, strip, drop_empty, Delist, Regex
-from scrapy.utils.python import flatten
+from scrapy.contrib_exp.adaptors.markup import remove_tags, remove_root, unquote
+from scrapy.contrib_exp.adaptors.misc import to_unicode, clean_spaces, strip, drop_empty, delist, Regex
+
--- a/scrapy/trunk/scrapy/contrib_exp/adaptors/markup.py
+++ b/scrapy/trunk/scrapy/contrib_exp/adaptors/markup.py
@ -3,20 +3,25 @@ from scrapy.utils.markup import replace_tags, unquote_markup
 from scrapy.utils.python import str_to_unicode
 from scrapy.item.adaptors import adaptize

-def remove_tags(value):
+def remove_tags(tags=()):
    """
-    Removes any tags found in each of the provided list's string.
-    E.g:
-      >> remove_tags('<head>my header</head>', '<body>my <b>body</b></body>')
-      u'my header', u'my body'
+    Factory that returns an adaptor for removing each
+    tag in the `tags` parameter found in the given value.
+    If no `tags` are specified, all of them are removed.
+
    Input: string/unicode
    Output: unicode
    """
-    return replace_tags(value)
+    def _remove_tags(value):
+        return replace_tags(value, tags)
+    return _remove_tags

 _remove_root_re = re.compile(r'^\s*<.*?>(.*)</.*>\s*$', re.DOTALL)
 def remove_root(value):
    """
+    This adaptor removes the root tag of the given string/unicode,
+    if it's found.
+
    Input: string/unicode
    Output: unicode
    """
@ -25,18 +30,20 @@ def remove_root(value):
        value = m.group(1)
    return str_to_unicode(value)

-class Unquote(object):
+def unquote(keep=None):
    """
-    Receives a list of strings, removes all of the
-    CDATAs and entities (except the ones in CDATAs) the strings
-    may have, and returns a new list.
+    This factory returns an adaptor that
+    receives a string or unicode, removes all of the
+    CDATAs and entities (except the ones in CDATAs, and the ones
+    you specify in the `keep` parameter) and then returns a new
+    string or unicode.

    Input: string/unicode
    Output: string/unicode
    """
-    def __init__(self, keep=None):
-        self.keep = [] if keep is None else keep
+    default_keep = [] if keep is None else keep

-    def __call__(self, value, adaptor_args):
-        keep = adaptor_args.get('keep_entities', self.keep)
+    def unquote(value, adaptor_args):
+        keep = adaptor_args.get('keep_entities', default_keep)
        return unquote_markup(value, keep=keep)
+    return unquote
--- a/scrapy/trunk/scrapy/contrib_exp/adaptors/misc.py
+++ b/scrapy/trunk/scrapy/contrib_exp/adaptors/misc.py
@ -8,14 +8,15 @@ from scrapy.item.adaptors import adaptize

 def to_unicode(value, adaptor_args):
    """
-    Receives a list of strings, converts
-    it to unicode, and returns a new list.
+    Receives a string and converts it to unicode
+    using the given encoding (if specified, else utf-8 is used)
+    and returns a new unicode object.
    E.g:
-      >> to_unicode(['it costs 20€, or 30£'])
+      >> to_unicode('it costs 20\xe2\x82\xac, or 30\xc2\xa3')
        [u'it costs 20\u20ac, or 30\xa3']

-    Input: iterable of strings
-    Output: list of unicodes
+    Input: string
+    Output: unicode
    """
    if not isinstance(value, basestring):
        value = str(value)
@ -24,14 +25,13 @@ def to_unicode(value, adaptor_args):
 _clean_spaces_re = re.compile("\s+", re.U)
 def clean_spaces(value):
    """
-    Converts multispaces into single spaces for each string
-    in the provided iterable.
+    Converts multispaces into single spaces for the given string.
    E.g:
-      >> clean_spaces(['Hello   sir'])
-      [u'Hello sir']
+      >> clean_spaces(u'Hello   sir')
+      u'Hello sir'

-    Input: iterable of unicodes
-    Output: list of unicodes
+    Input: string/unicode
+    Output: unicode
    """
    return _clean_spaces_re.sub(' ', str_to_unicode(value))

@ -61,20 +61,18 @@ def drop_empty(value):
    """
    return [ v for v in value if v ]

-class Delist(object):
+def delist(delimiter=''):
    """
-    Joins a list with the specified delimiter
-    in the adaptor's constructor.
+    This factory returns an adaptor that joins
+    an iterable with the specified delimiter.

    Input: iterable of strings/unicodes
    Output: string/unicode
    """
-    def __init__(self, delimiter=''):
-        self.delimiter = delimiter
-
-    def __call__(self, value, adaptor_args):
-        delimiter = adaptor_args.get('join_delimiter', self.delimiter)
-        return delimiter.join(value)
+    def delist(value, adaptor_args):
+        delim = adaptor_args.get('join_delimiter', delimiter)
+        return delim.join(value)
+    return delist

 class Regex(object):
    """
--- a/scrapy/trunk/scrapy/tests/test_adaptors.py
+++ b/scrapy/trunk/scrapy/tests/test_adaptors.py
@ -162,18 +162,18 @@ class AdaptorsTestCase(unittest.TestCase):
                                  ['lala.com', 'pepe.co.uk', 'das.biz', 'lelelel.net'])

    def test_unquote_all(self):
-        unquote = adaptors.Unquote()
+        unquote = adaptors.unquote()
        self.assertEqual(unquote(u'hello&copy;&amp;welcome&lt;br /&gt;&amp;', {}),
            u'hello\xa9&welcome<br />&')

    def test_unquote(self):
-        unquote = adaptors.Unquote(keep=['amp', 'lt'])
+        unquote = adaptors.unquote(keep=['amp', 'lt'])
        self.assertEqual(unquote(u'hello&copy;&amp;welcome&lt;br /&gt;&amp;', {}),
            u'hello\xa9&amp;welcome&lt;br />&amp;')

    def test_remove_tags(self):
-        self.assertEqual(adaptors.remove_tags('<a href="lala">adsaas<br /></a>'), 'adsaas')
-        self.assertEqual(adaptors.remove_tags('<div id="1"><table>dsadasf</table></div>'), 'dsadasf')
+        self.assertEqual(adaptors.remove_tags()('<a href="lala">adsaas<br /></a>'), 'adsaas')
+        self.assertEqual(adaptors.remove_tags()('<div id="1"><table>dsadasf</table></div>'), 'dsadasf')

    def test_remove_root(self):
        self.assertEqual(adaptors.remove_root('<div>lallaa<a href="coso">dsfsdfds</a>pepepep<br /></div>'),
@ -191,7 +191,7 @@ class AdaptorsTestCase(unittest.TestCase):
            [1, 2, 5, 6, 'hi'])

    def test_delist(self):
-        delist = adaptors.Delist(' ')
+        delist = adaptors.delist(' ')
        self.assertEqual(delist(['hi', 'there', 'fellas.', 'this', 'is', 'my', 'test.'], {}),
            'hi there fellas. this is my test.')
        self.assertEqual(delist(['hi', 'there', 'fellas,', 'this', 'is', 'my', 'test.'], {'join_delimiter': '.'}),
--- a/scrapy/trunk/scrapy/tests/test_robustscrapeditem.py
+++ b/scrapy/trunk/scrapy/tests/test_robustscrapeditem.py
@ -61,11 +61,11 @@ class RobustScrapedItemTestCase(unittest.TestCase):
    def test_set_adaptors(self):
        self.assertEqual(self.item._adaptors_dict, {})

-        delist = adaptors.Delist()
+        delist = adaptors.delist()
        self.item.set_adaptors({'name': [adaptors.extract, delist]})
        self.assertTrue(isinstance(self.item._adaptors_dict['name'], AdaptorPipe))
        self.assertEqual(self.item._adaptors_dict['name'][0].name, "extract")
-        self.assertEqual(self.item._adaptors_dict['name'][1].name, "Delist")
+        self.assertEqual(self.item._adaptors_dict['name'][1].name, "delist")

        self.item.set_adaptors({'description': [adaptors.extract]})
        self.assertEqual(self.item._adaptors_dict['description'][0].name, "extract")
@ -78,11 +78,11 @@ class RobustScrapedItemTestCase(unittest.TestCase):
        self.assertEqual(self.item._adaptors_dict['name'][0].name, "extract")
        self.assertEqual(self.item._adaptors_dict['name'][1].name, "strip")

-        unquote = adaptors.Unquote()
+        unquote = adaptors.unquote()
        self.item.set_attrib_adaptors('name', [adaptors.extract, unquote])
        self.assertTrue(isinstance(self.item._adaptors_dict['name'], AdaptorPipe))
        self.assertEqual(self.item._adaptors_dict['name'][0].name, "extract")
-        self.assertEqual(self.item._adaptors_dict['name'][1].name, "Unquote")
+        self.assertEqual(self.item._adaptors_dict['name'][1].name, "unquote")

    def test_add_adaptor(self):
        self.assertEqual(self.item._adaptors_dict, {})