Mirror of https://github.com/scrapy/scrapy.git, synced 2025-02-23 17:43:57 +00:00
moved xpathselector_iternodes from scrapy.utils.xml to scrapy.utils.iterators and renamed it to "xmliter", also renamed csv_iter to csviter and added tests
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40360
This commit is contained in:
parent 1ef65b97b5
commit 9b46c20da2
@@ -98,7 +98,7 @@ class XMLFeedSpider(BasicSpider):
             raise NotConfigured('You must define parse_item method in order to scrape this feed')
 
         if self.iternodes:
-            nodes = xpathselector_iternodes(response, self.itertag)
+            nodes = xmliter(response, self.itertag)
         else:
             nodes = XmlXPathSelector(response).x('//%s' % self.itertag)
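Both branches must hand the spider the same thing: one XPathSelector per itertag node. A minimal sketch of that equivalence (not part of this commit; it assumes the trunk-era scrapy.xpath API used elsewhere in this diff):

    from scrapy.utils.iterators import xmliter
    from scrapy.xpath import XmlXPathSelector

    body = u'<products><product>one</product><product>two</product></products>'

    # iternodes=True: regex-based xmliter walks the raw text node by node,
    # without parsing the whole document first.
    lazy = [sel.x('text()').extract() for sel in xmliter(body, 'product')]

    # iternodes=False: build a full document selector, then apply XPath.
    dom = [sel.x('text()').extract() for sel in XmlXPathSelector(text=body).x('//product')]

    assert lazy == dom == [[u'one'], [u'two']]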
scrapy/trunk/scrapy/tests/sample_data/feeds/feed-sample3.csv (new file, 6 lines; path inferred from sample_feed_path in the tests below)
@@ -0,0 +1,6 @@
+id,name,value
+1,alpha,foobar
+2,unicode,únícódé‽
+3,multi,"foo
+bar"
+4,empty,
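The sample packs the edge cases into four records: a plain row, a non-ASCII row, a quoted field with an embedded newline (six physical lines, four data rows), and an empty trailing field. A stdlib-only check of the tricky record (a sketch, independent of this commit's code):

    import csv
    from StringIO import StringIO

    sample = 'id,name,value\n1,alpha,foobar\n3,multi,"foo\nbar"\n4,empty,\n'
    rows = list(csv.reader(StringIO(sample)))

    # The quoted field keeps its embedded newline; the empty field is ''.
    assert rows[2] == ['3', 'multi', 'foo\nbar']
    assert rows[3] == ['4', 'empty', '']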
scrapy/trunk/scrapy/tests/test_utils_iterators.py (new file, 117 lines)
@@ -0,0 +1,117 @@
+import os
+import unittest
+
+from scrapy.utils.iterators import csviter, xmliter
+from scrapy.http import Response
+
+class UtilsXmlTestCase(unittest.TestCase):
+    ### NOTE: Encoding issues have been found with BeautifulSoup for utf-16 files, utf-16 test removed ###
+    def test_iterator(self):
+        body = """<?xml version="1.0" encoding="UTF-8"?>
+            <products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="someschmea.xsd">
+              <product id="001">
+                <type>Type 1</type>
+                <name>Name 1</name>
+              </product>
+              <product id="002">
+                <type>Type 2</type>
+                <name>Name 2</name>
+              </product>
+            </products>
+        """
+        response = Response(domain="example.com", url="http://example.com", body=body)
+        attrs = []
+        for x in xmliter(response, 'product'):
+            attrs.append((x.x("@id").extract(), x.x("name/text()").extract(), x.x("./type/text()").extract()))
+
+        self.assertEqual(attrs,
+                         [(['001'], ['Name 1'], ['Type 1']), (['002'], ['Name 2'], ['Type 2'])])
+
+    def test_iterator_text(self):
+        body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
+
+        self.assertEqual([x.x("text()").extract() for x in xmliter(body, 'product')],
+                         [[u'one'], [u'two']])
+
+    def test_iterator_exception(self):
+        body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
+
+        iter = xmliter(body, 'product')
+        iter.next()
+        iter.next()
+
+        self.assertRaises(StopIteration, iter.next)
+
+class UtilsCsvTestCase(unittest.TestCase):
+    sample_feed_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'sample_data', 'feeds', 'feed-sample3.csv')
+
+    def test_iterator_defaults(self):
+        body = open(self.sample_feed_path).read()
+
+        response = Response(domain="example.com", url="http://example.com/", body=body)
+        csv = csviter(response)
+
+        result = [row for row in csv]
+        self.assertEqual(result,
+                         [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
+                          {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
+                          {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
+                          {u'id': u'4', u'name': u'empty', u'value': u''}])
+
+        # explicit type check cuz' we no like stinkin' autocasting! yarrr
+        for result_row in result:
+            self.assert_(all((isinstance(k, unicode) for k in result_row.keys())))
+            self.assert_(all((isinstance(v, unicode) for v in result_row.values())))
+
+    def test_iterator_delimiter(self):
+        body = open(self.sample_feed_path).read().replace(',', '\t')
+
+        response = Response(domain="example.com", url="http://example.com/", body=body)
+        csv = csviter(response, delimiter='\t')
+
+        self.assertEqual([row for row in csv],
+                         [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
+                          {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
+                          {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
+                          {u'id': u'4', u'name': u'empty', u'value': u''}])
+
+    def test_iterator_headers(self):
+        sample = open(self.sample_feed_path).read().splitlines()
+        headers, body = sample[0].split(','), '\n'.join(sample[1:])
+
+        response = Response(domain="example.com", url="http://example.com/", body=body)
+        csv = csviter(response, headers=headers)
+
+        self.assertEqual([row for row in csv],
+                         [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
+                          {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
+                          {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
+                          {u'id': u'4', u'name': u'empty', u'value': u''}])
+
+    def test_iterator_falserow(self):
+        body = open(self.sample_feed_path).read()
+        body = '\n'.join((body, 'a,b', 'a,b,c,d'))
+
+        response = Response(domain="example.com", url="http://example.com/", body=body)
+        csv = csviter(response)
+
+        self.assertEqual([row for row in csv],
+                         [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
+                          {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
+                          {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
+                          {u'id': u'4', u'name': u'empty', u'value': u''}])
+
+    def test_iterator_exception(self):
+        body = open(self.sample_feed_path).read()
+
+        response = Response(domain="example.com", url="http://example.com/", body=body)
+        iter = csviter(response)
+        iter.next()
+        iter.next()
+        iter.next()
+        iter.next()
+
+        self.assertRaises(StopIteration, iter.next)
+
+if __name__ == "__main__":
+    unittest.main()
Deleted file (37 lines): the previous tests for xpathselector_iternodes
@@ -1,37 +0,0 @@
-import os
-import unittest
-
-from scrapy.utils.xml import xpathselector_iternodes
-from scrapy.http import Response
-
-class UtilsXmlTestCase(unittest.TestCase):
-
-    def test_iterator(self):
-        body = """<?xml version="1.0" encoding="UTF-8"?>
-            <products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="someschmea.xsd">
-              <product id="001">
-                <type>Type 1</type>
-                <name>Name 1</name>
-              </product>
-              <product id="002">
-                <type>Type 2</type>
-                <name>Name 2</name>
-              </product>
-            </products>
-        """
-        response = Response(domain="example.com", url="http://example.com", body=body)
-        attrs = []
-        for x in xpathselector_iternodes(response, 'product'):
-            attrs.append((x.x("@id").extract(), x.x("name/text()").extract(), x.x("./type/text()").extract()))
-
-        self.assertEqual(attrs,
-                         [(['001'], ['Name 1'], ['Type 1']), (['002'], ['Name 2'], ['Type 2'])])
-
-    def test_iterator_text(self):
-        body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
-
-        self.assertEqual([x.x("text()").extract() for x in xpathselector_iternodes(body, 'product')],
-                         [[u'one'], [u'two']])
-
-if __name__ == "__main__":
-    unittest.main()
scrapy/trunk/scrapy/utils/iterators.py (the module named in the commit message)
@@ -1,23 +1,52 @@
-import csv
+import re, csv
 
+from scrapy.xpath import XmlXPathSelector
 from scrapy.http import Response
 from scrapy import log
 
-def csv_iter(response, delimiter=None, headers=None):
-    if delimiter:
-        csv_r = csv.reader(response.body.to_unicode().split('\n'), delimiter=delimiter)
-    else:
-        csv_r = csv.reader(response.body.to_unicode().split('\n'))
+def _normalize_input(obj):
+    assert isinstance(obj, (Response, basestring)), "obj must be Response or basestring, not %s" % type(obj).__name__
+    if isinstance(obj, Response):
+        return obj.body.to_unicode()
+    elif isinstance(obj, str):
+        return obj.decode('utf-8')
+    else:
+        return obj
+
+def xmliter(obj, nodename):
+    """Return an iterator of XPathSelectors over all nodes of an XML document,
+    given the name of the node to iterate over. Useful for parsing XML feeds.
+
+    obj can be:
+    - a Response object
+    - a unicode string
+    - a string encoded as utf-8
+    """
+    text = _normalize_input(obj)
+
+    r = re.compile(r"<%s[\s>].*?</%s>" % (nodename, nodename), re.DOTALL)
+    for match in r.finditer(text):
+        nodetext = match.group()
+        yield XmlXPathSelector(text=nodetext).x('/' + nodename)[0]
+
+def csviter(obj, delimiter=None, headers=None):
+    def _getrow(csv_r):
+        return [field.decode() for field in csv_r.next()]
+
+    lines = _normalize_input(obj).splitlines(True)
+    if delimiter:
+        csv_r = csv.reader(lines, delimiter=delimiter)
+    else:
+        csv_r = csv.reader(lines)
 
     if not headers:
-        headers = csv_r.next()
+        headers = _getrow(csv_r)
 
     while True:
-        node = csv_r.next()
-        if len(node) != len(headers):
-            log.msg("ignoring node %d (length: %d, should be: %d)" % (csv_r.line_num, len(node), len(headers)), log.WARNING)
+        row = _getrow(csv_r)
+        if len(row) != len(headers):
+            log.msg("ignoring row %d (length: %d, should be: %d)" % (csv_r.line_num, len(row), len(headers)), log.WARNING)
             continue
         else:
-            yield dict(zip(headers, node))
+            yield dict(zip(headers, row))
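Both iterators now route through _normalize_input, so each accepts a Response or a plain string and always iterates over unicode text. A short usage sketch (not part of the diff; the Response constructor arguments follow the tests above):

    from scrapy.http import Response
    from scrapy.utils.iterators import xmliter, csviter

    # xmliter parses one matching node at a time instead of building a DOM
    # for the whole feed up front.
    feed = u'<products><product><name>a</name></product><product><name>b</name></product></products>'
    names = [sel.x('name/text()').extract() for sel in xmliter(feed, 'product')]
    # names == [[u'a'], [u'b']]

    # csviter takes the first row as dict keys unless headers= is given.
    # splitlines(True) keeps line endings, which is what lets csv.reader
    # reassemble the quoted field that spans two physical lines here.
    response = Response(domain="example.com", url="http://example.com/",
                        body='id,value\n1,"multi\nline"\n')
    rows = list(csviter(response))
    # rows == [{u'id': u'1', u'value': u'multi\nline'}]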
scrapy/trunk/scrapy/utils/xml.py (deleted file; path per the commit message)
@@ -1,28 +0,0 @@
-import re
-
-from scrapy.xpath import XmlXPathSelector
-from scrapy.http import Response
-
-def xpathselector_iternodes(obj, nodename):
-    """Return an iterator of XPathSelectors over all nodes of an XML document,
-    given the name of the node to iterate over. Useful for parsing XML feeds.
-
-    obj can be:
-    - a Response object
-    - a unicode string
-    - a string encoded as utf-8
-    """
-    assert isinstance(obj, (Response, basestring)), "obj must be Response or basestring, not %s" % type(obj).__name__
-
-    if isinstance(obj, Response):
-        text = obj.body.to_unicode()
-    elif isinstance(obj, str):
-        text = obj.decode('utf-8')
-    else:
-        text = obj
-
-    r = re.compile(r"<%s[\s>].*?</%s>" % (nodename, nodename), re.DOTALL)
-    for match in r.finditer(text):
-        nodetext = match.group()
-        yield XmlXPathSelector(text=nodetext).x('/' + nodename)[0]