diff --git a/scrapy/trunk/scrapy/contrib/spiders.py b/scrapy/trunk/scrapy/contrib/spiders.py
index a7275e76a..9e36582bc 100644
--- a/scrapy/trunk/scrapy/contrib/spiders.py
+++ b/scrapy/trunk/scrapy/contrib/spiders.py
@@ -98,7 +98,7 @@ class XMLFeedSpider(BasicSpider):
             raise NotConfigured('You must define parse_item method in order to scrape this feed')
 
         if self.iternodes:
-            nodes = xpathselector_iternodes(response, self.itertag)
+            nodes = xmliter(response, self.itertag)
         else:
             nodes = XmlXPathSelector(response).x('//%s' % self.itertag)
 
diff --git a/scrapy/trunk/scrapy/tests/sample_data/feeds/feed-sample3.csv b/scrapy/trunk/scrapy/tests/sample_data/feeds/feed-sample3.csv
new file mode 100644
index 000000000..2914433b3
--- /dev/null
+++ b/scrapy/trunk/scrapy/tests/sample_data/feeds/feed-sample3.csv
@@ -0,0 +1,6 @@
+id,name,value
+1,alpha,foobar
+2,unicode,únícódé‽
+3,multi,"foo
+bar"
+4,empty,
diff --git a/scrapy/trunk/scrapy/tests/test_utils_iterators.py b/scrapy/trunk/scrapy/tests/test_utils_iterators.py
new file mode 100644
index 000000000..3a575ce1e
--- /dev/null
+++ b/scrapy/trunk/scrapy/tests/test_utils_iterators.py
@@ -0,0 +1,117 @@
+import os
+import unittest
+
+from scrapy.utils.iterators import csviter, xmliter
+from scrapy.http import Response
+
+class UtilsXmlTestCase(unittest.TestCase):
+    ### NOTE: Encoding issues have been found with BeautifulSoup for utf-16 files, utf-16 test removed ###
+    def test_iterator(self):
+        body = """<?xml version="1.0" encoding="UTF-8"?>
+            <products>
+                <product id="001">
+                    <type>Type 1</type>
+                    <name>Name 1</name>
+                </product>
+                <product id="002">
+                    <type>Type 2</type>
+                    <name>Name 2</name>
+                </product>
+            </products>
+        """
+        response = Response(domain="example.com", url="http://example.com", body=body)
+        attrs = []
+        for x in xmliter(response, 'product'):
+            attrs.append((x.x("@id").extract(), x.x("name/text()").extract(), x.x("./type/text()").extract()))
+
+        self.assertEqual(attrs,
+                         [(['001'], ['Name 1'], ['Type 1']), (['002'], ['Name 2'], ['Type 2'])])
+
+    def test_iterator_text(self):
+        body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
+
+        self.assertEqual([x.x("text()").extract() for x in xmliter(body, 'product')],
+                         [[u'one'], [u'two']])
+
+    def test_iterator_exception(self):
+        body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
+
+        iter = xmliter(body, 'product')
+        iter.next()
+        iter.next()
+
+        self.assertRaises(StopIteration, iter.next)
+
+class UtilsCsvTestCase(unittest.TestCase):
+    sample_feed_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'sample_data', 'feeds', 'feed-sample3.csv')
+
+    def test_iterator_defaults(self):
+        body = open(self.sample_feed_path).read()
+
+        response = Response(domain="example.com", url="http://example.com/", body=body)
+        csv = csviter(response)
+
+        result = [row for row in csv]
+        self.assertEqual(result,
+                         [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
+                          {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
+                          {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
+                          {u'id': u'4', u'name': u'empty', u'value': u''}])
+
+        # explicit type check cuz' we no like stinkin' autocasting! yarrr
+        for result_row in result:
+            self.assert_(all((isinstance(k, unicode) for k in result_row.keys())))
+            self.assert_(all((isinstance(v, unicode) for v in result_row.values())))
+
+    def test_iterator_delimiter(self):
+        body = open(self.sample_feed_path).read().replace(',', '\t')
+
+        response = Response(domain="example.com", url="http://example.com/", body=body)
+        csv = csviter(response, delimiter='\t')
+
+        self.assertEqual([row for row in csv],
+                         [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
+                          {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
+                          {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
+                          {u'id': u'4', u'name': u'empty', u'value': u''}])
+
+    def test_iterator_headers(self):
+        sample = open(self.sample_feed_path).read().splitlines()
+        headers, body = sample[0].split(','), '\n'.join(sample[1:])
+
+        response = Response(domain="example.com", url="http://example.com/", body=body)
+        csv = csviter(response, headers=headers)
+
+        self.assertEqual([row for row in csv],
+                         [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
+                          {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
+                          {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
+                          {u'id': u'4', u'name': u'empty', u'value': u''}])
+
+    def test_iterator_falserow(self):
+        body = open(self.sample_feed_path).read()
+        body = '\n'.join((body, 'a,b', 'a,b,c,d'))
+
+        response = Response(domain="example.com", url="http://example.com/", body=body)
+        csv = csviter(response)
+
+        self.assertEqual([row for row in csv],
+                         [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
+                          {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
+                          {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
+                          {u'id': u'4', u'name': u'empty', u'value': u''}])
+
+    def test_iterator_exception(self):
+        body = open(self.sample_feed_path).read()
+
+        response = Response(domain="example.com", url="http://example.com/", body=body)
+        iter = csviter(response)
+        iter.next()
+        iter.next()
+        iter.next()
+        iter.next()
+
+        self.assertRaises(StopIteration, iter.next)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/scrapy/trunk/scrapy/tests/test_utils_xml.py b/scrapy/trunk/scrapy/tests/test_utils_xml.py
deleted file mode 100644
index fd59a446b..000000000
--- a/scrapy/trunk/scrapy/tests/test_utils_xml.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import os
-import unittest
-
-from scrapy.utils.xml import xpathselector_iternodes
-from scrapy.http import Response
-
-class UtilsXmlTestCase(unittest.TestCase):
-
-    def test_iterator(self):
-        body = """<?xml version="1.0" encoding="UTF-8"?>
-            <products>
-                <product id="001">
-                    <type>Type 1</type>
-                    <name>Name 1</name>
-                </product>
-                <product id="002">
-                    <type>Type 2</type>
-                    <name>Name 2</name>
-                </product>
-            </products>
-        """
-        response = Response(domain="example.com", url="http://example.com", body=body)
-        attrs = []
-        for x in xpathselector_iternodes(response, 'product'):
-            attrs.append((x.x("@id").extract(), x.x("name/text()").extract(), x.x("./type/text()").extract()))
-
-        self.assertEqual(attrs,
-                         [(['001'], ['Name 1'], ['Type 1']), (['002'], ['Name 2'], ['Type 2'])])
-
-    def test_iterator_text(self):
-        body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
-
-        self.assertEqual([x.x("text()").extract() for x in xpathselector_iternodes(body, 'product')],
-                         [[u'one'], [u'two']])
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/scrapy/trunk/scrapy/utils/iterators.py b/scrapy/trunk/scrapy/utils/iterators.py
index 5f989d0a3..6bb1854ea 100644
--- a/scrapy/trunk/scrapy/utils/iterators.py
+++ b/scrapy/trunk/scrapy/utils/iterators.py
@@ -1,23 +1,52 @@
-import csv
+import re, csv
+
+from scrapy.xpath import XmlXPathSelector
+from scrapy.http import Response
 
 from scrapy import log
 
-def csv_iter(response, delimiter=None, headers=None):
-    if delimiter:
-        csv_r = csv.reader(response.body.to_unicode().split('\n'), delimiter=delimiter)
+def _normalize_input(obj):
+    assert isinstance(obj, (Response, basestring)), "obj must be Response or basestring, not %s" % type(obj).__name__
+    if isinstance(obj, Response):
+        return obj.body.to_unicode()
+    elif isinstance(obj, str):
+        return obj.decode('utf-8')
     else:
-        csv_r = csv.reader(response.body.to_unicode().split('\n'))
+        return obj
+
+def xmliter(obj, nodename):
+    """Return an iterator of XPathSelector objects over all nodes of an XML document,
+    given the name of the node to iterate on. Useful for parsing XML feeds.
+
+    obj can be:
+    - a Response object
+    - a unicode string
+    - a string encoded as utf-8
+    """
+    text = _normalize_input(obj)
+
+    r = re.compile(r"<%s[\s>].*?</%s>" % (nodename, nodename), re.DOTALL)
+    for match in r.finditer(text):
+        nodetext = match.group()
+        yield XmlXPathSelector(text=nodetext).x('/' + nodename)[0]
+
+def csviter(obj, delimiter=None, headers=None):
+    def _getrow(csv_r):
+        return [field.decode() for field in csv_r.next()]
+
+    lines = _normalize_input(obj).splitlines(True)
+    if delimiter:
+        csv_r = csv.reader(lines, delimiter=delimiter)
+    else:
+        csv_r = csv.reader(lines)
 
     if not headers:
-        headers = csv_r.next()
+        headers = _getrow(csv_r)
 
     while True:
-        node = csv_r.next()
-
-        if len(node) != len(headers):
-            log.msg("ignoring node %d (length: %d, should be: %d)" % (csv_r.line_num, len(node), len(headers)), log.WARNING)
+        row = _getrow(csv_r)
+        if len(row) != len(headers):
+            log.msg("ignoring row %d (length: %d, should be: %d)" % (csv_r.line_num, len(row), len(headers)), log.WARNING)
             continue
-
-        yield dict(zip(headers, node))
+        else:
+            yield dict(zip(headers, row))
diff --git a/scrapy/trunk/scrapy/utils/xml.py b/scrapy/trunk/scrapy/utils/xml.py
deleted file mode 100644
index acfa52363..000000000
--- a/scrapy/trunk/scrapy/utils/xml.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import re
-
-from scrapy.xpath import XmlXPathSelector
-from scrapy.http import Response
-
-def xpathselector_iternodes(obj, nodename):
-    """Return a iterator of XPathSelector's over all nodes of a XML document,
-    given tha name of the node to iterate. Useful for parsing XML feeds.
-
-    obj can be:
-    - a Response object
-    - a unicode string
-    - a string encoded as utf-8
-    """
-
-    assert isinstance(obj, (Response, basestring)), "obj must be Response or basestring, not %s" % type(obj).__name__
-
-    if isinstance(obj, Response):
-        text = obj.body.to_unicode()
-    elif isinstance(obj, str):
-        text = obj.decode('utf-8')
-    else:
-        text = obj
-
-    r = re.compile(r"<%s[\s>].*?</%s>" % (nodename, nodename), re.DOTALL)
-    for match in r.finditer(text):
-        nodetext = match.group()
-        yield XmlXPathSelector(text=nodetext).x('/' + nodename)[0]
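
Note for reviewers: the node slicing done by xmliter comes down to the regex built in the patch above. A minimal standalone sketch (not part of the patch, plain stdlib, feed string invented for illustration) for nodename = 'product':

# Demonstrates the pattern xmliter compiles: "<product" followed by whitespace
# or ">", a non-greedy body, then the matching "</product>" closing tag.
import re

feed = '<products><product id="001">one</product><product id="002">two</product></products>'

pattern = re.compile(r"<%s[\s>].*?</%s>" % ('product', 'product'), re.DOTALL)
chunks = [m.group() for m in pattern.finditer(feed)]

# The enclosing <products> element is skipped because 's' matches neither \s nor '>'.
assert chunks == ['<product id="001">one</product>',
                  '<product id="002">two</product>']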
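
And a rough usage sketch of the two new iterators, mirroring the calls made in the tests above; the Response constructor and the selector .x()/.extract() calls are taken from those tests, while the feed bodies and the expected values in the comments are illustrative only:

# Hypothetical feed-parsing snippet using the trunk-era API exercised by the tests.
from scrapy.http import Response
from scrapy.utils.iterators import xmliter, csviter

xml_body = """<products>
  <product id="001"><name>Name 1</name></product>
  <product id="002"><name>Name 2</name></product>
</products>"""
response = Response(domain="example.com", url="http://example.com/", body=xml_body)

# xmliter yields one XmlXPathSelector per <product> node
names = [node.x("name/text()").extract() for node in xmliter(response, 'product')]
# names == [[u'Name 1'], [u'Name 2']]

csv_body = "id,name\n1,alpha\n2,beta\n"
response = Response(domain="example.com", url="http://example.com/", body=csv_body)

# csviter yields one dict per row, keyed by the header row (or an explicit
# headers= argument), skipping rows whose length does not match the headers
rows = [row for row in csviter(response)]
# rows == [{u'id': u'1', u'name': u'alpha'}, {u'id': u'2', u'name': u'beta'}]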