Mirror of https://github.com/scrapy/scrapy.git, synced 2025-02-23 17:43:57 +00:00
moved xpathselector_iternodes from scrapy.utils.xml to scrapy.utils.iterators and renamed it to "xmliter", also renamed csv_iter to csviter and added tests
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40360
This commit is contained in:
parent 1ef65b97b5
commit 9b46c20da2
@@ -98,7 +98,7 @@ class XMLFeedSpider(BasicSpider):
             raise NotConfigured('You must define parse_item method in order to scrape this feed')
 
         if self.iternodes:
-            nodes = xpathselector_iternodes(response, self.itertag)
+            nodes = xmliter(response, self.itertag)
         else:
             nodes = XmlXPathSelector(response).x('//%s' % self.itertag)
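Both branches must hand the spider the same thing: one XPathSelector per itertag node. A minimal sketch of that equivalence (not part of this commit; it assumes the trunk-era scrapy.xpath API used elsewhere in this diff):

    from scrapy.utils.iterators import xmliter
    from scrapy.xpath import XmlXPathSelector

    body = u'<products><product>one</product><product>two</product></products>'

    # iternodes=True: regex-based xmliter walks the raw text node by node,
    # without parsing the whole document first.
    lazy = [sel.x('text()').extract() for sel in xmliter(body, 'product')]

    # iternodes=False: build a full document selector, then apply XPath.
    dom = [sel.x('text()').extract() for sel in XmlXPathSelector(text=body).x('//product')]

    assert lazy == dom == [[u'one'], [u'two']]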
scrapy/trunk/scrapy/tests/sample_data/feeds/feed-sample3.csv (new file, 6 lines; path inferred from sample_feed_path in the tests below)
@@ -0,0 +1,6 @@
+id,name,value
+1,alpha,foobar
+2,unicode,únícódé‽
+3,multi,"foo
+bar"
+4,empty,
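The sample packs the edge cases into four records: a plain row, a non-ASCII row, a quoted field with an embedded newline (six physical lines, four data rows), and an empty trailing field. A stdlib-only check of the tricky record (a sketch, independent of this commit's code):

    import csv
    from StringIO import StringIO

    sample = 'id,name,value\n1,alpha,foobar\n3,multi,"foo\nbar"\n4,empty,\n'
    rows = list(csv.reader(StringIO(sample)))

    # The quoted field keeps its embedded newline; the empty field is ''.
    assert rows[2] == ['3', 'multi', 'foo\nbar']
    assert rows[3] == ['4', 'empty', '']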
scrapy/trunk/scrapy/tests/test_utils_iterators.py (new file, 117 lines)
@@ -0,0 +1,117 @@
+import os
+import unittest
+
+from scrapy.utils.iterators import csviter, xmliter
+from scrapy.http import Response
+
+class UtilsXmlTestCase(unittest.TestCase):
+    ### NOTE: Encoding issues have been found with BeautifulSoup for utf-16 files, utf-16 test removed ###
+    def test_iterator(self):
+        body = """<?xml version="1.0" encoding="UTF-8"?>
+            <products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="someschmea.xsd">
+              <product id="001">
+                <type>Type 1</type>
+                <name>Name 1</name>
+              </product>
+              <product id="002">
+                <type>Type 2</type>
+                <name>Name 2</name>
+              </product>
+            </products>
+        """
+        response = Response(domain="example.com", url="http://example.com", body=body)
+        attrs = []
+        for x in xmliter(response, 'product'):
+            attrs.append((x.x("@id").extract(), x.x("name/text()").extract(), x.x("./type/text()").extract()))
+
+        self.assertEqual(attrs,
+                         [(['001'], ['Name 1'], ['Type 1']), (['002'], ['Name 2'], ['Type 2'])])
+
+    def test_iterator_text(self):
+        body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
+
+        self.assertEqual([x.x("text()").extract() for x in xmliter(body, 'product')],
+                         [[u'one'], [u'two']])
+
+    def test_iterator_exception(self):
+        body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
+
+        iter = xmliter(body, 'product')
+        iter.next()
+        iter.next()
+
+        self.assertRaises(StopIteration, iter.next)
+
+class UtilsCsvTestCase(unittest.TestCase):
+    sample_feed_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'sample_data', 'feeds', 'feed-sample3.csv')
+
+    def test_iterator_defaults(self):
+        body = open(self.sample_feed_path).read()
+
+        response = Response(domain="example.com", url="http://example.com/", body=body)
+        csv = csviter(response)
+
+        result = [row for row in csv]
+        self.assertEqual(result,
+                         [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
+                          {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
+                          {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
+                          {u'id': u'4', u'name': u'empty', u'value': u''}])
+
+        # explicit type check cuz' we no like stinkin' autocasting! yarrr
+        for result_row in result:
+            self.assert_(all((isinstance(k, unicode) for k in result_row.keys())))
+            self.assert_(all((isinstance(v, unicode) for v in result_row.values())))
+
+    def test_iterator_delimiter(self):
+        body = open(self.sample_feed_path).read().replace(',', '\t')
+
+        response = Response(domain="example.com", url="http://example.com/", body=body)
+        csv = csviter(response, delimiter='\t')
+
+        self.assertEqual([row for row in csv],
+                         [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
+                          {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
+                          {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
+                          {u'id': u'4', u'name': u'empty', u'value': u''}])
+
+    def test_iterator_headers(self):
+        sample = open(self.sample_feed_path).read().splitlines()
+        headers, body = sample[0].split(','), '\n'.join(sample[1:])
+
+        response = Response(domain="example.com", url="http://example.com/", body=body)
+        csv = csviter(response, headers=headers)
+
+        self.assertEqual([row for row in csv],
+                         [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
+                          {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
+                          {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
+                          {u'id': u'4', u'name': u'empty', u'value': u''}])
+
+    def test_iterator_falserow(self):
+        body = open(self.sample_feed_path).read()
+        body = '\n'.join((body, 'a,b', 'a,b,c,d'))
+
+        response = Response(domain="example.com", url="http://example.com/", body=body)
+        csv = csviter(response)
+
+        self.assertEqual([row for row in csv],
+                         [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
+                          {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
+                          {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
+                          {u'id': u'4', u'name': u'empty', u'value': u''}])
+
+    def test_iterator_exception(self):
+        body = open(self.sample_feed_path).read()
+
+        response = Response(domain="example.com", url="http://example.com/", body=body)
+        iter = csviter(response)
+        iter.next()
+        iter.next()
+        iter.next()
+        iter.next()
+
+        self.assertRaises(StopIteration, iter.next)
+
+if __name__ == "__main__":
+    unittest.main()
Deleted file (37 lines): the previous tests for xpathselector_iternodes
@@ -1,37 +0,0 @@
-import os
-import unittest
-
-from scrapy.utils.xml import xpathselector_iternodes
-from scrapy.http import Response
-
-class UtilsXmlTestCase(unittest.TestCase):
-
-    def test_iterator(self):
-        body = """<?xml version="1.0" encoding="UTF-8"?>
-            <products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="someschmea.xsd">
-              <product id="001">
-                <type>Type 1</type>
-                <name>Name 1</name>
-              </product>
-              <product id="002">
-                <type>Type 2</type>
-                <name>Name 2</name>
-              </product>
-            </products>
-        """
-        response = Response(domain="example.com", url="http://example.com", body=body)
-        attrs = []
-        for x in xpathselector_iternodes(response, 'product'):
-            attrs.append((x.x("@id").extract(), x.x("name/text()").extract(), x.x("./type/text()").extract()))
-
-        self.assertEqual(attrs,
-                         [(['001'], ['Name 1'], ['Type 1']), (['002'], ['Name 2'], ['Type 2'])])
-
-    def test_iterator_text(self):
-        body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
-
-        self.assertEqual([x.x("text()").extract() for x in xpathselector_iternodes(body, 'product')],
-                         [[u'one'], [u'two']])
-
-if __name__ == "__main__":
-    unittest.main()
scrapy/trunk/scrapy/utils/iterators.py (the module named in the commit message)
@@ -1,23 +1,52 @@
-import csv
+import re, csv
 
+from scrapy.xpath import XmlXPathSelector
 from scrapy.http import Response
 from scrapy import log
 
-def csv_iter(response, delimiter=None, headers=None):
-    if delimiter:
-        csv_r = csv.reader(response.body.to_unicode().split('\n'), delimiter=delimiter)
-    else:
-        csv_r = csv.reader(response.body.to_unicode().split('\n'))
+def _normalize_input(obj):
+    assert isinstance(obj, (Response, basestring)), "obj must be Response or basestring, not %s" % type(obj).__name__
+    if isinstance(obj, Response):
+        return obj.body.to_unicode()
+    elif isinstance(obj, str):
+        return obj.decode('utf-8')
+    else:
+        return obj
+
+def xmliter(obj, nodename):
+    """Return an iterator of XPathSelectors over all nodes of an XML document,
+    given the name of the node to iterate over. Useful for parsing XML feeds.
+
+    obj can be:
+    - a Response object
+    - a unicode string
+    - a string encoded as utf-8
+    """
+    text = _normalize_input(obj)
+
+    r = re.compile(r"<%s[\s>].*?</%s>" % (nodename, nodename), re.DOTALL)
+    for match in r.finditer(text):
+        nodetext = match.group()
+        yield XmlXPathSelector(text=nodetext).x('/' + nodename)[0]
+
+def csviter(obj, delimiter=None, headers=None):
+    def _getrow(csv_r):
+        return [field.decode() for field in csv_r.next()]
+
+    lines = _normalize_input(obj).splitlines(True)
+    if delimiter:
+        csv_r = csv.reader(lines, delimiter=delimiter)
+    else:
+        csv_r = csv.reader(lines)
 
     if not headers:
-        headers = csv_r.next()
+        headers = _getrow(csv_r)
 
     while True:
-        node = csv_r.next()
-        if len(node) != len(headers):
-            log.msg("ignoring node %d (length: %d, should be: %d)" % (csv_r.line_num, len(node), len(headers)), log.WARNING)
+        row = _getrow(csv_r)
+        if len(row) != len(headers):
+            log.msg("ignoring row %d (length: %d, should be: %d)" % (csv_r.line_num, len(row), len(headers)), log.WARNING)
             continue
         else:
-            yield dict(zip(headers, node))
+            yield dict(zip(headers, row))
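Both iterators now route through _normalize_input, so each accepts a Response or a plain string and always iterates over unicode text. A short usage sketch (not part of the diff; the Response constructor arguments follow the tests above):

    from scrapy.http import Response
    from scrapy.utils.iterators import xmliter, csviter

    # xmliter parses one matching node at a time instead of building a DOM
    # for the whole feed up front.
    feed = u'<products><product><name>a</name></product><product><name>b</name></product></products>'
    names = [sel.x('name/text()').extract() for sel in xmliter(feed, 'product')]
    # names == [[u'a'], [u'b']]

    # csviter takes the first row as dict keys unless headers= is given.
    # splitlines(True) keeps line endings, which is what lets csv.reader
    # reassemble the quoted field that spans two physical lines here.
    response = Response(domain="example.com", url="http://example.com/",
                        body='id,value\n1,"multi\nline"\n')
    rows = list(csviter(response))
    # rows == [{u'id': u'1', u'value': u'multi\nline'}]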
scrapy/trunk/scrapy/utils/xml.py (deleted file; path per the commit message)
@@ -1,28 +0,0 @@
-import re
-
-from scrapy.xpath import XmlXPathSelector
-from scrapy.http import Response
-
-def xpathselector_iternodes(obj, nodename):
-    """Return an iterator of XPathSelectors over all nodes of an XML document,
-    given the name of the node to iterate over. Useful for parsing XML feeds.
-
-    obj can be:
-    - a Response object
-    - a unicode string
-    - a string encoded as utf-8
-    """
-    assert isinstance(obj, (Response, basestring)), "obj must be Response or basestring, not %s" % type(obj).__name__
-
-    if isinstance(obj, Response):
-        text = obj.body.to_unicode()
-    elif isinstance(obj, str):
-        text = obj.decode('utf-8')
-    else:
-        text = obj
-
-    r = re.compile(r"<%s[\s>].*?</%s>" % (nodename, nodename), re.DOTALL)
-    for match in r.finditer(text):
-        nodetext = match.group()
-        yield XmlXPathSelector(text=nodetext).x('/' + nodename)[0]