1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-03-14 15:48:38 +00:00

fix xmliter namespace on selected node

This commit is contained in:
nramirezuy 2014-08-19 13:57:00 -03:00
parent 8360380db0
commit 2a540206a7
2 changed files with 20 additions and 8 deletions

View File

@@ -20,19 +20,27 @@ def xmliter(obj, nodename):
- a unicode string
- a string encoded as utf-8
"""
HEADER_START_RE = re.compile(r'^(.*?)<\s*%s(?:\s|>)' % nodename, re.S)
DOCUMENT_HEADER_RE = re.compile(r'<\?xml[^>]+>\s*', re.S)
HEADER_END_RE = re.compile(r'<\s*/%s\s*>' % nodename, re.S)
END_TAG_RE = re.compile(r'<\s*/([^\s>]+)\s*>', re.S)
NAMESPACE_RE = re.compile(r'((xmlns[:A-Za-z]*)=[^>\s]+)', re.S)
text = _body_or_str(obj)
header_start = re.search(HEADER_START_RE, text)
header_start = header_start.group(1).strip() if header_start else ''
header_end = re_rsearch(HEADER_END_RE, text)
header_end = text[header_end[1]:].strip() if header_end else ''
document_header = re.search(DOCUMENT_HEADER_RE, text)
document_header = document_header.group().strip() if document_header else ''
header_end_idx = re_rsearch(HEADER_END_RE, text)
header_end = text[header_end_idx[1]:].strip() if header_end_idx else ''
namespaces = {}
if header_end:
for tagname in reversed(re.findall(END_TAG_RE, header_end)):
tag = re.search(r'<\s*%s.*?xmlns[:=][^>]*>' % tagname, text[:header_end_idx[1]], re.S)
if tag:
namespaces.update(reversed(x) for x in re.findall(NAMESPACE_RE, tag.group()))
r = re.compile(r"<%s[\s>].*?</%s>" % (nodename, nodename), re.DOTALL)
for match in r.finditer(text):
nodetext = header_start + match.group() + header_end
yield Selector(text=nodetext, type='xml').xpath('//' + nodename)[0]
nodetext = document_header + match.group().replace(nodename, '%s %s' % (nodename, ' '.join(namespaces.values())), 1) + header_end
yield Selector(text=nodetext, type='xml')
def csviter(obj, delimiter=None, headers=None, encoding=None):

View File

@@ -61,7 +61,6 @@ class XmliterTestCase(unittest.TestCase):
"""
response = XmlResponse(url='http://mydummycompany.com', body=body)
my_iter = self.xmliter(response, 'item')
node = next(my_iter)
node.register_namespace('g', 'http://base.google.com/ns/1.0')
self.assertEqual(node.xpath('title/text()').extract(), ['Item 1'])
@@ -74,6 +73,11 @@ class XmliterTestCase(unittest.TestCase):
self.assertEqual(node.xpath('id/text()').extract(), [])
self.assertEqual(node.xpath('price/text()').extract(), [])
my_iter = self.xmliter(response, 'g:image_link')
node = next(my_iter)
node.register_namespace('g', 'http://base.google.com/ns/1.0')
self.assertEqual(node.xpath('text()').extract(), ['http://www.mydummycompany.com/images/item1.jpg'])
def test_xmliter_exception(self):
body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""