1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-03-14 13:08:16 +00:00

Fix namespaces nodename support for xmliter_lxml

This commit is contained in:
Adrián Chaves 2023-12-15 10:06:13 +01:00
parent 1533b69032
commit 4f72b49f97
2 changed files with 21 additions and 7 deletions

View File

@ -12,11 +12,14 @@ from typing import (
List,
Literal,
Optional,
Tuple,
Union,
cast,
overload,
)
from lxml import etree
from scrapy.http import Response, TextResponse
from scrapy.selector import Selector
from scrapy.utils.python import re_rsearch, to_unicode
@ -77,15 +80,31 @@ def xmliter(
yield Selector(text=nodetext, type="xml")
def _resolve_xml_namespace(
    element_name: str, data: "Union[Response, str, bytes]"
) -> Tuple[str, Optional[str], Optional[str]]:
    """Resolve the namespace of a ``prefix:name`` node name against *data*.

    If *element_name* contains no ``:``, it is returned unchanged and no
    parsing is done. Otherwise the part before the first ``:`` is treated as
    a namespace prefix and *data* is scanned for namespace declarations
    until one that binds that prefix is found.

    :param element_name: node name, optionally qualified as ``prefix:name``.
    :param data: XML input; it is handed to ``_StreamReader`` as is.
    :return: a ``(name, prefix, namespace)`` tuple. When the prefix is
        declared in *data*, ``name`` is the local part of *element_name* and
        ``namespace`` is the URI from the first matching declaration.
        Otherwise ``name`` is *element_name* unchanged and both ``prefix``
        and ``namespace`` are ``None``.
    """
    if ":" not in element_name:
        return element_name, None, None

    node_prefix, local_name = element_name.split(":", maxsplit=1)
    reader: "SupportsReadClose[bytes]" = _StreamReader(data)
    # Only "start-ns" events are requested, so each item carries a
    # (prefix, namespace URI) pair instead of an element.
    ns_iterator = etree.iterparse(
        reader, encoding=reader.encoding, events=("start-ns",)
    )
    for _event, (declared_prefix, namespace_uri) in ns_iterator:
        if declared_prefix == node_prefix:
            # Stop at the first declaration that binds the requested prefix.
            return local_name, declared_prefix, namespace_uri
    # The prefix is never declared: hand the qualified name back untouched.
    return element_name, None, None
def xmliter_lxml(
obj: Union[Response, str, bytes],
nodename: str,
namespace: Optional[str] = None,
prefix: str = "x",
) -> Generator[Selector, Any, None]:
from lxml import etree
if not namespace:
nodename, prefix, namespace = _resolve_xml_namespace(nodename, obj)
reader = _StreamReader(obj)
reader: "SupportsReadClose[bytes]" = _StreamReader(obj)
tag = f"{{{namespace}}}{nodename}" if namespace else nodename
iterable = etree.iterparse(
cast("SupportsReadClose[bytes]", reader), tag=tag, encoding=reader.encoding

View File

@ -1,4 +1,3 @@
from pytest import mark
from twisted.trial import unittest
from scrapy.http import Response, TextResponse, XmlResponse
@ -247,10 +246,6 @@ class XmliterTestCase(unittest.TestCase):
class LxmlXmliterTestCase(XmliterTestCase):
xmliter = staticmethod(xmliter_lxml)
@mark.xfail(reason="known bug of the current implementation")
def test_xmliter_namespaced_nodename(self):
super().test_xmliter_namespaced_nodename()
def test_xmliter_iterate_namespace(self):
body = b"""
<?xml version="1.0" encoding="UTF-8"?>