mirror of
https://github.com/scrapy/scrapy.git
synced 2025-03-14 16:28:31 +00:00
Fix namespaces nodename support for xmliter_lxml
This commit is contained in:
parent
1533b69032
commit
4f72b49f97
@ -12,11 +12,14 @@ from typing import (
|
|||||||
List,
|
List,
|
||||||
Literal,
|
Literal,
|
||||||
Optional,
|
Optional,
|
||||||
|
Tuple,
|
||||||
Union,
|
Union,
|
||||||
cast,
|
cast,
|
||||||
overload,
|
overload,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
from scrapy.http import Response, TextResponse
|
from scrapy.http import Response, TextResponse
|
||||||
from scrapy.selector import Selector
|
from scrapy.selector import Selector
|
||||||
from scrapy.utils.python import re_rsearch, to_unicode
|
from scrapy.utils.python import re_rsearch, to_unicode
|
||||||
@ -77,15 +80,31 @@ def xmliter(
|
|||||||
yield Selector(text=nodetext, type="xml")
|
yield Selector(text=nodetext, type="xml")
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_xml_namespace(
    element_name: str, data: "Union[Response, str, bytes]"
) -> Tuple[str, Optional[str], Optional[str]]:
    """Split a prefixed node name and find the namespace bound to its prefix.

    :param element_name: node name, optionally written as ``prefix:name``.
    :param data: the XML input; it is wrapped in a ``_StreamReader`` and
        scanned for namespace declarations.
    :return: a ``(name, prefix, namespace)`` tuple. When ``element_name``
        contains no ``:``, or when no namespace declaration in ``data`` uses
        its prefix, ``element_name`` is returned unchanged and both ``prefix``
        and ``namespace`` are ``None``. Otherwise ``name`` is the part after
        the first ``:`` and ``prefix``/``namespace`` come from the first
        matching declaration.
    """
    if ":" not in element_name:
        # Nothing to resolve: return before touching the input at all.
        return element_name, None, None

    # Only the first colon separates the prefix from the local name.
    node_prefix, local_name = element_name.split(":", maxsplit=1)

    reader: "SupportsReadClose[bytes]" = _StreamReader(data)
    # Restricting the events to "start-ns" makes the parser report only
    # namespace declarations, each as a (prefix, namespace) pair.
    ns_iterator = etree.iterparse(
        reader, encoding=reader.encoding, events=("start-ns",)
    )
    for _event, (found_prefix, found_namespace) in ns_iterator:
        if found_prefix == node_prefix:
            # The first declaration that uses the requested prefix wins.
            return local_name, found_prefix, found_namespace

    # The prefix is never declared: hand the name back exactly as given so
    # the caller can still match it literally.
    return element_name, None, None
|
||||||
|
|
||||||
|
|
||||||
def xmliter_lxml(
|
def xmliter_lxml(
|
||||||
obj: Union[Response, str, bytes],
|
obj: Union[Response, str, bytes],
|
||||||
nodename: str,
|
nodename: str,
|
||||||
namespace: Optional[str] = None,
|
namespace: Optional[str] = None,
|
||||||
prefix: str = "x",
|
prefix: str = "x",
|
||||||
) -> Generator[Selector, Any, None]:
|
) -> Generator[Selector, Any, None]:
|
||||||
from lxml import etree
|
if not namespace:
|
||||||
|
nodename, prefix, namespace = _resolve_xml_namespace(nodename, obj)
|
||||||
|
|
||||||
reader = _StreamReader(obj)
|
reader: "SupportsReadClose[bytes]" = _StreamReader(obj)
|
||||||
tag = f"{{{namespace}}}{nodename}" if namespace else nodename
|
tag = f"{{{namespace}}}{nodename}" if namespace else nodename
|
||||||
iterable = etree.iterparse(
|
iterable = etree.iterparse(
|
||||||
cast("SupportsReadClose[bytes]", reader), tag=tag, encoding=reader.encoding
|
cast("SupportsReadClose[bytes]", reader), tag=tag, encoding=reader.encoding
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
from pytest import mark
|
|
||||||
from twisted.trial import unittest
|
from twisted.trial import unittest
|
||||||
|
|
||||||
from scrapy.http import Response, TextResponse, XmlResponse
|
from scrapy.http import Response, TextResponse, XmlResponse
|
||||||
@ -247,10 +246,6 @@ class XmliterTestCase(unittest.TestCase):
|
|||||||
class LxmlXmliterTestCase(XmliterTestCase):
|
class LxmlXmliterTestCase(XmliterTestCase):
|
||||||
xmliter = staticmethod(xmliter_lxml)
|
xmliter = staticmethod(xmliter_lxml)
|
||||||
|
|
||||||
@mark.xfail(reason="known bug of the current implementation")
|
|
||||||
def test_xmliter_namespaced_nodename(self):
|
|
||||||
super().test_xmliter_namespaced_nodename()
|
|
||||||
|
|
||||||
def test_xmliter_iterate_namespace(self):
|
def test_xmliter_iterate_namespace(self):
|
||||||
body = b"""
|
body = b"""
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user