
Deprecate xmliter in favor of xmliter_lxml

Adrián Chaves 2023-12-15 11:42:55 +01:00
parent 2538c0e862
commit 150d96764b
6 changed files with 110 additions and 44 deletions

docs/faq.rst

@@ -297,9 +297,13 @@ build the DOM of the entire feed in memory, and this can be quite slow and
consume a lot of memory.
In order to avoid parsing the entire feed at once in memory, you can use
the functions ``xmliter`` and ``csviter`` from ``scrapy.utils.iterators``
module. In fact, this is what the feed spiders (see :ref:`topics-spiders`) use
under the cover.
the :func:`~scrapy.utils.iterators.xmliter_lxml` and
:func:`~scrapy.utils.iterators.csviter` functions. In fact, this is what
:class:`~scrapy.spiders.XMLFeedSpider` uses.
.. autofunction:: scrapy.utils.iterators.xmliter_lxml
.. autofunction:: scrapy.utils.iterators.csviter
Does Scrapy manage cookies automatically?
-----------------------------------------
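For context, the pattern the updated FAQ text recommends looks like this in practice; a minimal sketch (the feed body, URL, and ``product`` tag are invented for illustration)::

    from scrapy.http import XmlResponse
    from scrapy.utils.iterators import xmliter_lxml

    # A tiny stand-in feed; in practice this would be a large downloaded response.
    body = b"""<?xml version="1.0" encoding="UTF-8"?>
    <products>
        <product><id>001</id></product>
        <product><id>002</id></product>
    </products>"""
    response = XmlResponse("https://example.com/feed.xml", body=body)

    # Nodes are yielded one at a time as Selectors, so the whole DOM is
    # never built in memory.
    for node in xmliter_lxml(response, "product"):
        print(node.xpath("id/text()").get())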

docs/news.rst

@@ -10,12 +10,23 @@ Scrapy 2.11.1 (unreleased)
**Security bug fix:**
- Fixed regular expressions susceptible to a `ReDoS attack`_ affecting the
``iternodes`` node iterator of :class:`~scrapy.spiders.XMLFeedSpider` and
the :func:`~scrapy.utils.response.open_in_browser` function. Please see
the `cc65-xxvf-f7r9 security advisory`_ for more information.
- Addressed `ReDoS vulnerabilities`_:
.. _ReDoS attack: https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
- ``scrapy.utils.iterators.xmliter`` is now deprecated in favor of
:func:`~scrapy.utils.iterators.xmliter_lxml`, which
:class:`~scrapy.spiders.XMLFeedSpider` now uses.
To minimize the impact of this change on existing code,
:func:`~scrapy.utils.iterators.xmliter_lxml` now supports indicating
the node namespace with a prefix in the node name, and big files with
highly nested trees.
- Fixed regular expressions in the implementation of the
:func:`~scrapy.utils.response.open_in_browser` function.
Please see the `cc65-xxvf-f7r9 security advisory`_ for more information.
.. _ReDoS vulnerabilities: https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
.. _cc65-xxvf-f7r9 security advisory: https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9
@@ -2892,8 +2903,15 @@ Scrapy 1.8.4 (unreleased)
**Security bug fix:**
- Fixed regular expressions susceptible to a `ReDoS attack`_ affecting the
``iternodes`` node iterator of :class:`~scrapy.spiders.XMLFeedSpider`.
- Due to its `ReDoS vulnerabilities`_, ``scrapy.utils.iterators.xmliter`` is
now deprecated in favor of :func:`~scrapy.utils.iterators.xmliter_lxml`,
which :class:`~scrapy.spiders.XMLFeedSpider` now uses.
To minimize the impact of this change on existing code,
:func:`~scrapy.utils.iterators.xmliter_lxml` now supports indicating
the node namespace as a prefix in the node name, and big files with highly
nested trees when using lxml 4.2 or later.
Please see the `cc65-xxvf-f7r9 security advisory`_ for more information.
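To illustrate the compatibility note above, a sketch of both ways :func:`~scrapy.utils.iterators.xmliter_lxml` can address namespaced nodes (the feed content here is invented)::

    from scrapy.utils.iterators import xmliter_lxml

    body = b"""<?xml version="1.0" encoding="UTF-8"?>
    <rss xmlns:g="http://base.google.com/ns/1.0">
        <item><g:image_link>http://example.com/a.jpg</g:image_link></item>
    </rss>"""

    # Passing the namespace URI explicitly, as before:
    for node in xmliter_lxml(body, "image_link", "http://base.google.com/ns/1.0"):
        print(node.xpath("text()").get())

    # New: the namespace may instead be indicated with a prefix in the node
    # name, matching how deprecated xmliter calls were typically written.
    for node in xmliter_lxml(body, "g:image_link"):
        print(node.xpath("text()").get())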

scrapy/spiders/feed.py

@@ -7,7 +7,7 @@ See documentation in docs/topics/spiders.rst
from scrapy.exceptions import NotConfigured, NotSupported
from scrapy.selector import Selector
from scrapy.spiders import Spider
from scrapy.utils.iterators import csviter, xmliter
from scrapy.utils.iterators import csviter, xmliter_lxml
from scrapy.utils.spider import iterate_spider_output
@@ -84,7 +84,7 @@ class XMLFeedSpider(Spider):
return self.parse_nodes(response, nodes)
def _iternodes(self, response):
for node in xmliter(response, self.itertag):
for node in xmliter_lxml(response, self.itertag):
self._register_namespaces(node)
yield node
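Downstream spiders need no changes: with the default ``iternodes`` iterator, :class:`~scrapy.spiders.XMLFeedSpider` now iterates nodes through ``xmliter_lxml`` transparently. A minimal sketch (the spider name, URL, and tags are hypothetical)::

    from scrapy.spiders import XMLFeedSpider

    class ProductFeedSpider(XMLFeedSpider):
        name = "product_feed"
        start_urls = ["https://example.com/feed.xml"]
        iterator = "iternodes"  # the default; now backed by xmliter_lxml
        itertag = "product"

        def parse_node(self, response, node):
            # node is a Selector for a single <product> element.
            yield {"id": node.xpath("id/text()").get()}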

scrapy/utils/iterators.py

@@ -17,9 +17,12 @@ from typing import (
cast,
overload,
)
from warnings import warn
from lxml import etree
from packaging.version import Version
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Response, TextResponse
from scrapy.selector import Selector
from scrapy.utils.python import re_rsearch, to_unicode
@@ -29,6 +32,12 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
_LXML_VERSION = Version(etree.__version__)
_LXML_HUGE_TREE_VERSION = Version("4.2")
_ITERPARSE_KWARGS = {}
if _LXML_VERSION >= _LXML_HUGE_TREE_VERSION:
_ITERPARSE_KWARGS["huge_tree"] = True
def xmliter(
obj: Union[Response, str, bytes], nodename: str
@@ -41,6 +50,16 @@ def xmliter(
- a unicode string
- a string encoded as utf-8
"""
warn(
(
"xmliter is deprecated and its use strongly discouraged because "
"it is vulnerable to ReDoS attacks. Use xmliter_lxml instead. See "
"https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9"
),
ScrapyDeprecationWarning,
stacklevel=2,
)
nodename_patt = re.escape(nodename)
DOCUMENT_HEADER_RE = re.compile(r"<\?xml[^>]+>\s*", re.S)
@@ -87,7 +106,7 @@ def _resolve_xml_namespace(element_name: str, data: bytes) -> Tuple[str, str]:
reader,
encoding=reader.encoding,
events=("start-ns",),
huge_tree=True,
**_ITERPARSE_KWARGS,
)
for event, (_prefix, _namespace) in ns_iterator:
if _prefix != node_prefix:
@@ -111,7 +130,7 @@ def xmliter_lxml(
reader,
tag=tag,
encoding=reader.encoding,
huge_tree=True,
**_ITERPARSE_KWARGS,
)
selxpath = "//" + (f"{prefix}:{nodename}" if namespace else nodename)
for _, node in iterable:
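Because ``xmliter`` is a generator, the deprecation warning added above fires on the first iteration rather than at call time. One way to hunt down remaining call sites during migration is to promote the warning to an error; a sketch (the sample body is made up)::

    import warnings

    from scrapy.exceptions import ScrapyDeprecationWarning
    from scrapy.utils.iterators import xmliter

    body = b'<?xml version="1.0" encoding="UTF-8"?><products><product/></products>'

    with warnings.catch_warnings():
        # Turn the deprecation warning into a hard error.
        warnings.simplefilter("error", ScrapyDeprecationWarning)
        try:
            next(xmliter(body, "product"))
        except ScrapyDeprecationWarning as warning:
            print(f"deprecated call site found: {warning}")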

tests/test_utils_iterators.py

@@ -1,13 +1,14 @@
import pytest
from twisted.trial import unittest
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Response, TextResponse, XmlResponse
from scrapy.utils.iterators import _body_or_str, csviter, xmliter, xmliter_lxml
from tests import get_testdata
class XmliterTestCase(unittest.TestCase):
xmliter = staticmethod(xmliter)
class XmliterBaseTestCase:
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter(self):
body = b"""
<?xml version="1.0" encoding="UTF-8"?>
@@ -39,6 +40,7 @@ class XmliterTestCase(unittest.TestCase):
attrs, [("001", ["Name 1"], ["Type 1"]), ("002", ["Name 2"], ["Type 2"])]
)
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_unusual_node(self):
body = b"""<?xml version="1.0" encoding="UTF-8"?>
<root>
@@ -52,6 +54,7 @@ class XmliterTestCase(unittest.TestCase):
]
self.assertEqual(nodenames, [["matchme..."]])
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_unicode(self):
# example taken from https://github.com/scrapy/scrapy/issues/1665
body = """<?xml version="1.0" encoding="UTF-8"?>
@@ -111,6 +114,7 @@ class XmliterTestCase(unittest.TestCase):
[("26", ["-"], ["80"]), ("21", ["Ab"], ["76"]), ("27", ["A"], ["27"])],
)
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_text(self):
body = (
'<?xml version="1.0" encoding="UTF-8"?>'
@@ -122,6 +126,7 @@ class XmliterTestCase(unittest.TestCase):
[["one"], ["two"]],
)
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_namespaces(self):
body = b"""
<?xml version="1.0" encoding="UTF-8"?>
@@ -161,6 +166,7 @@ class XmliterTestCase(unittest.TestCase):
self.assertEqual(node.xpath("id/text()").getall(), [])
self.assertEqual(node.xpath("price/text()").getall(), [])
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_namespaced_nodename(self):
body = b"""
<?xml version="1.0" encoding="UTF-8"?>
@@ -189,6 +195,7 @@ class XmliterTestCase(unittest.TestCase):
["http://www.mydummycompany.com/images/item1.jpg"],
)
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_namespaced_nodename_missing(self):
body = b"""
<?xml version="1.0" encoding="UTF-8"?>
@@ -213,6 +220,7 @@ class XmliterTestCase(unittest.TestCase):
with self.assertRaises(StopIteration):
next(my_iter)
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_exception(self):
body = (
'<?xml version="1.0" encoding="UTF-8"?>'
@@ -225,10 +233,12 @@ class XmliterTestCase(unittest.TestCase):
self.assertRaises(StopIteration, next, iter)
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_objtype_exception(self):
i = self.xmliter(42, "product")
self.assertRaises(TypeError, next, i)
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_encoding(self):
body = (
b'<?xml version="1.0" encoding="ISO-8859-9"?>\n'
@@ -243,7 +253,24 @@
)
class LxmlXmliterTestCase(XmliterTestCase):
class XmliterTestCase(XmliterBaseTestCase, unittest.TestCase):
xmliter = staticmethod(xmliter)
def test_deprecation(self):
body = b"""
<?xml version="1.0" encoding="UTF-8"?>
<products>
<product></product>
</products>
"""
with pytest.warns(
ScrapyDeprecationWarning,
match="xmliter",
):
next(self.xmliter(body, "product"))
class LxmlXmliterTestCase(XmliterBaseTestCase, unittest.TestCase):
xmliter = staticmethod(xmliter_lxml)
def test_xmliter_iterate_namespace(self):
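The test refactoring above follows the usual mixin pattern for running one suite against several implementations: shared tests live in a plain base class, and each concrete ``unittest.TestCase`` pins the callable under test. A standalone sketch of the idea::

    import unittest

    class _UpperBase:
        # Concrete subclasses assign the callable under test; the base class
        # is not itself a TestCase, so it is never collected on its own.
        func = staticmethod(lambda s: s)

        def test_basic(self):
            assert self.func("abc") == "ABC"

    class StrUpperTestCase(_UpperBase, unittest.TestCase):
        func = staticmethod(str.upper)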

tests/test_utils_response.py

@@ -203,40 +203,38 @@ class ResponseUtilsTest(unittest.TestCase):
r5, _openfunc=check_base_url
), "Inject unique base url with conditional comment"
@pytest.mark.slow
def test_open_in_browser_redos_comment(self):
MAX_CPU_TIME = 30
# Exploit input from
# https://makenowjust-labs.github.io/recheck/playground/
# for /<!--.*?-->/ (old pattern to remove comments).
body = b"-><!--\x00" * (int(DOWNLOAD_MAXSIZE / 7) - 10) + b"->\n<!---->"
response = HtmlResponse("https://example.com", body=body)
start_time = process_time()
open_in_browser(response, lambda url: True)
end_time = process_time()
self.assertLess(end_time - start_time, MAX_CPU_TIME)
@pytest.mark.slow
def test_open_in_browser_redos_head(self):
MAX_CPU_TIME = 15
# Exploit input from
# https://makenowjust-labs.github.io/recheck/playground/
# for /(<head(?:>|\s.*?>))/ (old pattern to find the head element).
body = b"<head\t" * int(DOWNLOAD_MAXSIZE / 6)
response = HtmlResponse("https://example.com", body=body)
start_time = process_time()
open_in_browser(response, lambda url: True)
end_time = process_time()
self.assertLess(end_time - start_time, MAX_CPU_TIME)
@pytest.mark.slow
def test_open_in_browser_redos_comment():
MAX_CPU_TIME = 30
# Exploit input from
# https://makenowjust-labs.github.io/recheck/playground/
# for /<!--.*?-->/ (old pattern to remove comments).
body = b"-><!--\x00" * (int(DOWNLOAD_MAXSIZE / 7) - 10) + b"->\n<!---->"
response = HtmlResponse("https://example.com", body=body)
start_time = process_time()
open_in_browser(response, lambda url: True)
end_time = process_time()
assert (end_time - start_time) < MAX_CPU_TIME
@pytest.mark.slow
def test_open_in_browser_redos_head():
MAX_CPU_TIME = 15
# Exploit input from
# https://makenowjust-labs.github.io/recheck/playground/
# for /(<head(?:>|\s.*?>))/ (old pattern to find the head element).
body = b"<head\t" * int(DOWNLOAD_MAXSIZE / 6)
response = HtmlResponse("https://example.com", body=body)
start_time = process_time()
open_in_browser(response, lambda url: True)
end_time = process_time()
assert (end_time - start_time) < MAX_CPU_TIME