mirror of https://github.com/scrapy/scrapy.git synced 2025-03-15 02:10:36 +00:00

Deprecate xmliter in favor of xmliter_lxml

Adrián Chaves 2023-12-15 11:42:55 +01:00
parent 2538c0e862
commit 150d96764b
6 changed files with 110 additions and 44 deletions

docs/faq.rst

@@ -297,9 +297,13 @@ build the DOM of the entire feed in memory, and this can be quite slow and
 consume a lot of memory.
 
 In order to avoid parsing the entire feed at once in memory, you can use
-the functions ``xmliter`` and ``csviter`` from ``scrapy.utils.iterators``
-module. In fact, this is what the feed spiders (see :ref:`topics-spiders`) use
-under the cover.
+the :func:`~scrapy.utils.iterators.xmliter_lxml` and
+:func:`~scrapy.utils.iterators.csviter` functions. In fact, this is what
+:class:`~scrapy.spiders.XMLFeedSpider` uses.
+
+.. autofunction:: scrapy.utils.iterators.xmliter_lxml
+
+.. autofunction:: scrapy.utils.iterators.csviter
 
 Does Scrapy manage cookies automatically?
 -----------------------------------------
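
For reference, a minimal usage sketch of the streaming iterator (the feed body
is illustrative; the call signature matches the tests in this commit):

    from scrapy.utils.iterators import xmliter_lxml

    body = b"""<?xml version="1.0" encoding="UTF-8"?>
    <products>
        <product id="001"><name>Name 1</name></product>
        <product id="002"><name>Name 2</name></product>
    </products>"""

    # Each iteration yields a Selector scoped to one <product> node, so the
    # whole document is never built as a single DOM in memory.
    for product in xmliter_lxml(body, "product"):
        print(product.attrib["id"], product.xpath("name/text()").get())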

docs/news.rst

@@ -10,12 +10,23 @@ Scrapy 2.11.1 (unreleased)
 
 **Security bug fix:**
 
-- Fixed regular expressions susceptible to a `ReDoS attack`_ affecting the
-  ``iternodes`` node iterator of :class:`~scrapy.spiders.XMLFeedSpider` and
-  the :func:`~scrapy.utils.response.open_in_browser` function. Please, see
-  the `cc65-xxvf-f7r9 security advisory`_ for more information.
+- Addressed `ReDoS vulnerabilities`_:
+
+  - ``scrapy.utils.iterators.xmliter`` is now deprecated in favor of
+    :func:`~scrapy.utils.iterators.xmliter_lxml`, which
+    :class:`~scrapy.spiders.XMLFeedSpider` now uses.
+
+    To minimize the impact of this change on existing code,
+    :func:`~scrapy.utils.iterators.xmliter_lxml` now supports indicating
+    the node namespace with a prefix in the node name, and big files with
+    highly nested trees.
+
+  - Fixed regular expressions in the implementation of the
+    :func:`~scrapy.utils.response.open_in_browser` function.
+
+  Please, see the `cc65-xxvf-f7r9 security advisory`_ for more information.
 
-.. _ReDoS attack: https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
+.. _ReDoS vulnerabilities: https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
 .. _cc65-xxvf-f7r9 security advisory: https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9
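
The namespace-handling change called out above means a prefixed node name is
enough; the prefix is resolved against the xmlns declarations of the document
itself. A sketch distilled from this commit's test_xmliter_namespaced_nodename:

    from scrapy.utils.iterators import xmliter_lxml

    body = b"""<?xml version="1.0" encoding="UTF-8"?>
    <rss version="2.0" xmlns:g="http://base.google.com/ns/1.0">
        <channel>
            <item>
                <g:image_link>http://www.mydummycompany.com/images/item1.jpg</g:image_link>
            </item>
        </channel>
    </rss>"""

    # No separate namespace argument: "g" is looked up in the document.
    node = next(xmliter_lxml(body, "g:image_link"))
    print(node.xpath("text()").get())
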
@@ -2892,8 +2903,15 @@ Scrapy 1.8.4 (unreleased)
 
 **Security bug fix:**
 
-- Fixed regular expressions susceptible to a `ReDoS attack`_ affecting the
-  ``iternodes`` node iterator of :class:`~scrapy.spiders.XMLFeedSpider`.
+- Due to its `ReDoS vulnerabilities`_, ``scrapy.utils.iterators.xmliter`` is
+  now deprecated in favor of :func:`~scrapy.utils.iterators.xmliter_lxml`,
+  which :class:`~scrapy.spiders.XMLFeedSpider` now uses.
+
+  To minimize the impact of this change on existing code,
+  :func:`~scrapy.utils.iterators.xmliter_lxml` now supports indicating
+  the node namespace as a prefix in the node name, and big files with highly
+  nested trees when using lxml 4.2 or later.
+
   Please, see the `cc65-xxvf-f7r9 security advisory`_ for more information.

scrapy/spiders/feed.py

@@ -7,7 +7,7 @@ See documentation in docs/topics/spiders.rst
 
 from scrapy.exceptions import NotConfigured, NotSupported
 from scrapy.selector import Selector
 from scrapy.spiders import Spider
-from scrapy.utils.iterators import csviter, xmliter
+from scrapy.utils.iterators import csviter, xmliter_lxml
 from scrapy.utils.spider import iterate_spider_output
@@ -84,7 +84,7 @@ class XMLFeedSpider(Spider):
         return self.parse_nodes(response, nodes)
 
     def _iternodes(self, response):
-        for node in xmliter(response, self.itertag):
+        for node in xmliter_lxml(response, self.itertag):
             self._register_namespaces(node)
             yield node
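
Spider-facing code is unchanged by this swap; the default iternodes iterator
is simply backed by xmliter_lxml now. A minimal sketch of the unchanged
XMLFeedSpider API (spider name and URL are illustrative):

    from scrapy.spiders import XMLFeedSpider

    class ProductSpider(XMLFeedSpider):
        name = "products"  # hypothetical example spider
        start_urls = ["https://example.com/feed.xml"]
        iterator = "iternodes"  # the default, now backed by xmliter_lxml
        itertag = "product"  # iterate over <product> nodes

        def parse_node(self, response, node):
            # node is a Selector scoped to one <product> element
            yield {"name": node.xpath("name/text()").get()}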

scrapy/utils/iterators.py

@@ -17,9 +17,12 @@ from typing import (
     cast,
     overload,
 )
+from warnings import warn
 
 from lxml import etree
+from packaging.version import Version
 
+from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.http import Response, TextResponse
 from scrapy.selector import Selector
 from scrapy.utils.python import re_rsearch, to_unicode
@@ -29,6 +32,12 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
+_LXML_VERSION = Version(etree.__version__)
+_LXML_HUGE_TREE_VERSION = Version("4.2")
+_ITERPARSE_KWARGS = {}
+if _LXML_VERSION >= _LXML_HUGE_TREE_VERSION:
+    _ITERPARSE_KWARGS["huge_tree"] = True
+
 
 def xmliter(
     obj: Union[Response, str, bytes], nodename: str
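
The block above is a version gate: huge_tree, which lifts libxml2's depth and
size limits, is only forwarded on lxml 4.2 or later, where etree.iterparse()
accepts it. Restated in isolation (a sketch; feed.xml is a placeholder path):

    from lxml import etree
    from packaging.version import Version

    kwargs = {}
    if Version(etree.__version__) >= Version("4.2"):
        kwargs["huge_tree"] = True  # unknown keyword on older lxml

    for _, node in etree.iterparse("feed.xml", tag="product", **kwargs):
        ...  # process one <product> element at a time
        node.clear()  # free the parsed subtree as you go
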
@@ -41,6 +50,16 @@ def xmliter(
     - a unicode string
     - a string encoded as utf-8
     """
+    warn(
+        (
+            "xmliter is deprecated and its use strongly discouraged because "
+            "it is vulnerable to ReDoS attacks. Use xmliter_lxml instead. See "
+            "https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9"
+        ),
+        ScrapyDeprecationWarning,
+        stacklevel=2,
+    )
+
     nodename_patt = re.escape(nodename)
 
     DOCUMENT_HEADER_RE = re.compile(r"<\?xml[^>]+>\s*", re.S)
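
Downstream callers of xmliter now get this warning on first iteration, since
xmliter is a generator and the warn() call runs on the first next(). The real
fix is switching to xmliter_lxml; as a stopgap, the warning can be silenced
explicitly (a sketch; the body is illustrative, and the ReDoS risk remains):

    import warnings

    from scrapy.exceptions import ScrapyDeprecationWarning
    from scrapy.utils.iterators import xmliter

    body = b'<?xml version="1.0" encoding="UTF-8"?><products><product>one</product></products>'

    with warnings.catch_warnings():
        # Hides the deprecation warning; does not make xmliter safe.
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        for node in xmliter(body, "product"):
            print(node.xpath("text()").get())
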
@@ -87,7 +106,7 @@ def _resolve_xml_namespace(element_name: str, data: bytes) -> Tuple[str, str]:
         reader,
         encoding=reader.encoding,
         events=("start-ns",),
-        huge_tree=True,
+        **_ITERPARSE_KWARGS,
     )
     for event, (_prefix, _namespace) in ns_iterator:
         if _prefix != node_prefix:
@@ -111,7 +130,7 @@ def xmliter_lxml(
         reader,
         tag=tag,
         encoding=reader.encoding,
-        huge_tree=True,
+        **_ITERPARSE_KWARGS,
     )
     selxpath = "//" + (f"{prefix}:{nodename}" if namespace else nodename)
     for _, node in iterable:

tests/test_utils_iterators.py

@@ -1,13 +1,14 @@
+import pytest
 from twisted.trial import unittest
 
+from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.http import Response, TextResponse, XmlResponse
 from scrapy.utils.iterators import _body_or_str, csviter, xmliter, xmliter_lxml
 from tests import get_testdata
 
 
-class XmliterTestCase(unittest.TestCase):
-    xmliter = staticmethod(xmliter)
-
+class XmliterBaseTestCase:
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter(self):
         body = b"""
             <?xml version="1.0" encoding="UTF-8"?>
@@ -39,6 +40,7 @@ class XmliterTestCase(unittest.TestCase):
             attrs, [("001", ["Name 1"], ["Type 1"]), ("002", ["Name 2"], ["Type 2"])]
         )
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_unusual_node(self):
         body = b"""<?xml version="1.0" encoding="UTF-8"?>
             <root>
@@ -52,6 +54,7 @@ class XmliterTestCase(unittest.TestCase):
         ]
         self.assertEqual(nodenames, [["matchme..."]])
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_unicode(self):
         # example taken from https://github.com/scrapy/scrapy/issues/1665
         body = """<?xml version="1.0" encoding="UTF-8"?>
@@ -111,6 +114,7 @@ class XmliterTestCase(unittest.TestCase):
             [("26", ["-"], ["80"]), ("21", ["Ab"], ["76"]), ("27", ["A"], ["27"])],
         )
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_text(self):
         body = (
             '<?xml version="1.0" encoding="UTF-8"?>'
@@ -122,6 +126,7 @@ class XmliterTestCase(unittest.TestCase):
             [["one"], ["two"]],
         )
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_namespaces(self):
         body = b"""
             <?xml version="1.0" encoding="UTF-8"?>
@@ -161,6 +166,7 @@ class XmliterTestCase(unittest.TestCase):
         self.assertEqual(node.xpath("id/text()").getall(), [])
         self.assertEqual(node.xpath("price/text()").getall(), [])
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_namespaced_nodename(self):
         body = b"""
             <?xml version="1.0" encoding="UTF-8"?>
@@ -189,6 +195,7 @@ class XmliterTestCase(unittest.TestCase):
             ["http://www.mydummycompany.com/images/item1.jpg"],
         )
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
    def test_xmliter_namespaced_nodename_missing(self):
         body = b"""
             <?xml version="1.0" encoding="UTF-8"?>
@@ -213,6 +220,7 @@ class XmliterTestCase(unittest.TestCase):
         with self.assertRaises(StopIteration):
             next(my_iter)
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_exception(self):
         body = (
             '<?xml version="1.0" encoding="UTF-8"?>'
@@ -225,10 +233,12 @@ class XmliterTestCase(unittest.TestCase):
 
         self.assertRaises(StopIteration, next, iter)
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_objtype_exception(self):
         i = self.xmliter(42, "product")
         self.assertRaises(TypeError, next, i)
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_encoding(self):
         body = (
             b'<?xml version="1.0" encoding="ISO-8859-9"?>\n'
@@ -243,7 +253,24 @@ class XmliterTestCase(unittest.TestCase):
         )
 
 
-class LxmlXmliterTestCase(XmliterTestCase):
+class XmliterTestCase(XmliterBaseTestCase, unittest.TestCase):
+    xmliter = staticmethod(xmliter)
+
+    def test_deprecation(self):
+        body = b"""
+            <?xml version="1.0" encoding="UTF-8"?>
+            <products>
+                <product></product>
+            </products>
+        """
+        with pytest.warns(
+            ScrapyDeprecationWarning,
+            match="xmliter",
+        ):
+            next(self.xmliter(body, "product"))
+
+
+class LxmlXmliterTestCase(XmliterBaseTestCase, unittest.TestCase):
     xmliter = staticmethod(xmliter_lxml)
 
     def test_xmliter_iterate_namespace(self):

tests/test_utils_response.py

@@ -203,40 +203,38 @@ class ResponseUtilsTest(unittest.TestCase):
             r5, _openfunc=check_base_url
         ), "Inject unique base url with conditional comment"
 
-
-@pytest.mark.slow
-def test_open_in_browser_redos_comment():
-    MAX_CPU_TIME = 30
-
-    # Exploit input from
-    # https://makenowjust-labs.github.io/recheck/playground/
-    # for /<!--.*?-->/ (old pattern to remove comments).
-    body = b"-><!--\x00" * (int(DOWNLOAD_MAXSIZE / 7) - 10) + b"->\n<!---->"
-    response = HtmlResponse("https://example.com", body=body)
-
-    start_time = process_time()
-    open_in_browser(response, lambda url: True)
-    end_time = process_time()
-    assert (end_time - start_time) < MAX_CPU_TIME
-
-
-@pytest.mark.slow
-def test_open_in_browser_redos_head():
-    MAX_CPU_TIME = 15
-
-    # Exploit input from
-    # https://makenowjust-labs.github.io/recheck/playground/
-    # for /(<head(?:>|\s.*?>))/ (old pattern to find the head element).
-    body = b"<head\t" * int(DOWNLOAD_MAXSIZE / 6)
-    response = HtmlResponse("https://example.com", body=body)
-
-    start_time = process_time()
-    open_in_browser(response, lambda url: True)
-    end_time = process_time()
-    assert (end_time - start_time) < MAX_CPU_TIME
+    @pytest.mark.slow
+    def test_open_in_browser_redos_comment(self):
+        MAX_CPU_TIME = 30
+
+        # Exploit input from
+        # https://makenowjust-labs.github.io/recheck/playground/
+        # for /<!--.*?-->/ (old pattern to remove comments).
+        body = b"-><!--\x00" * (int(DOWNLOAD_MAXSIZE / 7) - 10) + b"->\n<!---->"
+        response = HtmlResponse("https://example.com", body=body)
+
+        start_time = process_time()
+        open_in_browser(response, lambda url: True)
+        end_time = process_time()
+        self.assertLess(end_time - start_time, MAX_CPU_TIME)
+
+    @pytest.mark.slow
+    def test_open_in_browser_redos_head(self):
+        MAX_CPU_TIME = 15
+
+        # Exploit input from
+        # https://makenowjust-labs.github.io/recheck/playground/
+        # for /(<head(?:>|\s.*?>))/ (old pattern to find the head element).
+        body = b"<head\t" * int(DOWNLOAD_MAXSIZE / 6)
+        response = HtmlResponse("https://example.com", body=body)
+
+        start_time = process_time()
+        open_in_browser(response, lambda url: True)
+        end_time = process_time()
+        self.assertLess(end_time - start_time, MAX_CPU_TIME)
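
For intuition about what these regression tests guard against: assuming the
old comment pattern was applied without re.DOTALL (the newline in the exploit
body suggests so), every "<!--" opener in the prefix forces the lazy .*? to
scan ahead to the newline and fail, so total work grows quadratically with
the repeat count. A standalone sketch, not Scrapy code:

    import re
    from time import process_time

    OLD_COMMENT_RE = re.compile(rb"<!--.*?-->")  # old pattern, per the test comment

    for repeats in (2_000, 4_000, 8_000):
        # Many unterminated "<!--" openers, one real comment after a newline.
        body = b"-><!--\x00" * repeats + b"->\n<!---->"
        start = process_time()
        OLD_COMMENT_RE.sub(b"", body)
        # CPU time roughly quadruples each time the input doubles.
        print(repeats, round(process_time() - start, 2))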