diff --git a/docs/faq.rst b/docs/faq.rst index 20dd814df..657802fd3 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -297,9 +297,13 @@ build the DOM of the entire feed in memory, and this can be quite slow and consume a lot of memory. In order to avoid parsing all the entire feed at once in memory, you can use -the functions ``xmliter`` and ``csviter`` from ``scrapy.utils.iterators`` -module. In fact, this is what the feed spiders (see :ref:`topics-spiders`) use -under the cover. +the :func:`~scrapy.utils.iterators.xmliter_lxml` and +:func:`~scrapy.utils.iterators.csviter` functions. In fact, this is what +:class:`~scrapy.spiders.XMLFeedSpider` uses. + +.. autofunction:: scrapy.utils.iterators.xmliter_lxml + +.. autofunction:: scrapy.utils.iterators.csviter Does Scrapy manage cookies automatically? ----------------------------------------- diff --git a/docs/news.rst b/docs/news.rst index c14815d06..57b99e94f 100644 --- a/docs/news.rst +++ b/docs/news.rst @@ -10,12 +10,23 @@ Scrapy 2.11.1 (unreleased) **Security bug fix:** -- Fixed regular expressions susceptible to a `ReDoS attack`_ affecting the - ``iternodes`` node iterator of :class:`~scrapy.spiders.XMLFeedSpider` and - the :func:`~scrapy.utils.response.open_in_browser` function. Please, see - the `cc65-xxvf-f7r9 security advisory`_ for more information. +- Addressed `ReDoS vulnerabilities`_: - .. _ReDoS attack: https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS + - ``scrapy.utils.iterators.xmliter`` is now deprecated in favor of + :func:`~scrapy.utils.iterators.xmliter_lxml`, which + :class:`~scrapy.spiders.XMLFeedSpider` now uses. + + To minimize the impact of this change on existing code, + :func:`~scrapy.utils.iterators.xmliter_lxml` now supports indicating + the node namespace with a prefix in the node name, and big files with + highly nested trees. + + - Fixed regular expressions in the implementation of the + :func:`~scrapy.utils.response.open_in_browser` function. + + Please, see the `cc65-xxvf-f7r9 security advisory`_ for more information. + + .. _ReDoS vulnerabilities: https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS .. _cc65-xxvf-f7r9 security advisory: https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9 @@ -2892,8 +2903,15 @@ Scrapy 1.8.4 (unreleased) **Security bug fix:** -- Fixed regular expressions susceptible to a `ReDoS attack`_ affecting the - ``iternodes`` node iterator of :class:`~scrapy.spiders.XMLFeedSpider`. +- Due to its `ReDoS vulnerabilities`_, ``scrapy.utils.iterators.xmliter`` is + now deprecated in favor of :func:`~scrapy.utils.iterators.xmliter_lxml`, + which :class:`~scrapy.spiders.XMLFeedSpider` now uses. + + To minimize the impact of this change on existing code, + :func:`~scrapy.utils.iterators.xmliter_lxml` now supports indicating + the node namespace as a prefix in the node name, and big files with highly + nested trees when using lxml 4.2 or later. + Please, see the `cc65-xxvf-f7r9 security advisory`_ for more information. diff --git a/scrapy/spiders/feed.py b/scrapy/spiders/feed.py index 6afadc577..42675c76a 100644 --- a/scrapy/spiders/feed.py +++ b/scrapy/spiders/feed.py @@ -7,7 +7,7 @@ See documentation in docs/topics/spiders.rst from scrapy.exceptions import NotConfigured, NotSupported from scrapy.selector import Selector from scrapy.spiders import Spider -from scrapy.utils.iterators import csviter, xmliter +from scrapy.utils.iterators import csviter, xmliter_lxml from scrapy.utils.spider import iterate_spider_output @@ -84,7 +84,7 @@ class XMLFeedSpider(Spider): return self.parse_nodes(response, nodes) def _iternodes(self, response): - for node in xmliter(response, self.itertag): + for node in xmliter_lxml(response, self.itertag): self._register_namespaces(node) yield node diff --git a/scrapy/utils/iterators.py b/scrapy/utils/iterators.py index b67029433..7574e377a 100644 --- a/scrapy/utils/iterators.py +++ b/scrapy/utils/iterators.py @@ -17,9 +17,12 @@ from typing import ( cast, overload, ) +from warnings import warn from lxml import etree +from packaging.version import Version +from scrapy.exceptions import ScrapyDeprecationWarning from scrapy.http import Response, TextResponse from scrapy.selector import Selector from scrapy.utils.python import re_rsearch, to_unicode @@ -29,6 +32,12 @@ if TYPE_CHECKING: logger = logging.getLogger(__name__) +_LXML_VERSION = Version(etree.__version__) +_LXML_HUGE_TREE_VERSION = Version("4.2") +_ITERPARSE_KWARGS = {} +if _LXML_VERSION >= _LXML_HUGE_TREE_VERSION: + _ITERPARSE_KWARGS["huge_tree"] = True + def xmliter( obj: Union[Response, str, bytes], nodename: str @@ -41,6 +50,16 @@ def xmliter( - a unicode string - a string encoded as utf-8 """ + warn( + ( + "xmliter is deprecated and its use strongly discouraged because " + "it is vulnerable to ReDoS attacks. Use xmliter_lxml instead. See " + "https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9" + ), + ScrapyDeprecationWarning, + stacklevel=2, + ) + nodename_patt = re.escape(nodename) DOCUMENT_HEADER_RE = re.compile(r"<\?xml[^>]+>\s*", re.S) @@ -87,7 +106,7 @@ def _resolve_xml_namespace(element_name: str, data: bytes) -> Tuple[str, str]: reader, encoding=reader.encoding, events=("start-ns",), - huge_tree=True, + **_ITERPARSE_KWARGS, ) for event, (_prefix, _namespace) in ns_iterator: if _prefix != node_prefix: @@ -111,7 +130,7 @@ def xmliter_lxml( reader, tag=tag, encoding=reader.encoding, - huge_tree=True, + **_ITERPARSE_KWARGS, ) selxpath = "//" + (f"{prefix}:{nodename}" if namespace else nodename) for _, node in iterable: diff --git a/tests/test_utils_iterators.py b/tests/test_utils_iterators.py index 24f03155b..505cc276c 100644 --- a/tests/test_utils_iterators.py +++ b/tests/test_utils_iterators.py @@ -1,13 +1,14 @@ +import pytest from twisted.trial import unittest +from scrapy.exceptions import ScrapyDeprecationWarning from scrapy.http import Response, TextResponse, XmlResponse from scrapy.utils.iterators import _body_or_str, csviter, xmliter, xmliter_lxml from tests import get_testdata -class XmliterTestCase(unittest.TestCase): - xmliter = staticmethod(xmliter) - +class XmliterBaseTestCase: + @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning") def test_xmliter(self): body = b""" @@ -39,6 +40,7 @@ class XmliterTestCase(unittest.TestCase): attrs, [("001", ["Name 1"], ["Type 1"]), ("002", ["Name 2"], ["Type 2"])] ) + @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning") def test_xmliter_unusual_node(self): body = b""" @@ -52,6 +54,7 @@ class XmliterTestCase(unittest.TestCase): ] self.assertEqual(nodenames, [["matchme..."]]) + @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning") def test_xmliter_unicode(self): # example taken from https://github.com/scrapy/scrapy/issues/1665 body = """ @@ -111,6 +114,7 @@ class XmliterTestCase(unittest.TestCase): [("26", ["-"], ["80"]), ("21", ["Ab"], ["76"]), ("27", ["A"], ["27"])], ) + @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning") def test_xmliter_text(self): body = ( '' @@ -122,6 +126,7 @@ class XmliterTestCase(unittest.TestCase): [["one"], ["two"]], ) + @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning") def test_xmliter_namespaces(self): body = b""" @@ -161,6 +166,7 @@ class XmliterTestCase(unittest.TestCase): self.assertEqual(node.xpath("id/text()").getall(), []) self.assertEqual(node.xpath("price/text()").getall(), []) + @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning") def test_xmliter_namespaced_nodename(self): body = b""" @@ -189,6 +195,7 @@ class XmliterTestCase(unittest.TestCase): ["http://www.mydummycompany.com/images/item1.jpg"], ) + @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning") def test_xmliter_namespaced_nodename_missing(self): body = b""" @@ -213,6 +220,7 @@ class XmliterTestCase(unittest.TestCase): with self.assertRaises(StopIteration): next(my_iter) + @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning") def test_xmliter_exception(self): body = ( '' @@ -225,10 +233,12 @@ class XmliterTestCase(unittest.TestCase): self.assertRaises(StopIteration, next, iter) + @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning") def test_xmliter_objtype_exception(self): i = self.xmliter(42, "product") self.assertRaises(TypeError, next, i) + @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning") def test_xmliter_encoding(self): body = ( b'\n' @@ -243,7 +253,24 @@ class XmliterTestCase(unittest.TestCase): ) -class LxmlXmliterTestCase(XmliterTestCase): +class XmliterTestCase(XmliterBaseTestCase, unittest.TestCase): + xmliter = staticmethod(xmliter) + + def test_deprecation(self): + body = b""" + + + + + """ + with pytest.warns( + ScrapyDeprecationWarning, + match="xmliter", + ): + next(self.xmliter(body, "product")) + + +class LxmlXmliterTestCase(XmliterBaseTestCase, unittest.TestCase): xmliter = staticmethod(xmliter_lxml) def test_xmliter_iterate_namespace(self): diff --git a/tests/test_utils_response.py b/tests/test_utils_response.py index 93b9bacaf..1dbe187bf 100644 --- a/tests/test_utils_response.py +++ b/tests/test_utils_response.py @@ -203,40 +203,38 @@ class ResponseUtilsTest(unittest.TestCase): r5, _openfunc=check_base_url ), "Inject unique base url with conditional comment" + @pytest.mark.slow + def test_open_in_browser_redos_comment(self): + MAX_CPU_TIME = 30 -@pytest.mark.slow -def test_open_in_browser_redos_comment(): - MAX_CPU_TIME = 30 + # Exploit input from + # https://makenowjust-labs.github.io/recheck/playground/ + # for // (old pattern to remove comments). + body = b"->" - # Exploit input from - # https://makenowjust-labs.github.io/recheck/playground/ - # for // (old pattern to remove comments). - body = b"->" + response = HtmlResponse("https://example.com", body=body) - response = HtmlResponse("https://example.com", body=body) + start_time = process_time() - start_time = process_time() + open_in_browser(response, lambda url: True) - open_in_browser(response, lambda url: True) + end_time = process_time() + self.assertLess(end_time - start_time, MAX_CPU_TIME) - end_time = process_time() - assert (end_time - start_time) < MAX_CPU_TIME + @pytest.mark.slow + def test_open_in_browser_redos_head(self): + MAX_CPU_TIME = 15 + # Exploit input from + # https://makenowjust-labs.github.io/recheck/playground/ + # for /(|\s.*?>))/ (old pattern to find the head element). + body = b"|\s.*?>))/ (old pattern to find the head element). - body = b"