mirror of https://github.com/scrapy/scrapy.git synced 2025-03-15 02:10:36 +00:00

Deprecate xmliter in favor of xmliter_lxml

Adrián Chaves 2023-12-15 11:42:55 +01:00
parent 2538c0e862
commit 150d96764b
6 changed files with 110 additions and 44 deletions

docs/faq.rst

@@ -297,9 +297,13 @@ build the DOM of the entire feed in memory, and this can be quite slow and
 consume a lot of memory.
 
 In order to avoid parsing the entire feed at once in memory, you can use
-the functions ``xmliter`` and ``csviter`` from ``scrapy.utils.iterators``
-module. In fact, this is what the feed spiders (see :ref:`topics-spiders`) use
-under the cover.
+the :func:`~scrapy.utils.iterators.xmliter_lxml` and
+:func:`~scrapy.utils.iterators.csviter` functions. In fact, this is what
+:class:`~scrapy.spiders.XMLFeedSpider` uses.
+
+.. autofunction:: scrapy.utils.iterators.xmliter_lxml
+
+.. autofunction:: scrapy.utils.iterators.csviter
 
 Does Scrapy manage cookies automatically?
 -----------------------------------------
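
For reference, a minimal usage sketch of the streaming iterator (the feed body
is illustrative; the call signature matches the tests in this commit):

    from scrapy.utils.iterators import xmliter_lxml

    body = b"""<?xml version="1.0" encoding="UTF-8"?>
    <products>
        <product id="001"><name>Name 1</name></product>
        <product id="002"><name>Name 2</name></product>
    </products>"""

    # Each iteration yields a Selector scoped to one <product> node, so the
    # whole document is never built as a single DOM in memory.
    for product in xmliter_lxml(body, "product"):
        print(product.attrib["id"], product.xpath("name/text()").get())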

docs/news.rst

@@ -10,12 +10,23 @@ Scrapy 2.11.1 (unreleased)
 
 **Security bug fix:**
 
-- Fixed regular expressions susceptible to a `ReDoS attack`_ affecting the
-  ``iternodes`` node iterator of :class:`~scrapy.spiders.XMLFeedSpider` and
-  the :func:`~scrapy.utils.response.open_in_browser` function. Please, see
-  the `cc65-xxvf-f7r9 security advisory`_ for more information.
+- Addressed `ReDoS vulnerabilities`_:
+
+  - ``scrapy.utils.iterators.xmliter`` is now deprecated in favor of
+    :func:`~scrapy.utils.iterators.xmliter_lxml`, which
+    :class:`~scrapy.spiders.XMLFeedSpider` now uses.
+
+    To minimize the impact of this change on existing code,
+    :func:`~scrapy.utils.iterators.xmliter_lxml` now supports indicating
+    the node namespace with a prefix in the node name, and big files with
+    highly nested trees.
+
+  - Fixed regular expressions in the implementation of the
+    :func:`~scrapy.utils.response.open_in_browser` function.
+
+  Please, see the `cc65-xxvf-f7r9 security advisory`_ for more information.
 
-.. _ReDoS attack: https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
+.. _ReDoS vulnerabilities: https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
 .. _cc65-xxvf-f7r9 security advisory: https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9
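
The namespace-handling change called out above means a prefixed node name is
enough; the prefix is resolved against the xmlns declarations of the document
itself. A sketch distilled from this commit's test_xmliter_namespaced_nodename:

    from scrapy.utils.iterators import xmliter_lxml

    body = b"""<?xml version="1.0" encoding="UTF-8"?>
    <rss version="2.0" xmlns:g="http://base.google.com/ns/1.0">
        <channel>
            <item>
                <g:image_link>http://www.mydummycompany.com/images/item1.jpg</g:image_link>
            </item>
        </channel>
    </rss>"""

    # No separate namespace argument: "g" is looked up in the document.
    node = next(xmliter_lxml(body, "g:image_link"))
    print(node.xpath("text()").get())
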
@@ -2892,8 +2903,15 @@ Scrapy 1.8.4 (unreleased)
 
 **Security bug fix:**
 
-- Fixed regular expressions susceptible to a `ReDoS attack`_ affecting the
-  ``iternodes`` node iterator of :class:`~scrapy.spiders.XMLFeedSpider`.
+- Due to its `ReDoS vulnerabilities`_, ``scrapy.utils.iterators.xmliter`` is
+  now deprecated in favor of :func:`~scrapy.utils.iterators.xmliter_lxml`,
+  which :class:`~scrapy.spiders.XMLFeedSpider` now uses.
+
+  To minimize the impact of this change on existing code,
+  :func:`~scrapy.utils.iterators.xmliter_lxml` now supports indicating
+  the node namespace as a prefix in the node name, and big files with highly
+  nested trees when using lxml 4.2 or later.
+
   Please, see the `cc65-xxvf-f7r9 security advisory`_ for more information.

scrapy/spiders/feed.py

@@ -7,7 +7,7 @@ See documentation in docs/topics/spiders.rst
 
 from scrapy.exceptions import NotConfigured, NotSupported
 from scrapy.selector import Selector
 from scrapy.spiders import Spider
-from scrapy.utils.iterators import csviter, xmliter
+from scrapy.utils.iterators import csviter, xmliter_lxml
 from scrapy.utils.spider import iterate_spider_output
@@ -84,7 +84,7 @@ class XMLFeedSpider(Spider):
         return self.parse_nodes(response, nodes)
 
     def _iternodes(self, response):
-        for node in xmliter(response, self.itertag):
+        for node in xmliter_lxml(response, self.itertag):
             self._register_namespaces(node)
             yield node
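
Spider-facing code is unchanged by this swap; the default iternodes iterator
is simply backed by xmliter_lxml now. A minimal sketch of the unchanged
XMLFeedSpider API (spider name and URL are illustrative):

    from scrapy.spiders import XMLFeedSpider

    class ProductSpider(XMLFeedSpider):
        name = "products"  # hypothetical example spider
        start_urls = ["https://example.com/feed.xml"]
        iterator = "iternodes"  # the default, now backed by xmliter_lxml
        itertag = "product"  # iterate over <product> nodes

        def parse_node(self, response, node):
            # node is a Selector scoped to one <product> element
            yield {"name": node.xpath("name/text()").get()}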

scrapy/utils/iterators.py

@@ -17,9 +17,12 @@ from typing import (
     cast,
     overload,
 )
+from warnings import warn
 
 from lxml import etree
+from packaging.version import Version
 
+from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.http import Response, TextResponse
 from scrapy.selector import Selector
 from scrapy.utils.python import re_rsearch, to_unicode
@@ -29,6 +32,12 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
+_LXML_VERSION = Version(etree.__version__)
+_LXML_HUGE_TREE_VERSION = Version("4.2")
+_ITERPARSE_KWARGS = {}
+if _LXML_VERSION >= _LXML_HUGE_TREE_VERSION:
+    _ITERPARSE_KWARGS["huge_tree"] = True
+
 
 def xmliter(
     obj: Union[Response, str, bytes], nodename: str
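
The block above is a version gate: huge_tree, which lifts libxml2's depth and
size limits, is only forwarded on lxml 4.2 or later, where etree.iterparse()
accepts it. Restated in isolation (a sketch; feed.xml is a placeholder path):

    from lxml import etree
    from packaging.version import Version

    kwargs = {}
    if Version(etree.__version__) >= Version("4.2"):
        kwargs["huge_tree"] = True  # unknown keyword on older lxml

    for _, node in etree.iterparse("feed.xml", tag="product", **kwargs):
        ...  # process one <product> element at a time
        node.clear()  # free the parsed subtree as you go
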
@@ -41,6 +50,16 @@ def xmliter(
     - a unicode string
     - a string encoded as utf-8
     """
+    warn(
+        (
+            "xmliter is deprecated and its use strongly discouraged because "
+            "it is vulnerable to ReDoS attacks. Use xmliter_lxml instead. See "
+            "https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9"
+        ),
+        ScrapyDeprecationWarning,
+        stacklevel=2,
+    )
+
     nodename_patt = re.escape(nodename)
 
     DOCUMENT_HEADER_RE = re.compile(r"<\?xml[^>]+>\s*", re.S)
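
Downstream callers of xmliter now get this warning on first iteration, since
xmliter is a generator and the warn() call runs on the first next(). The real
fix is switching to xmliter_lxml; as a stopgap, the warning can be silenced
explicitly (a sketch; the body is illustrative, and the ReDoS risk remains):

    import warnings

    from scrapy.exceptions import ScrapyDeprecationWarning
    from scrapy.utils.iterators import xmliter

    body = b'<?xml version="1.0" encoding="UTF-8"?><products><product>one</product></products>'

    with warnings.catch_warnings():
        # Hides the deprecation warning; does not make xmliter safe.
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        for node in xmliter(body, "product"):
            print(node.xpath("text()").get())
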
@@ -87,7 +106,7 @@ def _resolve_xml_namespace(element_name: str, data: bytes) -> Tuple[str, str]:
         reader,
         encoding=reader.encoding,
         events=("start-ns",),
-        huge_tree=True,
+        **_ITERPARSE_KWARGS,
     )
     for event, (_prefix, _namespace) in ns_iterator:
         if _prefix != node_prefix:
@@ -111,7 +130,7 @@ def xmliter_lxml(
         reader,
         tag=tag,
         encoding=reader.encoding,
-        huge_tree=True,
+        **_ITERPARSE_KWARGS,
     )
     selxpath = "//" + (f"{prefix}:{nodename}" if namespace else nodename)
     for _, node in iterable:

tests/test_utils_iterators.py

@@ -1,13 +1,14 @@
+import pytest
 from twisted.trial import unittest
 
+from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.http import Response, TextResponse, XmlResponse
 from scrapy.utils.iterators import _body_or_str, csviter, xmliter, xmliter_lxml
 from tests import get_testdata
 
 
-class XmliterTestCase(unittest.TestCase):
-    xmliter = staticmethod(xmliter)
-
+class XmliterBaseTestCase:
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter(self):
         body = b"""
             <?xml version="1.0" encoding="UTF-8"?>
@@ -39,6 +40,7 @@ class XmliterTestCase(unittest.TestCase):
             attrs, [("001", ["Name 1"], ["Type 1"]), ("002", ["Name 2"], ["Type 2"])]
         )
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_unusual_node(self):
         body = b"""<?xml version="1.0" encoding="UTF-8"?>
             <root>
@@ -52,6 +54,7 @@ class XmliterTestCase(unittest.TestCase):
         ]
         self.assertEqual(nodenames, [["matchme..."]])
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_unicode(self):
         # example taken from https://github.com/scrapy/scrapy/issues/1665
         body = """<?xml version="1.0" encoding="UTF-8"?>
@@ -111,6 +114,7 @@ class XmliterTestCase(unittest.TestCase):
             [("26", ["-"], ["80"]), ("21", ["Ab"], ["76"]), ("27", ["A"], ["27"])],
         )
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_text(self):
         body = (
             '<?xml version="1.0" encoding="UTF-8"?>'
@@ -122,6 +126,7 @@ class XmliterTestCase(unittest.TestCase):
             [["one"], ["two"]],
         )
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_namespaces(self):
         body = b"""
             <?xml version="1.0" encoding="UTF-8"?>
@@ -161,6 +166,7 @@ class XmliterTestCase(unittest.TestCase):
         self.assertEqual(node.xpath("id/text()").getall(), [])
         self.assertEqual(node.xpath("price/text()").getall(), [])
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_namespaced_nodename(self):
         body = b"""
             <?xml version="1.0" encoding="UTF-8"?>
@@ -189,6 +195,7 @@ class XmliterTestCase(unittest.TestCase):
             ["http://www.mydummycompany.com/images/item1.jpg"],
         )
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
    def test_xmliter_namespaced_nodename_missing(self):
         body = b"""
             <?xml version="1.0" encoding="UTF-8"?>
@@ -213,6 +220,7 @@ class XmliterTestCase(unittest.TestCase):
         with self.assertRaises(StopIteration):
             next(my_iter)
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_exception(self):
         body = (
             '<?xml version="1.0" encoding="UTF-8"?>'
@@ -225,10 +233,12 @@ class XmliterTestCase(unittest.TestCase):
 
         self.assertRaises(StopIteration, next, iter)
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_objtype_exception(self):
         i = self.xmliter(42, "product")
         self.assertRaises(TypeError, next, i)
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_encoding(self):
         body = (
             b'<?xml version="1.0" encoding="ISO-8859-9"?>\n'
@@ -243,7 +253,24 @@ class XmliterTestCase(unittest.TestCase):
         )
 
 
-class LxmlXmliterTestCase(XmliterTestCase):
+class XmliterTestCase(XmliterBaseTestCase, unittest.TestCase):
+    xmliter = staticmethod(xmliter)
+
+    def test_deprecation(self):
+        body = b"""
+            <?xml version="1.0" encoding="UTF-8"?>
+            <products>
+                <product></product>
+            </products>
+        """
+        with pytest.warns(
+            ScrapyDeprecationWarning,
+            match="xmliter",
+        ):
+            next(self.xmliter(body, "product"))
+
+
+class LxmlXmliterTestCase(XmliterBaseTestCase, unittest.TestCase):
     xmliter = staticmethod(xmliter_lxml)
 
     def test_xmliter_iterate_namespace(self):

tests/test_utils_response.py

@@ -203,40 +203,38 @@ class ResponseUtilsTest(unittest.TestCase):
             r5, _openfunc=check_base_url
         ), "Inject unique base url with conditional comment"
 
-
-@pytest.mark.slow
-def test_open_in_browser_redos_comment():
-    MAX_CPU_TIME = 30
-
-    # Exploit input from
-    # https://makenowjust-labs.github.io/recheck/playground/
-    # for /<!--.*?-->/ (old pattern to remove comments).
-    body = b"-><!--\x00" * (int(DOWNLOAD_MAXSIZE / 7) - 10) + b"->\n<!---->"
-    response = HtmlResponse("https://example.com", body=body)
-
-    start_time = process_time()
-    open_in_browser(response, lambda url: True)
-    end_time = process_time()
-    assert (end_time - start_time) < MAX_CPU_TIME
-
-
-@pytest.mark.slow
-def test_open_in_browser_redos_head():
-    MAX_CPU_TIME = 15
-
-    # Exploit input from
-    # https://makenowjust-labs.github.io/recheck/playground/
-    # for /(<head(?:>|\s.*?>))/ (old pattern to find the head element).
-    body = b"<head\t" * int(DOWNLOAD_MAXSIZE / 6)
-    response = HtmlResponse("https://example.com", body=body)
-
-    start_time = process_time()
-    open_in_browser(response, lambda url: True)
-    end_time = process_time()
-    assert (end_time - start_time) < MAX_CPU_TIME
+    @pytest.mark.slow
+    def test_open_in_browser_redos_comment(self):
+        MAX_CPU_TIME = 30
+
+        # Exploit input from
+        # https://makenowjust-labs.github.io/recheck/playground/
+        # for /<!--.*?-->/ (old pattern to remove comments).
+        body = b"-><!--\x00" * (int(DOWNLOAD_MAXSIZE / 7) - 10) + b"->\n<!---->"
+        response = HtmlResponse("https://example.com", body=body)
+
+        start_time = process_time()
+        open_in_browser(response, lambda url: True)
+        end_time = process_time()
+        self.assertLess(end_time - start_time, MAX_CPU_TIME)
+
+    @pytest.mark.slow
+    def test_open_in_browser_redos_head(self):
+        MAX_CPU_TIME = 15
+
+        # Exploit input from
+        # https://makenowjust-labs.github.io/recheck/playground/
+        # for /(<head(?:>|\s.*?>))/ (old pattern to find the head element).
+        body = b"<head\t" * int(DOWNLOAD_MAXSIZE / 6)
+        response = HtmlResponse("https://example.com", body=body)
+
+        start_time = process_time()
+        open_in_browser(response, lambda url: True)
+        end_time = process_time()
+        self.assertLess(end_time - start_time, MAX_CPU_TIME)
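
For intuition about what these regression tests guard against: assuming the
old comment pattern was applied without re.DOTALL (the newline in the exploit
body suggests so), every "<!--" opener in the prefix forces the lazy .*? to
scan ahead to the newline and fail, so total work grows quadratically with
the repeat count. A standalone sketch, not Scrapy code:

    import re
    from time import process_time

    OLD_COMMENT_RE = re.compile(rb"<!--.*?-->")  # old pattern, per the test comment

    for repeats in (2_000, 4_000, 8_000):
        # Many unterminated "<!--" openers, one real comment after a newline.
        body = b"-><!--\x00" * repeats + b"->\n<!---->"
        start = process_time()
        OLD_COMMENT_RE.sub(b"", body)
        # CPU time roughly quadruples each time the input doubles.
        print(repeats, round(process_time() - start, 2))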