
Deprecate xmliter in favor of xmliter_lxml

Adrián Chaves 2023-12-15 11:42:55 +01:00
parent 2538c0e862
commit 150d96764b
6 changed files with 110 additions and 44 deletions

docs/faq.rst

@@ -297,9 +297,13 @@ build the DOM of the entire feed in memory, and this can be quite slow and
consume a lot of memory.
In order to avoid parsing the entire feed at once in memory, you can use
the functions ``xmliter`` and ``csviter`` from ``scrapy.utils.iterators``
module. In fact, this is what the feed spiders (see :ref:`topics-spiders`) use
under the cover.
the :func:`~scrapy.utils.iterators.xmliter_lxml` and
:func:`~scrapy.utils.iterators.csviter` functions. In fact, this is what
:class:`~scrapy.spiders.XMLFeedSpider` uses.
.. autofunction:: scrapy.utils.iterators.xmliter_lxml
.. autofunction:: scrapy.utils.iterators.csviter
Does Scrapy manage cookies automatically?
-----------------------------------------
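For context, the pattern the updated FAQ text recommends looks like this in practice; a minimal sketch (the feed body, URL, and ``product`` tag are invented for illustration)::

    from scrapy.http import XmlResponse
    from scrapy.utils.iterators import xmliter_lxml

    # A tiny stand-in feed; in practice this would be a large downloaded response.
    body = b"""<?xml version="1.0" encoding="UTF-8"?>
    <products>
        <product><id>001</id></product>
        <product><id>002</id></product>
    </products>"""
    response = XmlResponse("https://example.com/feed.xml", body=body)

    # Nodes are yielded one at a time as Selectors, so the whole DOM is
    # never built in memory.
    for node in xmliter_lxml(response, "product"):
        print(node.xpath("id/text()").get())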

docs/news.rst

@@ -10,12 +10,23 @@ Scrapy 2.11.1 (unreleased)
**Security bug fix:**
- Fixed regular expressions susceptible to a `ReDoS attack`_ affecting the
``iternodes`` node iterator of :class:`~scrapy.spiders.XMLFeedSpider` and
the :func:`~scrapy.utils.response.open_in_browser` function. Please see
the `cc65-xxvf-f7r9 security advisory`_ for more information.
- Addressed `ReDoS vulnerabilities`_:
.. _ReDoS attack: https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
- ``scrapy.utils.iterators.xmliter`` is now deprecated in favor of
:func:`~scrapy.utils.iterators.xmliter_lxml`, which
:class:`~scrapy.spiders.XMLFeedSpider` now uses.
To minimize the impact of this change on existing code,
:func:`~scrapy.utils.iterators.xmliter_lxml` now supports indicating
the node namespace with a prefix in the node name, and big files with
highly nested trees.
- Fixed regular expressions in the implementation of the
:func:`~scrapy.utils.response.open_in_browser` function.
Please see the `cc65-xxvf-f7r9 security advisory`_ for more information.
.. _ReDoS vulnerabilities: https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
.. _cc65-xxvf-f7r9 security advisory: https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9
@@ -2892,8 +2903,15 @@ Scrapy 1.8.4 (unreleased)
**Security bug fix:**
- Fixed regular expressions susceptible to a `ReDoS attack`_ affecting the
``iternodes`` node iterator of :class:`~scrapy.spiders.XMLFeedSpider`.
- Due to its `ReDoS vulnerabilities`_, ``scrapy.utils.iterators.xmliter`` is
now deprecated in favor of :func:`~scrapy.utils.iterators.xmliter_lxml`,
which :class:`~scrapy.spiders.XMLFeedSpider` now uses.
To minimize the impact of this change on existing code,
:func:`~scrapy.utils.iterators.xmliter_lxml` now supports indicating
the node namespace as a prefix in the node name, and big files with highly
nested trees when using lxml 4.2 or later.
Please see the `cc65-xxvf-f7r9 security advisory`_ for more information.
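To illustrate the compatibility note above, a sketch of both ways :func:`~scrapy.utils.iterators.xmliter_lxml` can address namespaced nodes (the feed content here is invented)::

    from scrapy.utils.iterators import xmliter_lxml

    body = b"""<?xml version="1.0" encoding="UTF-8"?>
    <rss xmlns:g="http://base.google.com/ns/1.0">
        <item><g:image_link>http://example.com/a.jpg</g:image_link></item>
    </rss>"""

    # Passing the namespace URI explicitly, as before:
    for node in xmliter_lxml(body, "image_link", "http://base.google.com/ns/1.0"):
        print(node.xpath("text()").get())

    # New: the namespace may instead be indicated with a prefix in the node
    # name, matching how deprecated xmliter calls were typically written.
    for node in xmliter_lxml(body, "g:image_link"):
        print(node.xpath("text()").get())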

scrapy/spiders/feed.py

@@ -7,7 +7,7 @@ See documentation in docs/topics/spiders.rst
from scrapy.exceptions import NotConfigured, NotSupported
from scrapy.selector import Selector
from scrapy.spiders import Spider
from scrapy.utils.iterators import csviter, xmliter
from scrapy.utils.iterators import csviter, xmliter_lxml
from scrapy.utils.spider import iterate_spider_output
@@ -84,7 +84,7 @@ class XMLFeedSpider(Spider):
return self.parse_nodes(response, nodes)
def _iternodes(self, response):
for node in xmliter(response, self.itertag):
for node in xmliter_lxml(response, self.itertag):
self._register_namespaces(node)
yield node
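Downstream spiders need no changes: with the default ``iternodes`` iterator, :class:`~scrapy.spiders.XMLFeedSpider` now iterates nodes through ``xmliter_lxml`` transparently. A minimal sketch (the spider name, URL, and tags are hypothetical)::

    from scrapy.spiders import XMLFeedSpider

    class ProductFeedSpider(XMLFeedSpider):
        name = "product_feed"
        start_urls = ["https://example.com/feed.xml"]
        iterator = "iternodes"  # the default; now backed by xmliter_lxml
        itertag = "product"

        def parse_node(self, response, node):
            # node is a Selector for a single <product> element.
            yield {"id": node.xpath("id/text()").get()}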

scrapy/utils/iterators.py

@@ -17,9 +17,12 @@ from typing import (
cast,
overload,
)
from warnings import warn
from lxml import etree
from packaging.version import Version
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Response, TextResponse
from scrapy.selector import Selector
from scrapy.utils.python import re_rsearch, to_unicode
@@ -29,6 +32,12 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
_LXML_VERSION = Version(etree.__version__)
_LXML_HUGE_TREE_VERSION = Version("4.2")
_ITERPARSE_KWARGS = {}
if _LXML_VERSION >= _LXML_HUGE_TREE_VERSION:
_ITERPARSE_KWARGS["huge_tree"] = True
def xmliter(
obj: Union[Response, str, bytes], nodename: str
@@ -41,6 +50,16 @@ def xmliter(
- a unicode string
- a string encoded as utf-8
"""
warn(
(
"xmliter is deprecated and its use strongly discouraged because "
"it is vulnerable to ReDoS attacks. Use xmliter_lxml instead. See "
"https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9"
),
ScrapyDeprecationWarning,
stacklevel=2,
)
nodename_patt = re.escape(nodename)
DOCUMENT_HEADER_RE = re.compile(r"<\?xml[^>]+>\s*", re.S)
@@ -87,7 +106,7 @@ def _resolve_xml_namespace(element_name: str, data: bytes) -> Tuple[str, str]:
reader,
encoding=reader.encoding,
events=("start-ns",),
huge_tree=True,
**_ITERPARSE_KWARGS,
)
for event, (_prefix, _namespace) in ns_iterator:
if _prefix != node_prefix:
@@ -111,7 +130,7 @@ def xmliter_lxml(
reader,
tag=tag,
encoding=reader.encoding,
huge_tree=True,
**_ITERPARSE_KWARGS,
)
selxpath = "//" + (f"{prefix}:{nodename}" if namespace else nodename)
for _, node in iterable:
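Because ``xmliter`` is a generator, the deprecation warning added above fires on the first iteration rather than at call time. One way to hunt down remaining call sites during migration is to promote the warning to an error; a sketch (the sample body is made up)::

    import warnings

    from scrapy.exceptions import ScrapyDeprecationWarning
    from scrapy.utils.iterators import xmliter

    body = b'<?xml version="1.0" encoding="UTF-8"?><products><product/></products>'

    with warnings.catch_warnings():
        # Turn the deprecation warning into a hard error.
        warnings.simplefilter("error", ScrapyDeprecationWarning)
        try:
            next(xmliter(body, "product"))
        except ScrapyDeprecationWarning as warning:
            print(f"deprecated call site found: {warning}")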

tests/test_utils_iterators.py

@@ -1,13 +1,14 @@
import pytest
from twisted.trial import unittest
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Response, TextResponse, XmlResponse
from scrapy.utils.iterators import _body_or_str, csviter, xmliter, xmliter_lxml
from tests import get_testdata
class XmliterTestCase(unittest.TestCase):
xmliter = staticmethod(xmliter)
class XmliterBaseTestCase:
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter(self):
body = b"""
<?xml version="1.0" encoding="UTF-8"?>
@@ -39,6 +40,7 @@ class XmliterTestCase(unittest.TestCase):
attrs, [("001", ["Name 1"], ["Type 1"]), ("002", ["Name 2"], ["Type 2"])]
)
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_unusual_node(self):
body = b"""<?xml version="1.0" encoding="UTF-8"?>
<root>
@@ -52,6 +54,7 @@ class XmliterTestCase(unittest.TestCase):
]
self.assertEqual(nodenames, [["matchme..."]])
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_unicode(self):
# example taken from https://github.com/scrapy/scrapy/issues/1665
body = """<?xml version="1.0" encoding="UTF-8"?>
@@ -111,6 +114,7 @@ class XmliterTestCase(unittest.TestCase):
[("26", ["-"], ["80"]), ("21", ["Ab"], ["76"]), ("27", ["A"], ["27"])],
)
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_text(self):
body = (
'<?xml version="1.0" encoding="UTF-8"?>'
@@ -122,6 +126,7 @@ class XmliterTestCase(unittest.TestCase):
[["one"], ["two"]],
)
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_namespaces(self):
body = b"""
<?xml version="1.0" encoding="UTF-8"?>
@@ -161,6 +166,7 @@ class XmliterTestCase(unittest.TestCase):
self.assertEqual(node.xpath("id/text()").getall(), [])
self.assertEqual(node.xpath("price/text()").getall(), [])
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_namespaced_nodename(self):
body = b"""
<?xml version="1.0" encoding="UTF-8"?>
@@ -189,6 +195,7 @@ class XmliterTestCase(unittest.TestCase):
["http://www.mydummycompany.com/images/item1.jpg"],
)
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_namespaced_nodename_missing(self):
body = b"""
<?xml version="1.0" encoding="UTF-8"?>
@@ -213,6 +220,7 @@ class XmliterTestCase(unittest.TestCase):
with self.assertRaises(StopIteration):
next(my_iter)
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_exception(self):
body = (
'<?xml version="1.0" encoding="UTF-8"?>'
@@ -225,10 +233,12 @@ class XmliterTestCase(unittest.TestCase):
self.assertRaises(StopIteration, next, iter)
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_objtype_exception(self):
i = self.xmliter(42, "product")
self.assertRaises(TypeError, next, i)
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_encoding(self):
body = (
b'<?xml version="1.0" encoding="ISO-8859-9"?>\n'
@@ -243,7 +253,24 @@
)
class LxmlXmliterTestCase(XmliterTestCase):
class XmliterTestCase(XmliterBaseTestCase, unittest.TestCase):
xmliter = staticmethod(xmliter)
def test_deprecation(self):
body = b"""
<?xml version="1.0" encoding="UTF-8"?>
<products>
<product></product>
</products>
"""
with pytest.warns(
ScrapyDeprecationWarning,
match="xmliter",
):
next(self.xmliter(body, "product"))
class LxmlXmliterTestCase(XmliterBaseTestCase, unittest.TestCase):
xmliter = staticmethod(xmliter_lxml)
def test_xmliter_iterate_namespace(self):
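The test refactoring above follows the usual mixin pattern for running one suite against several implementations: shared tests live in a plain base class, and each concrete ``unittest.TestCase`` pins the callable under test. A standalone sketch of the idea::

    import unittest

    class _UpperBase:
        # Concrete subclasses assign the callable under test; the base class
        # is not itself a TestCase, so it is never collected on its own.
        func = staticmethod(lambda s: s)

        def test_basic(self):
            assert self.func("abc") == "ABC"

    class StrUpperTestCase(_UpperBase, unittest.TestCase):
        func = staticmethod(str.upper)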

tests/test_utils_response.py

@@ -203,40 +203,38 @@ class ResponseUtilsTest(unittest.TestCase):
r5, _openfunc=check_base_url
), "Inject unique base url with conditional comment"
@pytest.mark.slow
def test_open_in_browser_redos_comment(self):
MAX_CPU_TIME = 30
# Exploit input from
# https://makenowjust-labs.github.io/recheck/playground/
# for /<!--.*?-->/ (old pattern to remove comments).
body = b"-><!--\x00" * (int(DOWNLOAD_MAXSIZE / 7) - 10) + b"->\n<!---->"
response = HtmlResponse("https://example.com", body=body)
start_time = process_time()
open_in_browser(response, lambda url: True)
end_time = process_time()
self.assertLess(end_time - start_time, MAX_CPU_TIME)
@pytest.mark.slow
def test_open_in_browser_redos_head(self):
MAX_CPU_TIME = 15
# Exploit input from
# https://makenowjust-labs.github.io/recheck/playground/
# for /(<head(?:>|\s.*?>))/ (old pattern to find the head element).
body = b"<head\t" * int(DOWNLOAD_MAXSIZE / 6)
response = HtmlResponse("https://example.com", body=body)
start_time = process_time()
open_in_browser(response, lambda url: True)
end_time = process_time()
self.assertLess(end_time - start_time, MAX_CPU_TIME)
@pytest.mark.slow
def test_open_in_browser_redos_comment():
MAX_CPU_TIME = 30
# Exploit input from
# https://makenowjust-labs.github.io/recheck/playground/
# for /<!--.*?-->/ (old pattern to remove comments).
body = b"-><!--\x00" * (int(DOWNLOAD_MAXSIZE / 7) - 10) + b"->\n<!---->"
response = HtmlResponse("https://example.com", body=body)
start_time = process_time()
open_in_browser(response, lambda url: True)
end_time = process_time()
assert (end_time - start_time) < MAX_CPU_TIME
@pytest.mark.slow
def test_open_in_browser_redos_head():
MAX_CPU_TIME = 15
# Exploit input from
# https://makenowjust-labs.github.io/recheck/playground/
# for /(<head(?:>|\s.*?>))/ (old pattern to find the head element).
body = b"<head\t" * int(DOWNLOAD_MAXSIZE / 6)
response = HtmlResponse("https://example.com", body=body)
start_time = process_time()
open_in_browser(response, lambda url: True)
end_time = process_time()
assert (end_time - start_time) < MAX_CPU_TIME