Deprecate xmliter in favor of xmliter_lxml
parent 2538c0e862
commit 150d96764b

docs/faq.rst
@@ -297,9 +297,13 @@ build the DOM of the entire feed in memory, and this can be quite slow and
 consume a lot of memory.
 
 In order to avoid parsing all the entire feed at once in memory, you can use
-the functions ``xmliter`` and ``csviter`` from ``scrapy.utils.iterators``
-module. In fact, this is what the feed spiders (see :ref:`topics-spiders`) use
-under the cover.
+the :func:`~scrapy.utils.iterators.xmliter_lxml` and
+:func:`~scrapy.utils.iterators.csviter` functions. In fact, this is what
+:class:`~scrapy.spiders.XMLFeedSpider` uses.
+
+.. autofunction:: scrapy.utils.iterators.xmliter_lxml
+
+.. autofunction:: scrapy.utils.iterators.csviter
 
 Does Scrapy manage cookies automatically?
 -----------------------------------------
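For readers landing here from the FAQ change, a minimal sketch of the pattern the new text recommends; the sample body and the ``product`` node name are illustrative assumptions, not part of this commit:

    from scrapy.utils.iterators import xmliter_lxml

    body = b"""<?xml version="1.0" encoding="UTF-8"?>
    <products>
        <product><id>001</id></product>
        <product><id>002</id></product>
    </products>"""

    # Each iteration yields one <product> node as a Selector, so the
    # whole feed is never built as a single DOM in memory.
    for node in xmliter_lxml(body, "product"):
        print(node.xpath("id/text()").get())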
docs/news.rst
@@ -10,12 +10,23 @@ Scrapy 2.11.1 (unreleased)
 
 **Security bug fix:**
 
-- Fixed regular expressions susceptible to a `ReDoS attack`_ affecting the
-  ``iternodes`` node iterator of :class:`~scrapy.spiders.XMLFeedSpider` and
-  the :func:`~scrapy.utils.response.open_in_browser` function. Please, see
-  the `cc65-xxvf-f7r9 security advisory`_ for more information.
+- Addressed `ReDoS vulnerabilities`_:
 
-  .. _ReDoS attack: https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
+  - ``scrapy.utils.iterators.xmliter`` is now deprecated in favor of
+    :func:`~scrapy.utils.iterators.xmliter_lxml`, which
+    :class:`~scrapy.spiders.XMLFeedSpider` now uses.
+
+    To minimize the impact of this change on existing code,
+    :func:`~scrapy.utils.iterators.xmliter_lxml` now supports indicating
+    the node namespace with a prefix in the node name, and big files with
+    highly nested trees.
+
+  - Fixed regular expressions in the implementation of the
+    :func:`~scrapy.utils.response.open_in_browser` function.
+
+  Please, see the `cc65-xxvf-f7r9 security advisory`_ for more information.
+
+  .. _ReDoS vulnerabilities: https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
 .. _cc65-xxvf-f7r9 security advisory: https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9
 
 
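Migration from the deprecated helper is mechanical, as the note above implies; a before/after sketch (the inline body and ``item`` node name are assumed for illustration):

    from scrapy.utils.iterators import xmliter, xmliter_lxml

    body = b"<items><item>1</item><item>2</item></items>"

    # Before: emits ScrapyDeprecationWarning once iteration starts.
    nodes = list(xmliter(body, "item"))

    # After: same positional arguments, lxml-backed parsing.
    nodes = list(xmliter_lxml(body, "item"))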
@@ -2892,8 +2903,15 @@ Scrapy 1.8.4 (unreleased)
 
 **Security bug fix:**
 
-- Fixed regular expressions susceptible to a `ReDoS attack`_ affecting the
-  ``iternodes`` node iterator of :class:`~scrapy.spiders.XMLFeedSpider`.
+- Due to its `ReDoS vulnerabilities`_, ``scrapy.utils.iterators.xmliter`` is
+  now deprecated in favor of :func:`~scrapy.utils.iterators.xmliter_lxml`,
+  which :class:`~scrapy.spiders.XMLFeedSpider` now uses.
+
+  To minimize the impact of this change on existing code,
+  :func:`~scrapy.utils.iterators.xmliter_lxml` now supports indicating
+  the node namespace as a prefix in the node name, and big files with highly
+  nested trees when using lxml 4.2 or later.
+
+  Please, see the `cc65-xxvf-f7r9 security advisory`_ for more information.
 
 
scrapy/spiders/feed.py
@@ -7,7 +7,7 @@ See documentation in docs/topics/spiders.rst
 from scrapy.exceptions import NotConfigured, NotSupported
 from scrapy.selector import Selector
 from scrapy.spiders import Spider
-from scrapy.utils.iterators import csviter, xmliter
+from scrapy.utils.iterators import csviter, xmliter_lxml
 from scrapy.utils.spider import iterate_spider_output
 
 
@@ -84,7 +84,7 @@ class XMLFeedSpider(Spider):
         return self.parse_nodes(response, nodes)
 
     def _iternodes(self, response):
-        for node in xmliter(response, self.itertag):
+        for node in xmliter_lxml(response, self.itertag):
             self._register_namespaces(node)
             yield node
 
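Because the default ``iternodes`` iterator is now backed by ``xmliter_lxml``, existing spiders need no changes; a minimal sketch of such a spider (name, URL, and ``itertag`` are placeholders):

    from scrapy.spiders import XMLFeedSpider

    class ExampleFeedSpider(XMLFeedSpider):
        name = "example_feed"
        start_urls = ["https://example.com/feed.xml"]
        iterator = "iternodes"  # the default; backed by xmliter_lxml after this commit
        itertag = "item"

        def parse_node(self, response, node):
            # node is a Selector for one <item> element
            yield {"title": node.xpath("title/text()").get()}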
scrapy/utils/iterators.py
@@ -17,9 +17,12 @@ from typing import (
     cast,
     overload,
 )
+from warnings import warn
 
 from lxml import etree
+from packaging.version import Version
 
+from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.http import Response, TextResponse
 from scrapy.selector import Selector
 from scrapy.utils.python import re_rsearch, to_unicode
@@ -29,6 +32,12 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
+_LXML_VERSION = Version(etree.__version__)
+_LXML_HUGE_TREE_VERSION = Version("4.2")
+_ITERPARSE_KWARGS = {}
+if _LXML_VERSION >= _LXML_HUGE_TREE_VERSION:
+    _ITERPARSE_KWARGS["huge_tree"] = True
+
 
 def xmliter(
     obj: Union[Response, str, bytes], nodename: str
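The module-level gate above is what keeps pre-4.2 lxml releases working; the same pattern in isolation, as a runnable sketch with a toy document:

    from io import BytesIO

    from lxml import etree
    from packaging.version import Version

    # Build the keyword set once; splat it into every iterparse() call.
    iterparse_kwargs = {}
    if Version(etree.__version__) >= Version("4.2"):
        # huge_tree lifts lxml's depth/size safety limits (lxml 4.2+ only).
        iterparse_kwargs["huge_tree"] = True

    source = BytesIO(b"<products><product>a</product></products>")
    for _, node in etree.iterparse(source, tag="product", **iterparse_kwargs):
        print(node.text)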
@@ -41,6 +50,16 @@
     - a unicode string
     - a string encoded as utf-8
     """
+    warn(
+        (
+            "xmliter is deprecated and its use strongly discouraged because "
+            "it is vulnerable to ReDoS attacks. Use xmliter_lxml instead. See "
+            "https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9"
+        ),
+        ScrapyDeprecationWarning,
+        stacklevel=2,
+    )
+
     nodename_patt = re.escape(nodename)
 
     DOCUMENT_HEADER_RE = re.compile(r"<\?xml[^>]+>\s*", re.S)
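Code that cannot migrate immediately can scope the new warning away while transitioning; a sketch (not part of the patch; ``xmliter`` remains ReDoS-prone, so this is a stopgap only):

    import warnings

    from scrapy.exceptions import ScrapyDeprecationWarning
    from scrapy.utils.iterators import xmliter

    body = b"<products><product/></products>"
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        nodes = list(xmliter(body, "product"))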
@@ -87,7 +106,7 @@ def _resolve_xml_namespace(element_name: str, data: bytes) -> Tuple[str, str]:
         reader,
         encoding=reader.encoding,
         events=("start-ns",),
-        huge_tree=True,
+        **_ITERPARSE_KWARGS,
     )
     for event, (_prefix, _namespace) in ns_iterator:
         if _prefix != node_prefix:
@@ -111,7 +130,7 @@ def xmliter_lxml(
         reader,
         tag=tag,
         encoding=reader.encoding,
-        huge_tree=True,
+        **_ITERPARSE_KWARGS,
     )
     selxpath = "//" + (f"{prefix}:{nodename}" if namespace else nodename)
     for _, node in iterable:
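As the release notes say, ``xmliter_lxml`` resolves a namespace prefix inside the node name against the document's own declaration (via ``_resolve_xml_namespace`` above); a sketch modeled on the new test data (the feed body is an assumption):

    from scrapy.utils.iterators import xmliter_lxml

    body = b"""<?xml version="1.0" encoding="UTF-8"?>
    <rss version="2.0" xmlns:g="http://base.google.com/ns/1.0">
      <channel>
        <item>
          <g:image_link>http://www.mydummycompany.com/images/item1.jpg</g:image_link>
        </item>
      </channel>
    </rss>"""

    # "g:" is resolved against the xmlns:g declaration before iteration.
    for node in xmliter_lxml(body, "g:image_link"):
        print(node.xpath("text()").get())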
tests/test_utils_iterators.py
@@ -1,13 +1,14 @@
+import pytest
 from twisted.trial import unittest
 
+from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.http import Response, TextResponse, XmlResponse
 from scrapy.utils.iterators import _body_or_str, csviter, xmliter, xmliter_lxml
 from tests import get_testdata
 
 
-class XmliterTestCase(unittest.TestCase):
-    xmliter = staticmethod(xmliter)
-
+class XmliterBaseTestCase:
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter(self):
         body = b"""
             <?xml version="1.0" encoding="UTF-8"?>
@@ -39,6 +40,7 @@ class XmliterTestCase(unittest.TestCase):
             attrs, [("001", ["Name 1"], ["Type 1"]), ("002", ["Name 2"], ["Type 2"])]
         )
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_unusual_node(self):
         body = b"""<?xml version="1.0" encoding="UTF-8"?>
             <root>
@@ -52,6 +54,7 @@ class XmliterTestCase(unittest.TestCase):
             ]
         self.assertEqual(nodenames, [["matchme..."]])
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
    def test_xmliter_unicode(self):
        # example taken from https://github.com/scrapy/scrapy/issues/1665
        body = """<?xml version="1.0" encoding="UTF-8"?>
@@ -111,6 +114,7 @@ class XmliterTestCase(unittest.TestCase):
             [("26", ["-"], ["80"]), ("21", ["Ab"], ["76"]), ("27", ["A"], ["27"])],
         )
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_text(self):
         body = (
             '<?xml version="1.0" encoding="UTF-8"?>'
@@ -122,6 +126,7 @@ class XmliterTestCase(unittest.TestCase):
             [["one"], ["two"]],
         )
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_namespaces(self):
         body = b"""
             <?xml version="1.0" encoding="UTF-8"?>
@@ -161,6 +166,7 @@ class XmliterTestCase(unittest.TestCase):
         self.assertEqual(node.xpath("id/text()").getall(), [])
         self.assertEqual(node.xpath("price/text()").getall(), [])
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_namespaced_nodename(self):
         body = b"""
             <?xml version="1.0" encoding="UTF-8"?>
@@ -189,6 +195,7 @@ class XmliterTestCase(unittest.TestCase):
             ["http://www.mydummycompany.com/images/item1.jpg"],
         )
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_namespaced_nodename_missing(self):
         body = b"""
             <?xml version="1.0" encoding="UTF-8"?>
@@ -213,6 +220,7 @@
         with self.assertRaises(StopIteration):
             next(my_iter)
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_exception(self):
         body = (
             '<?xml version="1.0" encoding="UTF-8"?>'
@@ -225,10 +233,12 @@
 
         self.assertRaises(StopIteration, next, iter)
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_objtype_exception(self):
         i = self.xmliter(42, "product")
         self.assertRaises(TypeError, next, i)
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_encoding(self):
         body = (
             b'<?xml version="1.0" encoding="ISO-8859-9"?>\n'
@@ -243,7 +253,24 @@
         )
 
 
-class LxmlXmliterTestCase(XmliterTestCase):
+class XmliterTestCase(XmliterBaseTestCase, unittest.TestCase):
+    xmliter = staticmethod(xmliter)
+
+    def test_deprecation(self):
+        body = b"""
+            <?xml version="1.0" encoding="UTF-8"?>
+            <products>
+                <product></product>
+            </products>
+        """
+        with pytest.warns(
+            ScrapyDeprecationWarning,
+            match="xmliter",
+        ):
+            next(self.xmliter(body, "product"))
+
+
+class LxmlXmliterTestCase(XmliterBaseTestCase, unittest.TestCase):
     xmliter = staticmethod(xmliter_lxml)
 
     def test_xmliter_iterate_namespace(self):
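The refactor above is the standard shared-suite pattern: a plain base class owns the tests and each ``unittest.TestCase`` subclass pins one implementation, so both ``xmliter`` variants run the same assertions. The same idea in a generic, self-contained sketch:

    import unittest

    def sort_a(values):
        return sorted(values)

    def sort_b(values):
        return sorted(values, reverse=True)[::-1]

    class SortSuiteBase:  # not a TestCase, so it is never collected on its own
        sort = staticmethod(sort_a)

        def test_orders_input(self):
            self.assertEqual(self.sort([3, 1, 2]), [1, 2, 3])

    class SortATest(SortSuiteBase, unittest.TestCase):
        sort = staticmethod(sort_a)

    class SortBTest(SortSuiteBase, unittest.TestCase):
        sort = staticmethod(sort_b)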
tests/test_utils_response.py
@@ -203,40 +203,38 @@ class ResponseUtilsTest(unittest.TestCase):
             r5, _openfunc=check_base_url
         ), "Inject unique base url with conditional comment"
 
-    @pytest.mark.slow
-    def test_open_in_browser_redos_comment(self):
-        MAX_CPU_TIME = 30
-
-        # Exploit input from
-        # https://makenowjust-labs.github.io/recheck/playground/
-        # for /<!--.*?-->/ (old pattern to remove comments).
-        body = b"-><!--\x00" * (int(DOWNLOAD_MAXSIZE / 7) - 10) + b"->\n<!---->"
-
-        response = HtmlResponse("https://example.com", body=body)
-
-        start_time = process_time()
-
-        open_in_browser(response, lambda url: True)
-
-        end_time = process_time()
-        self.assertLess(end_time - start_time, MAX_CPU_TIME)
-
-    @pytest.mark.slow
-    def test_open_in_browser_redos_head(self):
-        MAX_CPU_TIME = 15
-
-        # Exploit input from
-        # https://makenowjust-labs.github.io/recheck/playground/
-        # for /(<head(?:>|\s.*?>))/ (old pattern to find the head element).
-        body = b"<head\t" * int(DOWNLOAD_MAXSIZE / 6)
-
-        response = HtmlResponse("https://example.com", body=body)
-
-        start_time = process_time()
-
-        open_in_browser(response, lambda url: True)
-
-        end_time = process_time()
-        self.assertLess(end_time - start_time, MAX_CPU_TIME)
+
+
+@pytest.mark.slow
+def test_open_in_browser_redos_comment():
+    MAX_CPU_TIME = 30
+
+    # Exploit input from
+    # https://makenowjust-labs.github.io/recheck/playground/
+    # for /<!--.*?-->/ (old pattern to remove comments).
+    body = b"-><!--\x00" * (int(DOWNLOAD_MAXSIZE / 7) - 10) + b"->\n<!---->"
+
+    response = HtmlResponse("https://example.com", body=body)
+
+    start_time = process_time()
+
+    open_in_browser(response, lambda url: True)
+
+    end_time = process_time()
+    assert (end_time - start_time) < MAX_CPU_TIME
+
+
+@pytest.mark.slow
+def test_open_in_browser_redos_head():
+    MAX_CPU_TIME = 15
+
+    # Exploit input from
+    # https://makenowjust-labs.github.io/recheck/playground/
+    # for /(<head(?:>|\s.*?>))/ (old pattern to find the head element).
+    body = b"<head\t" * int(DOWNLOAD_MAXSIZE / 6)
+
+    response = HtmlResponse("https://example.com", body=body)
+
+    start_time = process_time()
+
+    open_in_browser(response, lambda url: True)
+
+    end_time = process_time()
+    assert (end_time - start_time) < MAX_CPU_TIME
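These tests bound CPU time rather than wall-clock time: ``process_time()`` counts only CPU seconds of the current process, so catastrophic regex backtracking is caught even on a busy machine. The guard in isolation, with a stand-in workload:

    from time import process_time

    MAX_CPU_TIME = 30

    start_time = process_time()
    sum(i * i for i in range(10**6))  # stand-in for the code under test
    end_time = process_time()
    assert (end_time - start_time) < MAX_CPU_TIME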