mirror of https://github.com/scrapy/scrapy.git

commit 150d96764b (parent 2538c0e862)
Deprecate xmliter in favor of xmliter_lxml

Files changed: docs/faq.rst, docs/news.rst, scrapy/spiders/feed.py,
scrapy/utils/iterators.py, tests/test_utils_iterators.py,
tests/test_utils_response.py
docs/faq.rst
@@ -297,9 +297,13 @@ build the DOM of the entire feed in memory, and this can be quite slow and
 consume a lot of memory.
 
 In order to avoid parsing the entire feed at once in memory, you can use
-the functions ``xmliter`` and ``csviter`` from ``scrapy.utils.iterators``
-module. In fact, this is what the feed spiders (see :ref:`topics-spiders`) use
-under the cover.
+the :func:`~scrapy.utils.iterators.xmliter_lxml` and
+:func:`~scrapy.utils.iterators.csviter` functions. In fact, this is what
+:class:`~scrapy.spiders.XMLFeedSpider` uses.
+
+.. autofunction:: scrapy.utils.iterators.xmliter_lxml
+
+.. autofunction:: scrapy.utils.iterators.csviter
 
 Does Scrapy manage cookies automatically?
 -----------------------------------------
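For context, a minimal usage sketch (not part of this commit; the feed body and node name are made up) of the node-by-node iteration the FAQ now recommends:

from scrapy.utils.iterators import xmliter_lxml

body = b"""<?xml version="1.0" encoding="UTF-8"?>
<products>
    <product><id>001</id></product>
    <product><id>002</id></product>
</products>"""

# Each yielded node is a Selector scoped to a single <product> element,
# so the whole feed is never materialized as one DOM in memory.
for product in xmliter_lxml(body, "product"):
    print(product.xpath("id/text()").get())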
docs/news.rst
@@ -10,12 +10,23 @@ Scrapy 2.11.1 (unreleased)
 
 **Security bug fix:**
 
--   Fixed regular expressions susceptible to a `ReDoS attack`_ affecting the
-    ``iternodes`` node iterator of :class:`~scrapy.spiders.XMLFeedSpider` and
-    the :func:`~scrapy.utils.response.open_in_browser` function. Please, see
-    the `cc65-xxvf-f7r9 security advisory`_ for more information.
+-   Addressed `ReDoS vulnerabilities`_:
+
+    -   ``scrapy.utils.iterators.xmliter`` is now deprecated in favor of
+        :func:`~scrapy.utils.iterators.xmliter_lxml`, which
+        :class:`~scrapy.spiders.XMLFeedSpider` now uses.
+
+        To minimize the impact of this change on existing code,
+        :func:`~scrapy.utils.iterators.xmliter_lxml` now supports indicating
+        the node namespace with a prefix in the node name, and big files with
+        highly nested trees.
+
+    -   Fixed regular expressions in the implementation of the
+        :func:`~scrapy.utils.response.open_in_browser` function.
+
+    Please, see the `cc65-xxvf-f7r9 security advisory`_ for more information.
 
-.. _ReDoS attack: https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
+.. _ReDoS vulnerabilities: https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
 .. _cc65-xxvf-f7r9 security advisory: https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9
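The namespace-prefix support mentioned in the entry above, sketched with the sample markup exercised by the namespaced-nodename test further down (output handling is illustrative):

from scrapy.utils.iterators import xmliter_lxml

body = b"""<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:g="http://base.google.com/ns/1.0">
    <channel>
        <item>
            <g:image_link>http://www.mydummycompany.com/images/item1.jpg</g:image_link>
        </item>
    </channel>
</rss>"""

# The "g:" prefix in the node name is resolved against the namespace
# declared in the document, so no explicit namespace URI is needed.
for node in xmliter_lxml(body, "g:image_link"):
    print(node.xpath("text()").get())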
@@ -2892,8 +2903,15 @@ Scrapy 1.8.4 (unreleased)
 
 **Security bug fix:**
 
--   Fixed regular expressions susceptible to a `ReDoS attack`_ affecting the
-    ``iternodes`` node iterator of :class:`~scrapy.spiders.XMLFeedSpider`.
+-   Due to its `ReDoS vulnerabilities`_, ``scrapy.utils.iterators.xmliter`` is
+    now deprecated in favor of :func:`~scrapy.utils.iterators.xmliter_lxml`,
+    which :class:`~scrapy.spiders.XMLFeedSpider` now uses.
+
+    To minimize the impact of this change on existing code,
+    :func:`~scrapy.utils.iterators.xmliter_lxml` now supports indicating
+    the node namespace as a prefix in the node name, and big files with highly
+    nested trees when using lxml 4.2 or later.
+
     Please, see the `cc65-xxvf-f7r9 security advisory`_ for more information.
scrapy/spiders/feed.py
@@ -7,7 +7,7 @@ See documentation in docs/topics/spiders.rst
 from scrapy.exceptions import NotConfigured, NotSupported
 from scrapy.selector import Selector
 from scrapy.spiders import Spider
-from scrapy.utils.iterators import csviter, xmliter
+from scrapy.utils.iterators import csviter, xmliter_lxml
 from scrapy.utils.spider import iterate_spider_output
@@ -84,7 +84,7 @@ class XMLFeedSpider(Spider):
         return self.parse_nodes(response, nodes)
 
     def _iternodes(self, response):
-        for node in xmliter(response, self.itertag):
+        for node in xmliter_lxml(response, self.itertag):
            self._register_namespaces(node)
            yield node
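A minimal spider sketch (hypothetical name, URL, and fields) exercising the changed path: with the default ``iterator = "iternodes"``, ``_iternodes`` now feeds nodes from ``xmliter_lxml``:

from scrapy.spiders import XMLFeedSpider


class ProductFeedSpider(XMLFeedSpider):
    name = "product_feed"  # hypothetical
    start_urls = ["https://example.com/feed.xml"]  # hypothetical
    iterator = "iternodes"  # the default; now backed by xmliter_lxml
    itertag = "product"

    def parse_node(self, response, node):
        # node is a Selector wrapping a single <product> element
        yield {"id": node.xpath("id/text()").get()}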
scrapy/utils/iterators.py
@@ -17,9 +17,12 @@ from typing import (
     cast,
     overload,
 )
+from warnings import warn
 
 from lxml import etree
+from packaging.version import Version
 
+from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.http import Response, TextResponse
 from scrapy.selector import Selector
 from scrapy.utils.python import re_rsearch, to_unicode
@@ -29,6 +32,12 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
+_LXML_VERSION = Version(etree.__version__)
+_LXML_HUGE_TREE_VERSION = Version("4.2")
+_ITERPARSE_KWARGS = {}
+if _LXML_VERSION >= _LXML_HUGE_TREE_VERSION:
+    _ITERPARSE_KWARGS["huge_tree"] = True
+
 
 def xmliter(
     obj: Union[Response, str, bytes], nodename: str
@@ -41,6 +50,16 @@ def xmliter(
     - a unicode string
     - a string encoded as utf-8
     """
+    warn(
+        (
+            "xmliter is deprecated and its use strongly discouraged because "
+            "it is vulnerable to ReDoS attacks. Use xmliter_lxml instead. See "
+            "https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9"
+        ),
+        ScrapyDeprecationWarning,
+        stacklevel=2,
+    )
+
     nodename_patt = re.escape(nodename)
 
     DOCUMENT_HEADER_RE = re.compile(r"<\?xml[^>]+>\s*", re.S)
@@ -87,7 +106,7 @@ def _resolve_xml_namespace(element_name: str, data: bytes) -> Tuple[str, str]:
         reader,
         encoding=reader.encoding,
         events=("start-ns",),
-        huge_tree=True,
+        **_ITERPARSE_KWARGS,
     )
     for event, (_prefix, _namespace) in ns_iterator:
         if _prefix != node_prefix:
@@ -111,7 +130,7 @@ def xmliter_lxml(
         reader,
         tag=tag,
         encoding=reader.encoding,
-        huge_tree=True,
+        **_ITERPARSE_KWARGS,
     )
     selxpath = "//" + (f"{prefix}:{nodename}" if namespace else nodename)
     for _, node in iterable:
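For downstream code still calling ``xmliter``, a migration sketch (not part of this commit; the body is illustrative). ``xmliter_lxml`` has the same call signature, so the preferred fix is a one-line swap; the warning can also be silenced locally while migrating:

import warnings

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.iterators import xmliter, xmliter_lxml

body = b"<products><product><id>001</id></product></products>"

# Preferred: same call signature, lxml-backed, not vulnerable to ReDoS.
nodes = list(xmliter_lxml(body, "product"))

# Transitional only: keep the old function but acknowledge its deprecation.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", ScrapyDeprecationWarning)
    legacy_nodes = list(xmliter(body, "product"))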
tests/test_utils_iterators.py
@@ -1,13 +1,14 @@
+import pytest
 from twisted.trial import unittest
 
+from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.http import Response, TextResponse, XmlResponse
 from scrapy.utils.iterators import _body_or_str, csviter, xmliter, xmliter_lxml
 from tests import get_testdata
 
 
-class XmliterTestCase(unittest.TestCase):
-    xmliter = staticmethod(xmliter)
-
+class XmliterBaseTestCase:
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter(self):
         body = b"""
             <?xml version="1.0" encoding="UTF-8"?>
@@ -39,6 +40,7 @@ class XmliterTestCase(unittest.TestCase):
             attrs, [("001", ["Name 1"], ["Type 1"]), ("002", ["Name 2"], ["Type 2"])]
         )
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_unusual_node(self):
         body = b"""<?xml version="1.0" encoding="UTF-8"?>
             <root>
@@ -52,6 +54,7 @@ class XmliterTestCase(unittest.TestCase):
         ]
         self.assertEqual(nodenames, [["matchme..."]])
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_unicode(self):
         # example taken from https://github.com/scrapy/scrapy/issues/1665
         body = """<?xml version="1.0" encoding="UTF-8"?>
@@ -111,6 +114,7 @@ class XmliterTestCase(unittest.TestCase):
             [("26", ["-"], ["80"]), ("21", ["Ab"], ["76"]), ("27", ["A"], ["27"])],
         )
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_text(self):
         body = (
             '<?xml version="1.0" encoding="UTF-8"?>'
@@ -122,6 +126,7 @@ class XmliterTestCase(unittest.TestCase):
             [["one"], ["two"]],
         )
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_namespaces(self):
         body = b"""
             <?xml version="1.0" encoding="UTF-8"?>
@@ -161,6 +166,7 @@ class XmliterTestCase(unittest.TestCase):
         self.assertEqual(node.xpath("id/text()").getall(), [])
         self.assertEqual(node.xpath("price/text()").getall(), [])
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_namespaced_nodename(self):
         body = b"""
             <?xml version="1.0" encoding="UTF-8"?>
@@ -189,6 +195,7 @@ class XmliterTestCase(unittest.TestCase):
             ["http://www.mydummycompany.com/images/item1.jpg"],
         )
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_namespaced_nodename_missing(self):
         body = b"""
             <?xml version="1.0" encoding="UTF-8"?>
@@ -213,6 +220,7 @@ class XmliterTestCase(unittest.TestCase):
         with self.assertRaises(StopIteration):
             next(my_iter)
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_exception(self):
         body = (
             '<?xml version="1.0" encoding="UTF-8"?>'
@@ -225,10 +233,12 @@ class XmliterTestCase(unittest.TestCase):
 
         self.assertRaises(StopIteration, next, iter)
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_objtype_exception(self):
         i = self.xmliter(42, "product")
         self.assertRaises(TypeError, next, i)
 
+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_xmliter_encoding(self):
         body = (
             b'<?xml version="1.0" encoding="ISO-8859-9"?>\n'
@@ -243,7 +253,24 @@ class XmliterTestCase(unittest.TestCase):
         )
 
 
-class LxmlXmliterTestCase(XmliterTestCase):
+class XmliterTestCase(XmliterBaseTestCase, unittest.TestCase):
+    xmliter = staticmethod(xmliter)
+
+    def test_deprecation(self):
+        body = b"""
+            <?xml version="1.0" encoding="UTF-8"?>
+            <products>
+                <product></product>
+            </products>
+        """
+        with pytest.warns(
+            ScrapyDeprecationWarning,
+            match="xmliter",
+        ):
+            next(self.xmliter(body, "product"))
+
+
+class LxmlXmliterTestCase(XmliterBaseTestCase, unittest.TestCase):
     xmliter = staticmethod(xmliter_lxml)
 
     def test_xmliter_iterate_namespace(self):
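Design note on the refactoring above: ``XmliterBaseTestCase`` holds the shared tests but deliberately does not inherit from ``unittest.TestCase``, so it is never collected on its own; each shared test runs exactly once per concrete subclass (``XmliterTestCase`` for the deprecated ``xmliter``, ``LxmlXmliterTestCase`` for ``xmliter_lxml``), and the ``filterwarnings`` marks keep the new deprecation warning out of the shared runs.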
tests/test_utils_response.py
@@ -203,40 +203,38 @@ class ResponseUtilsTest(unittest.TestCase):
         r5, _openfunc=check_base_url
     ), "Inject unique base url with conditional comment"
 
-
-@pytest.mark.slow
-def test_open_in_browser_redos_comment():
-    MAX_CPU_TIME = 30
-
-    # Exploit input from
-    # https://makenowjust-labs.github.io/recheck/playground/
-    # for /<!--.*?-->/ (old pattern to remove comments).
-    body = b"-><!--\x00" * (int(DOWNLOAD_MAXSIZE / 7) - 10) + b"->\n<!---->"
-
-    response = HtmlResponse("https://example.com", body=body)
-
-    start_time = process_time()
-
-    open_in_browser(response, lambda url: True)
-
-    end_time = process_time()
-    assert (end_time - start_time) < MAX_CPU_TIME
-
-
-@pytest.mark.slow
-def test_open_in_browser_redos_head():
-    MAX_CPU_TIME = 15
-
-    # Exploit input from
-    # https://makenowjust-labs.github.io/recheck/playground/
-    # for /(<head(?:>|\s.*?>))/ (old pattern to find the head element).
-    body = b"<head\t" * int(DOWNLOAD_MAXSIZE / 6)
-
-    response = HtmlResponse("https://example.com", body=body)
-
-    start_time = process_time()
-
-    open_in_browser(response, lambda url: True)
-
-    end_time = process_time()
-    assert (end_time - start_time) < MAX_CPU_TIME
+    @pytest.mark.slow
+    def test_open_in_browser_redos_comment(self):
+        MAX_CPU_TIME = 30
+
+        # Exploit input from
+        # https://makenowjust-labs.github.io/recheck/playground/
+        # for /<!--.*?-->/ (old pattern to remove comments).
+        body = b"-><!--\x00" * (int(DOWNLOAD_MAXSIZE / 7) - 10) + b"->\n<!---->"
+
+        response = HtmlResponse("https://example.com", body=body)
+
+        start_time = process_time()
+
+        open_in_browser(response, lambda url: True)
+
+        end_time = process_time()
+        self.assertLess(end_time - start_time, MAX_CPU_TIME)
+
+    @pytest.mark.slow
+    def test_open_in_browser_redos_head(self):
+        MAX_CPU_TIME = 15
+
+        # Exploit input from
+        # https://makenowjust-labs.github.io/recheck/playground/
+        # for /(<head(?:>|\s.*?>))/ (old pattern to find the head element).
+        body = b"<head\t" * int(DOWNLOAD_MAXSIZE / 6)
+
+        response = HtmlResponse("https://example.com", body=body)
+
+        start_time = process_time()
+
+        open_in_browser(response, lambda url: True)
+
+        end_time = process_time()
+        self.assertLess(end_time - start_time, MAX_CPU_TIME)
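For intuition about the exploit inputs above, a rough timing sketch (not part of this commit; it assumes the old comment pattern was compiled without ``re.S``, in which case the ``\n`` near the end blocks the lazy scan and every ``<!--`` opener forces a near-full rescan, making total work grow quadratically):

import re
from time import process_time

# Old-style vulnerable pattern quoted in the test comment: /<!--.*?-->/
COMMENT_RE = re.compile(rb"<!--.*?-->")

# Scaled-down exploit input: many unclosed "<!--" openers, a "\n" that the
# lazy ".*?" cannot cross without re.S, and one real comment at the end.
body = b"-><!--\x00" * 10_000 + b"->\n<!---->"

start = process_time()
COMMENT_RE.sub(b"", body)
# process_time() measures CPU time, not wall time, matching the tests above.
print(f"CPU seconds: {process_time() - start:.2f}")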