From cc32f6ec66fed89f5ee6b13d51589de9a7dee267 Mon Sep 17 00:00:00 2001
From: Pablo Hoffman
Date: Tue, 13 Jul 2010 19:46:53 -0300
Subject: [PATCH] Applied patch to ClientForm to fix bug with wrong entities. Also added tests and left patch in repo in case we upgrade ClientForm in the future and need to re-apply it

---
 scrapy/tests/test_clientform.py |  8 ++++++++
 scrapy/xlib/ClientForm.patch    | 15 +++++++++++++++
 scrapy/xlib/ClientForm.py       |  7 ++++++-
 3 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 scrapy/tests/test_clientform.py
 create mode 100644 scrapy/xlib/ClientForm.patch

diff --git a/scrapy/tests/test_clientform.py b/scrapy/tests/test_clientform.py
new file mode 100644
index 000000000..90ce671ec
--- /dev/null
+++ b/scrapy/tests/test_clientform.py
@@ -0,0 +1,8 @@
+import unittest
+
+from scrapy.xlib import ClientForm
+
+class ClientFormPatchTests(unittest.TestCase):
+
+    def test_patched_unescape_charref(self):
+        self.assertEqual(ClientForm.unescape_charref('c', 'utf-8'), 'c')
diff --git a/scrapy/xlib/ClientForm.patch b/scrapy/xlib/ClientForm.patch
new file mode 100644
index 000000000..b3887dff7
--- /dev/null
+++ b/scrapy/xlib/ClientForm.patch
@@ -0,0 +1,15 @@
+diff --git a/scrapy/xlib/ClientForm.py b/scrapy/xlib/ClientForm.py
+--- a/scrapy/xlib/ClientForm.py
++++ b/scrapy/xlib/ClientForm.py
+@@ -242,5 +242,10 @@ def unescape_charref(data, encoding):
+     if name.startswith("x"):
+         name, base= name[1:], 16
+-    uc = unichr(int(name, base))
++    try:
++        uc = unichr(int(name, base))
++    except ValueError:
++        # invalid literal for int()
++        # or integer not in unichr()'s range
++        uc = name
+     if encoding is None:
+         return uc
diff --git a/scrapy/xlib/ClientForm.py b/scrapy/xlib/ClientForm.py
index 7e9203d21..5831078f2 100644
--- a/scrapy/xlib/ClientForm.py
+++ b/scrapy/xlib/ClientForm.py
@@ -241,7 +241,12 @@ def unescape_charref(data, encoding):
     name, base = data, 10
     if name.startswith("x"):
         name, base= name[1:], 16
-    uc = unichr(int(name, base))
+    try:
+        uc = unichr(int(name, base))
+    except ValueError:
+        # invalid literal for int()
+        # or integer not in unichr()'s range
+        uc = name
     if encoding is None:
         return uc
     else: