Add link extractor test for non-ASCII characters in query part of URL

2025-02-23 14:24:19 +00:00 · 2016-04-09 15:15:01 +02:00 · 2016-04-09 15:15:01 +02:00 · 7b5243a263
commit 7b5243a263
parent 1656fbcffa
2 changed files with 23 additions and 13 deletions
--- a/tests/sample_data/link_extractor/linkextractor_latin1.html
+++ b/tests/sample_data/link_extractor/linkextractor_latin1.html
@@ -1,15 +1,18 @@
 <html>
-<head>
-<meta http-equiv="Content-Type" content="text/html; charset=latin-1">
-<base href='http://example.com' />
-<title>Sample page with links for testing RegexLinkExtractor</title>
-</head>
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=latin-1">
+    <base href='http://example.com' />
+    <title>Sample page with links for testing RegexLinkExtractor</title>
+  </head>
 <body>
-<div id='wrapper'>
-<div id='subwrapper'>
-<a href='sample_ñ.html'><img src='sample2.jpg'/></a>
-</div>
-<a href='sample_á.html' title='sample á'>sample á text</a>
-</div>
+  <div id='wrapper'>
+    <div id='subwrapper'>
+      <a href='sample_ñ.html'><img src='sample2.jpg'/></a>
+    </div>
+    <a href='sample_á.html' title='sample á'>sample á text</a>
+    <div id='subwrapper2'>
+      <a href='sample_ö.html?price=£32&µ=unit'><img src='sample3.jpg'/></a>
+    </div>
+  </div>
 </body>
 </html>
--- a/tests/test_linkextractors_deprecated.py
+++ b/tests/test_linkextractors_deprecated.py
@@ -84,12 +84,19 @@ class BaseSgmlLinkExtractorTestCase(unittest.TestCase):

        # document encoding does not affect URL path component, only query part
        # >>> u'sample_ñ.html'.encode('utf8')
-        # 'sample_\xc3\xb1.html'
+        # b'sample_\xc3\xb1.html'
        # >>> u"sample_á.html".encode('utf8')
-        # 'sample_\xc3\xa1.html'
+        # b'sample_\xc3\xa1.html'
+        # >>> u"sample_ö.html".encode('utf8')
+        # b'sample_\xc3\xb6.html'
+        # >>> u"£32".encode('latin1')
+        # b'\xa332'
+        # >>> u"µ".encode('latin1')
+        # b'\xb5'
        self.assertEqual(lx.extract_links(response_latin1), [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%C3%A1.html', text='sample \xe1 text'.decode('latin1')),
+            Link(url='http://example.com/sample_%C3%B6.html?price=%A332&%B5=unit', text=''),
        ])

    def test_matches(self):