Fix link extractor tests for non-ASCII characters from latin1 document

The URL path component should be encoded as UTF-8 before percent-encoding (that's what browsers do when you open scrapy/tests/sample_data/link_extractor/linkextractor_latin1.html and follow the links). This matches current w3lib v1.14.1.
2025-02-23 23:03:42 +00:00 · 2016-04-08 23:25:50 +02:00 · 2016-04-08 23:25:50 +02:00 · 1656fbcffa
commit 1656fbcffa
parent 0ede017d2a
1 changed files with 8 additions and 2 deletions
--- a/tests/test_linkextractors_deprecated.py
+++ b/tests/test_linkextractors_deprecated.py
@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 import unittest
 from scrapy.linkextractors.regex import RegexLinkExtractor
 from scrapy.http import HtmlResponse
@ -81,9 +82,14 @@ class BaseSgmlLinkExtractorTestCase(unittest.TestCase):
            Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')),
        ])

+        # document encoding does not affect URL path component, only query part
+        # >>> u'sample_ñ.html'.encode('utf8')
+        # 'sample_\xc3\xb1.html'
+        # >>> u"sample_á.html".encode('utf8')
+        # 'sample_\xc3\xa1.html'
        self.assertEqual(lx.extract_links(response_latin1), [
-            Link(url='http://example.com/sample_%F1.html', text=''),
-            Link(url='http://example.com/sample_%E1.html', text='sample \xe1 text'.decode('latin1')),
+            Link(url='http://example.com/sample_%C3%B1.html', text=''),
+            Link(url='http://example.com/sample_%C3%A1.html', text='sample \xe1 text'.decode('latin1')),
        ])

    def test_matches(self):