From 1656fbcffa28f3b69862109631741a9c0118ac80 Mon Sep 17 00:00:00 2001
From: Paul Tremberth
Date: Fri, 8 Apr 2016 23:25:50 +0200
Subject: [PATCH] Fix link extractor tests for non-ASCII characters from latin1 document

URL path component should use UTF-8 before percent-encoding
(that's what browsers do when you open
scrapy/tests/sample_data/link_extractor/linkextractor_latin1.html
and follow the links)

This matches current w3lib v1.14.1
---
 tests/test_linkextractors_deprecated.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tests/test_linkextractors_deprecated.py b/tests/test_linkextractors_deprecated.py
index 89dcb75c2..7759575f3 100644
--- a/tests/test_linkextractors_deprecated.py
+++ b/tests/test_linkextractors_deprecated.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 import unittest
 from scrapy.linkextractors.regex import RegexLinkExtractor
 from scrapy.http import HtmlResponse
@@ -81,9 +82,14 @@ class BaseSgmlLinkExtractorTestCase(unittest.TestCase):
             Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')),
         ])
 
+        # document encoding does not affect URL path component, only query part
+        # >>> u'sample_ñ.html'.encode('utf8')
+        # 'sample_\xc3\xb1.html'
+        # >>> u"sample_á.html".encode('utf8')
+        # 'sample_\xc3\xa1.html'
         self.assertEqual(lx.extract_links(response_latin1), [
-            Link(url='http://example.com/sample_%F1.html', text=''),
-            Link(url='http://example.com/sample_%E1.html', text='sample \xe1 text'.decode('latin1')),
+            Link(url='http://example.com/sample_%C3%B1.html', text=''),
+            Link(url='http://example.com/sample_%C3%A1.html', text='sample \xe1 text'.decode('latin1')),
         ])
 
     def test_matches(self):