From 1656fbcffa28f3b69862109631741a9c0118ac80 Mon Sep 17 00:00:00 2001
From: Paul Tremberth <paul.tremberth@gmail.com>
Date: Fri, 8 Apr 2016 23:25:50 +0200
Subject: [PATCH] Fix link extractor tests for non-ASCII characters from latin1
 document

The URL path component should be encoded as UTF-8 before percent-encoding,
regardless of the document's encoding (that's what browsers do when you open
scrapy/tests/sample_data/link_extractor/linkextractor_latin1.html
and follow the links).
This matches the behavior of the current w3lib release, v1.14.1.
---
 tests/test_linkextractors_deprecated.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tests/test_linkextractors_deprecated.py b/tests/test_linkextractors_deprecated.py
index 89dcb75c2..7759575f3 100644
--- a/tests/test_linkextractors_deprecated.py
+++ b/tests/test_linkextractors_deprecated.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 import unittest
 from scrapy.linkextractors.regex import RegexLinkExtractor
 from scrapy.http import HtmlResponse
@@ -81,9 +82,14 @@ class BaseSgmlLinkExtractorTestCase(unittest.TestCase):
             Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')),
         ])
 
+        # document encoding does not affect URL path component, only query part
+        # >>> u'sample_ñ.html'.encode('utf8')
+        # 'sample_\xc3\xb1.html'
+        # >>> u"sample_á.html".encode('utf8')
+        # 'sample_\xc3\xa1.html'
         self.assertEqual(lx.extract_links(response_latin1), [
-            Link(url='http://example.com/sample_%F1.html', text=''),
-            Link(url='http://example.com/sample_%E1.html', text='sample \xe1 text'.decode('latin1')),
+            Link(url='http://example.com/sample_%C3%B1.html', text=''),
+            Link(url='http://example.com/sample_%C3%A1.html', text='sample \xe1 text'.decode('latin1')),
         ])
 
     def test_matches(self):