mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-23 23:03:42 +00:00
Fix link extractor tests for non-ASCII characters from latin1 document
The URL path component should be encoded as UTF-8 before percent-encoding, regardless of the document encoding (that's what browsers do when you open scrapy/tests/sample_data/link_extractor/linkextractor_latin1.html and follow the links). This matches the behavior of the current w3lib release, v1.14.1.
This commit is contained in:
parent
0ede017d2a
commit
1656fbcffa
@ -1,3 +1,4 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import unittest
|
||||
from scrapy.linkextractors.regex import RegexLinkExtractor
|
||||
from scrapy.http import HtmlResponse
|
||||
@ -81,9 +82,14 @@ class BaseSgmlLinkExtractorTestCase(unittest.TestCase):
|
||||
Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')),
|
||||
])
|
||||
|
||||
# document encoding does not affect URL path component, only query part
|
||||
# >>> u'sample_ñ.html'.encode('utf8')
|
||||
# 'sample_\xc3\xb1.html'
|
||||
# >>> u"sample_á.html".encode('utf8')
|
||||
# 'sample_\xc3\xa1.html'
|
||||
self.assertEqual(lx.extract_links(response_latin1), [
|
||||
Link(url='http://example.com/sample_%F1.html', text=''),
|
||||
Link(url='http://example.com/sample_%E1.html', text='sample \xe1 text'.decode('latin1')),
|
||||
Link(url='http://example.com/sample_%C3%B1.html', text=''),
|
||||
Link(url='http://example.com/sample_%C3%A1.html', text='sample \xe1 text'.decode('latin1')),
|
||||
])
|
||||
|
||||
def test_matches(self):
|
||||
|
Loading…
x
Reference in New Issue
Block a user