1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-23 14:24:19 +00:00

Add link extractor test for non-ASCII characters in query part of URL

This commit is contained in:
Paul Tremberth 2016-04-09 15:15:01 +02:00
parent 1656fbcffa
commit 7b5243a263
2 changed files with 23 additions and 13 deletions

View File

@ -1,15 +1,18 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=latin-1">
<base href='http://example.com' />
<title>Sample page with links for testing RegexLinkExtractor</title>
</head>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=latin-1">
<base href='http://example.com' />
<title>Sample page with links for testing RegexLinkExtractor</title>
</head>
<body>
<div id='wrapper'>
<div id='subwrapper'>
<a href='sample_ñ.html'><img src='sample2.jpg'/></a>
</div>
<a href='sample_á.html' title='sample á'>sample á text</a>
</div>
<div id='wrapper'>
<div id='subwrapper'>
<a href='sample_ñ.html'><img src='sample2.jpg'/></a>
</div>
<a href='sample_á.html' title='sample á'>sample á text</a>
<div id='subwrapper2'>
<a href='sample_ö.html?price=£32&µ=unit'><img src='sample3.jpg'/></a>
</div>
</div>
</body>
</html>

View File

@ -84,12 +84,19 @@ class BaseSgmlLinkExtractorTestCase(unittest.TestCase):
# The document encoding affects only the query part of a URL; the path component is always percent-encoded as UTF-8.
# >>> u'sample_ñ.html'.encode('utf8')
# 'sample_\xc3\xb1.html'
# b'sample_\xc3\xb1.html'
# >>> u"sample_á.html".encode('utf8')
# 'sample_\xc3\xa1.html'
# b'sample_\xc3\xa1.html'
# >>> u"sample_ö.html".encode('utf8')
# b'sample_\xc3\xb6.html'
# >>> u"£32".encode('latin1')
# b'\xa332'
# >>> u"µ".encode('latin1')
# b'\xb5'
self.assertEqual(lx.extract_links(response_latin1), [
Link(url='http://example.com/sample_%C3%B1.html', text=''),
Link(url='http://example.com/sample_%C3%A1.html', text='sample \xe1 text'.decode('latin1')),
Link(url='http://example.com/sample_%C3%B6.html?price=%A332&%B5=unit', text=''),
])
def test_matches(self):