mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-23 14:24:19 +00:00
Add link extractor test for non-ASCII characters in query part of URL
This commit is contained in:
parent
1656fbcffa
commit
7b5243a263
@@ -1,15 +1,18 @@
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=latin-1">
|
||||
<base href='http://example.com' />
|
||||
<title>Sample page with links for testing RegexLinkExtractor</title>
|
||||
</head>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=latin-1">
|
||||
<base href='http://example.com' />
|
||||
<title>Sample page with links for testing RegexLinkExtractor</title>
|
||||
</head>
|
||||
<body>
|
||||
<div id='wrapper'>
|
||||
<div id='subwrapper'>
|
||||
<a href='sample_ñ.html'><img src='sample2.jpg'/></a>
|
||||
</div>
|
||||
<a href='sample_á.html' title='sample á'>sample á text</a>
|
||||
</div>
|
||||
<div id='wrapper'>
|
||||
<div id='subwrapper'>
|
||||
<a href='sample_ñ.html'><img src='sample2.jpg'/></a>
|
||||
</div>
|
||||
<a href='sample_á.html' title='sample á'>sample á text</a>
|
||||
<div id='subwrapper2'>
|
||||
<a href='sample_ö.html?price=£32&µ=unit'><img src='sample3.jpg'/></a>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
@@ -84,12 +84,19 @@ class BaseSgmlLinkExtractorTestCase(unittest.TestCase):
|
||||
|
||||
# document encoding does not affect URL path component, only query part
|
||||
# >>> u'sample_ñ.html'.encode('utf8')
|
||||
# 'sample_\xc3\xb1.html'
|
||||
# b'sample_\xc3\xb1.html'
|
||||
# >>> u"sample_á.html".encode('utf8')
|
||||
# 'sample_\xc3\xa1.html'
|
||||
# b'sample_\xc3\xa1.html'
|
||||
# >>> u"sample_ö.html".encode('utf8')
|
||||
# b'sample_\xc3\xb6.html'
|
||||
# >>> u"£32".encode('latin1')
|
||||
# b'\xa332'
|
||||
# >>> u"µ".encode('latin1')
|
||||
# b'\xb5'
|
||||
self.assertEqual(lx.extract_links(response_latin1), [
|
||||
Link(url='http://example.com/sample_%C3%B1.html', text=''),
|
||||
Link(url='http://example.com/sample_%C3%A1.html', text='sample \xe1 text'.decode('latin1')),
|
||||
Link(url='http://example.com/sample_%C3%B6.html?price=%A332&%B5=unit', text=''),
|
||||
])
|
||||
|
||||
def test_matches(self):
|
||||
|
Loading…
x
Reference in New Issue
Block a user