Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-24 18:44:20 +00:00)
SgmlLinkExtractor: fix parsing of <area> tags when the page contains non-ASCII (Unicode) text
This commit is contained in:
Parent: 774ab74ad2
Commit: e4689556f0
@ -67,6 +67,7 @@ class BaseSgmlLinkExtractor(SGMLParser):
|
|||||||
SGMLParser.reset(self)
|
SGMLParser.reset(self)
|
||||||
self.links = []
|
self.links = []
|
||||||
self.base_url = None
|
self.base_url = None
|
||||||
|
self.current_link = None
|
||||||
|
|
||||||
def unknown_starttag(self, tag, attrs):
|
def unknown_starttag(self, tag, attrs):
|
||||||
if tag == 'base':
|
if tag == 'base':
|
||||||
|
@ -284,6 +284,17 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
|
|||||||
[Link(url='http://example.org/foo', text=u'>\u4eac<\u4e1c',
|
[Link(url='http://example.org/foo', text=u'>\u4eac<\u4e1c',
|
||||||
fragment='', nofollow=False)])
|
fragment='', nofollow=False)])
|
||||||
|
|
||||||
|
def test_area_tag_with_unicode_present(self):
|
||||||
|
body = """<html><body>\xbe\xa9<map><area href="http://example.org/foo" /></map></body></html>"""
|
||||||
|
response = HtmlResponse("http://example.org", body=body, encoding='utf-8')
|
||||||
|
lx = self.extractor_cls()
|
||||||
|
lx.extract_links(response)
|
||||||
|
lx.extract_links(response)
|
||||||
|
lx.extract_links(response)
|
||||||
|
self.assertEqual(lx.extract_links(response),
|
||||||
|
[Link(url='http://example.org/foo', text=u'',
|
||||||
|
fragment='', nofollow=False)])
|
||||||
|
|
||||||
def test_encoded_url(self):
|
def test_encoded_url(self):
|
||||||
body = """<html><body><div><a href="?page=2">BinB</a></body></html>"""
|
body = """<html><body><div><a href="?page=2">BinB</a></body></html>"""
|
||||||
response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
|
response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user