SgmlLinkExtractor - fix for parsing <area> tag with Unicode present

2025-02-24 14:04:01 +00:00 · 2014-08-28 18:47:49 +02:00 · 2014-08-28 18:47:49 +02:00 · e4689556f0
commit e4689556f0
parent 774ab74ad2
2 changed files with 12 additions and 0 deletions
--- a/scrapy/contrib/linkextractors/sgml.py
+++ b/scrapy/contrib/linkextractors/sgml.py
@@ -67,6 +67,7 @@ class BaseSgmlLinkExtractor(SGMLParser):
        SGMLParser.reset(self)
        self.links = []
        self.base_url = None
+        self.current_link = None

    def unknown_starttag(self, tag, attrs):
        if tag == 'base':
--- a/tests/test_contrib_linkextractors.py
+++ b/tests/test_contrib_linkextractors.py
@@ -284,6 +284,17 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
                         [Link(url='http://example.org/foo', text=u'>\u4eac<\u4e1c',
                               fragment='', nofollow=False)])

+    def test_area_tag_with_unicode_present(self):
+        body = """<html><body>\xbe\xa9<map><area href="http://example.org/foo" /></map></body></html>"""
+        response = HtmlResponse("http://example.org", body=body, encoding='utf-8')
+        lx = self.extractor_cls()
+        lx.extract_links(response)
+        lx.extract_links(response)
+        lx.extract_links(response)
+        self.assertEqual(lx.extract_links(response),
+                         [Link(url='http://example.org/foo', text=u'',
+                               fragment='', nofollow=False)])
+
    def test_encoded_url(self):
        body = """<html><body><div><a href="?page=2">BinB</a></body></html>"""
        response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')