linkextractor: unique after urljoin_rfc

Now '/foo.html' and 'http://example.org/foo.html' are treated as the same link once they have been joined with the base URL, so only one of them is kept.

Signed-off-by: Ping Yin <pkufranky@gmail.com>
2025-02-24 15:43:48 +00:00 · 2010-04-02 19:45:30 +08:00 · 2010-04-02 19:45:30 +08:00 · d42e5fdbac
commit d42e5fdbac
parent 1868ede549
3 changed files with 11 additions and 4 deletions
--- a/scrapy/contrib/linkextractors/sgml.py
+++ b/scrapy/contrib/linkextractors/sgml.py
@@ -25,16 +25,16 @@ class BaseSgmlLinkExtractor(FixedSGMLParser):
        self.feed(response_text)
        self.close()
        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
        ret = []
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
-        for link in links:
+        for link in self.links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = str_to_unicode(link.text, response_encoding)
            ret.append(link)
        ret = unique_list(ret, key=lambda link: link.url) if self.unique else ret
        return ret
    def extract_links(self, response):
--- a/scrapy/tests/sample_data/link_extractor/sgml_linkextractor.html
+++ b/scrapy/tests/sample_data/link_extractor/sgml_linkextractor.html
@@ -9,7 +9,7 @@
 <area href='sample1.html' />
 <a href='sample2.html'>sample 2<img src='sample2.jpg'/></a>
 </div>
-<a href='sample3.html' title='sample 3'>sample 3 text</a>
+<a href='http://example.com/sample3.html' title='sample 3'>sample 3 text</a>
 <a href='sample3.html'>sample 3 repetition</a>
 <a href='http://www.google.com/something'></a>
 </div>
--- a/scrapy/tests/test_contrib_linkextractors.py
+++ b/scrapy/tests/test_contrib_linkextractors.py
@@ -110,6 +110,13 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
              Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
              Link(url='http://example.com/sample3.html', text=u'sample 3 repetition') ])
        lx = SgmlLinkExtractor(allow=('sample', ))
        self.assertEqual([link for link in lx.extract_links(self.response)],
            [ Link(url='http://example.com/sample1.html', text=u''),
              Link(url='http://example.com/sample2.html', text=u'sample 2'),
              Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
              ])
        lx = SgmlLinkExtractor(allow=('sample', ), deny=('3', ))
        self.assertEqual([link for link in lx.extract_links(self.response)],
            [ Link(url='http://example.com/sample1.html', text=u''),