mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-24 11:03:56 +00:00
linkextractor: unique after urljoin_rfc
Links are now de-duplicated after their URLs have been resolved with urljoin_rfc, so '/foo.html' and 'http://example.org/foo.html' are treated as the same link and only one of them is kept. Signed-off-by: Ping Yin <pkufranky@gmail.com>
This commit is contained in:
parent
1868ede549
commit
d42e5fdbac
@ -25,16 +25,16 @@ class BaseSgmlLinkExtractor(FixedSGMLParser):
|
||||
self.feed(response_text)
|
||||
self.close()
|
||||
|
||||
links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
|
||||
|
||||
ret = []
|
||||
base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
|
||||
for link in links:
|
||||
for link in self.links:
|
||||
link.url = urljoin_rfc(base_url, link.url, response_encoding)
|
||||
link.url = safe_url_string(link.url, response_encoding)
|
||||
link.text = str_to_unicode(link.text, response_encoding)
|
||||
ret.append(link)
|
||||
|
||||
ret = unique_list(ret, key=lambda link: link.url) if self.unique else ret
|
||||
|
||||
return ret
|
||||
|
||||
def extract_links(self, response):
|
||||
|
@ -9,7 +9,7 @@
|
||||
<area href='sample1.html' />
|
||||
<a href='sample2.html'>sample 2<img src='sample2.jpg'/></a>
|
||||
</div>
|
||||
<a href='sample3.html' title='sample 3'>sample 3 text</a>
|
||||
<a href='http://example.com/sample3.html' title='sample 3'>sample 3 text</a>
|
||||
<a href='sample3.html'>sample 3 repetition</a>
|
||||
<a href='http://www.google.com/something'></a>
|
||||
</div>
|
||||
|
@ -110,6 +110,13 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
|
||||
Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
|
||||
Link(url='http://example.com/sample3.html', text=u'sample 3 repetition') ])
|
||||
|
||||
lx = SgmlLinkExtractor(allow=('sample', ))
|
||||
self.assertEqual([link for link in lx.extract_links(self.response)],
|
||||
[ Link(url='http://example.com/sample1.html', text=u''),
|
||||
Link(url='http://example.com/sample2.html', text=u'sample 2'),
|
||||
Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
|
||||
])
|
||||
|
||||
lx = SgmlLinkExtractor(allow=('sample', ), deny=('3', ))
|
||||
self.assertEqual([link for link in lx.extract_links(self.response)],
|
||||
[ Link(url='http://example.com/sample1.html', text=u''),
|
||||
|
Loading…
x
Reference in New Issue
Block a user