1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 15:43:48 +00:00

linkextractor: apply the unique filter after urljoin_rfc

Now '/foo.html' and 'http://example.org/foo.html' are considered the same
link once resolved against the base URL, and only one of them is kept.

Signed-off-by: Ping Yin <pkufranky@gmail.com>
This commit is contained in:
Ping Yin 2010-04-02 19:45:30 +08:00
parent 1868ede549
commit d42e5fdbac
3 changed files with 11 additions and 4 deletions

View File

@ -25,16 +25,16 @@ class BaseSgmlLinkExtractor(FixedSGMLParser):
self.feed(response_text) self.feed(response_text)
self.close() self.close()
links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
ret = [] ret = []
base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
for link in links: for link in self.links:
link.url = urljoin_rfc(base_url, link.url, response_encoding) link.url = urljoin_rfc(base_url, link.url, response_encoding)
link.url = safe_url_string(link.url, response_encoding) link.url = safe_url_string(link.url, response_encoding)
link.text = str_to_unicode(link.text, response_encoding) link.text = str_to_unicode(link.text, response_encoding)
ret.append(link) ret.append(link)
ret = unique_list(ret, key=lambda link: link.url) if self.unique else ret
return ret return ret
def extract_links(self, response): def extract_links(self, response):

View File

@ -9,7 +9,7 @@
<area href='sample1.html' /> <area href='sample1.html' />
<a href='sample2.html'>sample 2<img src='sample2.jpg'/></a> <a href='sample2.html'>sample 2<img src='sample2.jpg'/></a>
</div> </div>
<a href='sample3.html' title='sample 3'>sample 3 text</a> <a href='http://example.com/sample3.html' title='sample 3'>sample 3 text</a>
<a href='sample3.html'>sample 3 repetition</a> <a href='sample3.html'>sample 3 repetition</a>
<a href='http://www.google.com/something'></a> <a href='http://www.google.com/something'></a>
</div> </div>

View File

@ -110,6 +110,13 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
Link(url='http://example.com/sample3.html', text=u'sample 3 text'), Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
Link(url='http://example.com/sample3.html', text=u'sample 3 repetition') ]) Link(url='http://example.com/sample3.html', text=u'sample 3 repetition') ])
lx = SgmlLinkExtractor(allow=('sample', ))
self.assertEqual([link for link in lx.extract_links(self.response)],
[ Link(url='http://example.com/sample1.html', text=u''),
Link(url='http://example.com/sample2.html', text=u'sample 2'),
Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
])
lx = SgmlLinkExtractor(allow=('sample', ), deny=('3', )) lx = SgmlLinkExtractor(allow=('sample', ), deny=('3', ))
self.assertEqual([link for link in lx.extract_links(self.response)], self.assertEqual([link for link in lx.extract_links(self.response)],
[ Link(url='http://example.com/sample1.html', text=u''), [ Link(url='http://example.com/sample1.html', text=u''),