mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-24 15:43:48 +00:00
linkextractor: unique after urljoin_rfc
Links are now de-duplicated after their URLs have been resolved with urljoin_rfc, so '/foo.html' and 'http://example.org/foo.html' are considered the same link and only one is kept. Signed-off-by: Ping Yin <pkufranky@gmail.com>
This commit is contained in:
parent
1868ede549
commit
d42e5fdbac
@ -25,16 +25,16 @@ class BaseSgmlLinkExtractor(FixedSGMLParser):
|
|||||||
self.feed(response_text)
|
self.feed(response_text)
|
||||||
self.close()
|
self.close()
|
||||||
|
|
||||||
links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
|
|
||||||
|
|
||||||
ret = []
|
ret = []
|
||||||
base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
|
base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
|
||||||
for link in links:
|
for link in self.links:
|
||||||
link.url = urljoin_rfc(base_url, link.url, response_encoding)
|
link.url = urljoin_rfc(base_url, link.url, response_encoding)
|
||||||
link.url = safe_url_string(link.url, response_encoding)
|
link.url = safe_url_string(link.url, response_encoding)
|
||||||
link.text = str_to_unicode(link.text, response_encoding)
|
link.text = str_to_unicode(link.text, response_encoding)
|
||||||
ret.append(link)
|
ret.append(link)
|
||||||
|
|
||||||
|
ret = unique_list(ret, key=lambda link: link.url) if self.unique else ret
|
||||||
|
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
def extract_links(self, response):
|
def extract_links(self, response):
|
||||||
|
@ -9,7 +9,7 @@
|
|||||||
<area href='sample1.html' />
|
<area href='sample1.html' />
|
||||||
<a href='sample2.html'>sample 2<img src='sample2.jpg'/></a>
|
<a href='sample2.html'>sample 2<img src='sample2.jpg'/></a>
|
||||||
</div>
|
</div>
|
||||||
<a href='sample3.html' title='sample 3'>sample 3 text</a>
|
<a href='http://example.com/sample3.html' title='sample 3'>sample 3 text</a>
|
||||||
<a href='sample3.html'>sample 3 repetition</a>
|
<a href='sample3.html'>sample 3 repetition</a>
|
||||||
<a href='http://www.google.com/something'></a>
|
<a href='http://www.google.com/something'></a>
|
||||||
</div>
|
</div>
|
||||||
|
@ -110,6 +110,13 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
|
|||||||
Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
|
Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
|
||||||
Link(url='http://example.com/sample3.html', text=u'sample 3 repetition') ])
|
Link(url='http://example.com/sample3.html', text=u'sample 3 repetition') ])
|
||||||
|
|
||||||
|
lx = SgmlLinkExtractor(allow=('sample', ))
|
||||||
|
self.assertEqual([link for link in lx.extract_links(self.response)],
|
||||||
|
[ Link(url='http://example.com/sample1.html', text=u''),
|
||||||
|
Link(url='http://example.com/sample2.html', text=u'sample 2'),
|
||||||
|
Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
|
||||||
|
])
|
||||||
|
|
||||||
lx = SgmlLinkExtractor(allow=('sample', ), deny=('3', ))
|
lx = SgmlLinkExtractor(allow=('sample', ), deny=('3', ))
|
||||||
self.assertEqual([link for link in lx.extract_links(self.response)],
|
self.assertEqual([link for link in lx.extract_links(self.response)],
|
||||||
[ Link(url='http://example.com/sample1.html', text=u''),
|
[ Link(url='http://example.com/sample1.html', text=u''),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user