mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-24 10:43:48 +00:00
The Link object has a 'nofollow' attribute; add support for it in the SGML link extractors, so that links marked rel="nofollow" are extracted with nofollow=True.
This commit is contained in:
parent
da7e414fe9
commit
3d0e962cbb
@ -71,7 +71,7 @@ class BaseSgmlLinkExtractor(FixedSGMLParser):
|
||||
if self.scan_attr(attr):
|
||||
url = self.process_value(value)
|
||||
if url is not None:
|
||||
link = Link(url=url)
|
||||
link = Link(url=url, nofollow=True if dict(attrs).get('rel') == 'nofollow' else False)
|
||||
self.links.append(link)
|
||||
self.current_link = link
|
||||
|
||||
|
@ -85,6 +85,17 @@ class LinkExtractorTestCase(unittest.TestCase):
|
||||
self.assertEqual(lx.matches(url1), True)
|
||||
self.assertEqual(lx.matches(url2), True)
|
||||
|
||||
def test_link_nofollow(self):
    """Check that extracted links report whether they are ``rel="nofollow"``.

    The fixture holds two anchors: the first carries ``rel="nofollow"`` and is
    expected back with ``nofollow=True``; the second has no ``rel`` attribute
    and is expected back with ``nofollow=False``.
    """
    html = """
<a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
<a href="about.html">About us</a>
"""
    response = HtmlResponse("http://example.org/page.html", body=html)
    lx = SgmlLinkExtractor()

    # Materialize the extractor output so it can be compared as a plain list.
    extracted = list(lx.extract_links(response))
    expected = [
        Link(url='http://example.org/page.html?action=print', text=u'Printer-friendly page', nofollow=True),
        Link(url='http://example.org/about.html', text=u'About us', nofollow=False),
    ]
    self.assertEqual(extracted, expected)
|
||||
|
||||
class SgmlLinkExtractorTestCase(unittest.TestCase):
|
||||
def setUp(self):
|
||||
body = get_testdata('link_extractor', 'sgml_linkextractor.html')
|
||||
|
Loading…
x
Reference in New Issue
Block a user