1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 15:43:48 +00:00

The Link object has a 'nofollow' attribute; this commit adds support for it in the SGML link extractors.

This commit is contained in:
Pedro Faustino 2012-11-10 16:25:41 +01:00
parent da7e414fe9
commit 3d0e962cbb
2 changed files with 12 additions and 1 deletions

View File

@ -71,7 +71,7 @@ class BaseSgmlLinkExtractor(FixedSGMLParser):
if self.scan_attr(attr):
url = self.process_value(value)
if url is not None:
link = Link(url=url)
link = Link(url=url, nofollow=True if dict(attrs).get('rel') == 'nofollow' else False)
self.links.append(link)
self.current_link = link

View File

@ -85,6 +85,17 @@ class LinkExtractorTestCase(unittest.TestCase):
self.assertEqual(lx.matches(url1), True)
self.assertEqual(lx.matches(url2), True)
def test_link_nofollow(self):
    """Check the ``nofollow`` flag on links extracted from anchors.

    The fixture holds two anchors: one with ``rel="nofollow"`` and one
    with no ``rel`` attribute. The extractor is expected to return them
    in document order as Link objects with ``nofollow=True`` and
    ``nofollow=False`` respectively, with absolute URLs and the anchor
    text attached.
    """
    # Adjacent literals concatenate to the same markup as the original
    # triple-quoted fixture (leading newline, one anchor per line).
    html = (
        '\n'
        '<a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>\n'
        '<a href="about.html">About us</a>\n'
    )
    response = HtmlResponse("http://example.org/page.html", body=html)
    lx = SgmlLinkExtractor()

    extracted = list(lx.extract_links(response))
    expected = [
        Link(url='http://example.org/page.html?action=print',
             text=u'Printer-friendly page', nofollow=True),
        Link(url='http://example.org/about.html',
             text=u'About us', nofollow=False),
    ]
    self.assertEqual(extracted, expected)
class SgmlLinkExtractorTestCase(unittest.TestCase):
def setUp(self):
body = get_testdata('link_extractor', 'sgml_linkextractor.html')