The Link object has a 'nofollow' attribute; add support for it in the SGML link extractors.

2025-02-24 15:43:48 +00:00 · 2012-11-10 16:25:41 +01:00 · 2012-11-10 16:25:41 +01:00 · 3d0e962cbb
commit 3d0e962cbb
parent da7e414fe9
2 changed files with 12 additions and 1 deletion
--- a/scrapy/contrib/linkextractors/sgml.py
+++ b/scrapy/contrib/linkextractors/sgml.py
@@ -71,7 +71,7 @@ class BaseSgmlLinkExtractor(FixedSGMLParser):
                if self.scan_attr(attr):
                    url = self.process_value(value)
                    if url is not None:
-                        link = Link(url=url)
+                        link = Link(url=url, nofollow=True if dict(attrs).get('rel') == 'nofollow' else False)
                        self.links.append(link)
                        self.current_link = link

--- a/scrapy/tests/test_contrib_linkextractors.py
+++ b/scrapy/tests/test_contrib_linkextractors.py
@@ -85,6 +85,17 @@ class LinkExtractorTestCase(unittest.TestCase):
        self.assertEqual(lx.matches(url1), True)
        self.assertEqual(lx.matches(url2), True)

+    def test_link_nofollow(self):
+        html = """
+        <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
+        <a href="about.html">About us</a>
+        """
+        response = HtmlResponse("http://example.org/page.html", body=html)
+        lx = SgmlLinkExtractor()
+        self.assertEqual([link for link in lx.extract_links(response)],
+            [ Link(url='http://example.org/page.html?action=print', text=u'Printer-friendly page', nofollow=True),
+              Link(url='http://example.org/about.html', text=u'About us', nofollow=False) ])
+
 class SgmlLinkExtractorTestCase(unittest.TestCase):
    def setUp(self):
        body = get_testdata('link_extractor', 'sgml_linkextractor.html')