Merge pull request #1214 from rgtk/link-rel

[MRG+1] Support link rel attribute with multiple values
2025-02-24 19:44:33 +00:00 · 2015-08-27 19:57:54 -03:00 · 2015-08-27 19:57:54 -03:00 · aa31811cfd
commit aa31811cfd
parent 9bfab53075 cb3007c066
4 changed files with 22 additions and 6 deletions
--- a/scrapy/linkextractors/lxmlhtml.py
+++ b/scrapy/linkextractors/lxmlhtml.py
@ -9,7 +9,7 @@ import lxml.etree as etree

 from scrapy.selector import Selector
 from scrapy.link import Link
-from scrapy.utils.misc import arg_to_iter
+from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
 from scrapy.utils.python import unique as unique_list
 from scrapy.linkextractors import FilteringLinkExtractor
 from scrapy.utils.response import get_base_url
@ -62,7 +62,7 @@ class LxmlParserLinkExtractor(object):
            # to fix relative links after process_value
            url = urljoin(response_url, url)
            link = Link(url, _collect_string_content(el) or u'',
-                nofollow=True if el.get('rel') == 'nofollow' else False)
+                        nofollow=rel_has_nofollow(el.get('rel')))
            links.append(link)

        return unique_list(links, key=lambda link: link.url) \
--- a/scrapy/linkextractors/sgml.py
+++ b/scrapy/linkextractors/sgml.py
@ -9,7 +9,7 @@ from w3lib.url import safe_url_string
 from scrapy.selector import Selector
 from scrapy.link import Link
 from scrapy.linkextractors import FilteringLinkExtractor
-from scrapy.utils.misc import arg_to_iter
+from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
 from scrapy.utils.python import unique as unique_list, to_unicode
 from scrapy.utils.response import get_base_url
 from scrapy.exceptions import ScrapyDeprecationWarning
@ -80,7 +80,7 @@ class BaseSgmlLinkExtractor(SGMLParser):
                if self.scan_attr(attr):
                    url = self.process_value(value)
                    if url is not None:
-                        link = Link(url=url, nofollow=True if dict(attrs).get('rel') == 'nofollow' else False)
+                        link = Link(url=url, nofollow=rel_has_nofollow(dict(attrs).get('rel')))
                        self.links.append(link)
                        self.current_link = link

--- a/scrapy/utils/misc.py
+++ b/scrapy/utils/misc.py
@ -112,3 +112,8 @@ def md5sum(file):
            break
        m.update(d)
    return m.hexdigest()
+
+def rel_has_nofollow(rel):
+    """Return True if *rel* (a link's rel value, or None) has the nofollow type among its whitespace-separated, case-insensitive tokens."""
+    return rel is not None and 'nofollow' in rel.lower().split()
+    
--- a/tests/test_linkextractors.py
+++ b/tests/test_linkextractors.py
@ -96,12 +96,14 @@ class LinkExtractorTestCase(unittest.TestCase):
        html = """
        <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
        <a href="about.html">About us</a>
+        <a href="http://google.com/something" rel="external nofollow">Something</a>
        """
        response = HtmlResponse("http://example.org/page.html", body=html)
        lx = SgmlLinkExtractor()
        self.assertEqual([link for link in lx.extract_links(response)], [
            Link(url='http://example.org/page.html?action=print', text=u'Printer-friendly page', nofollow=True),
            Link(url='http://example.org/about.html', text=u'About us', nofollow=False),
+            Link(url='http://google.com/something', text=u'Something', nofollow=True),
        ])


@ -205,6 +207,9 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
        <div>
        <p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
        </div>
+        <div>
+        <p><a href="http://google.com/something" rel="external nofollow">External link not to follow</a></p>
+        </div>
        </body></html>"""
        response = HtmlResponse("http://example.org/somepage/index.html", body=html)

@ -214,6 +219,7 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
            Link(url='http://example.org/follow.html', text=u'Follow this link'),
            Link(url='http://example.org/nofollow.html', text=u'Dont follow this one', nofollow=True),
            Link(url='http://example.org/nofollow2.html', text=u'Choose to follow or not'),
+            Link(url='http://google.com/something', text=u'External link not to follow', nofollow=True),
        ])

    def test_matches(self):
@ -467,6 +473,9 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
    <div>
    <p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
    </div>
+    <div>
+    <p><a href="http://google.com/something" rel="external nofollow">External link not to follow</a></p>
+    </div>
 </body>
 </html>
        """
@ -478,7 +487,8 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
                         [Link(url='http://example.com/about.html', text=u'About us', fragment='', nofollow=False),
                          Link(url='http://example.com/follow.html', text=u'Follow this link', fragment='', nofollow=False),
                          Link(url='http://example.com/nofollow.html', text=u'Dont follow this one', fragment='', nofollow=True),
-                          Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False)]
+                          Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False),
+                          Link(url='http://google.com/something', text=u'External link not to follow', nofollow=True)]
                        )

        response = XmlResponse("http://example.com/index.xhtml", body=xhtml)
@ -488,7 +498,8 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
                         [Link(url='http://example.com/about.html', text=u'About us', fragment='', nofollow=False),
                          Link(url='http://example.com/follow.html', text=u'Follow this link', fragment='', nofollow=False),
                          Link(url='http://example.com/nofollow.html', text=u'Dont follow this one', fragment='', nofollow=True),
-                          Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False)]
+                          Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False),
+                          Link(url='http://google.com/something', text=u'External link not to follow', nofollow=True)]
                        )

    def test_link_wrong_href(self):