1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 19:44:33 +00:00

Merge pull request #1214 from rgtk/link-rel

[MRG+1] Support link rel attribute with multiple values
This commit is contained in:
Julia Medina 2015-08-27 19:57:54 -03:00
commit aa31811cfd
4 changed files with 22 additions and 6 deletions

View File

@ -9,7 +9,7 @@ import lxml.etree as etree
from scrapy.selector import Selector
from scrapy.link import Link
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
from scrapy.utils.python import unique as unique_list
from scrapy.linkextractors import FilteringLinkExtractor
from scrapy.utils.response import get_base_url
@ -62,7 +62,7 @@ class LxmlParserLinkExtractor(object):
# to fix relative links after process_value
url = urljoin(response_url, url)
link = Link(url, _collect_string_content(el) or u'',
nofollow=True if el.get('rel') == 'nofollow' else False)
nofollow=rel_has_nofollow(el.get('rel')))
links.append(link)
return unique_list(links, key=lambda link: link.url) \

View File

@ -9,7 +9,7 @@ from w3lib.url import safe_url_string
from scrapy.selector import Selector
from scrapy.link import Link
from scrapy.linkextractors import FilteringLinkExtractor
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
from scrapy.utils.python import unique as unique_list, to_unicode
from scrapy.utils.response import get_base_url
from scrapy.exceptions import ScrapyDeprecationWarning
@ -80,7 +80,7 @@ class BaseSgmlLinkExtractor(SGMLParser):
if self.scan_attr(attr):
url = self.process_value(value)
if url is not None:
link = Link(url=url, nofollow=True if dict(attrs).get('rel') == 'nofollow' else False)
link = Link(url=url, nofollow=rel_has_nofollow(dict(attrs).get('rel')))
self.links.append(link)
self.current_link = link

View File

@ -112,3 +112,8 @@ def md5sum(file):
break
m.update(d)
return m.hexdigest()
def rel_has_nofollow(rel):
    """Return True if the link ``rel`` attribute contains the nofollow type.

    ``rel`` is the raw attribute value, or None when the attribute is absent.
    The value is treated as a whitespace-separated list of link types, so
    ``"external nofollow"`` matches while ``"nofollowed"`` does not.
    """
    if rel is None:
        return False
    # HTML link type keywords are ASCII case-insensitive, so normalise the
    # value before looking for the token (e.g. rel="NoFollow").
    return 'nofollow' in rel.lower().split()

View File

@ -96,12 +96,14 @@ class LinkExtractorTestCase(unittest.TestCase):
html = """
<a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
<a href="about.html">About us</a>
<a href="http://google.com/something" rel="external nofollow">Something</a>
"""
response = HtmlResponse("http://example.org/page.html", body=html)
lx = SgmlLinkExtractor()
self.assertEqual([link for link in lx.extract_links(response)], [
Link(url='http://example.org/page.html?action=print', text=u'Printer-friendly page', nofollow=True),
Link(url='http://example.org/about.html', text=u'About us', nofollow=False),
Link(url='http://google.com/something', text=u'Something', nofollow=True),
])
@ -205,6 +207,9 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
<div>
<p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
</div>
<div>
<p><a href="http://google.com/something" rel="external nofollow">External link not to follow</a></p>
</div>
</body></html>"""
response = HtmlResponse("http://example.org/somepage/index.html", body=html)
@ -214,6 +219,7 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
Link(url='http://example.org/follow.html', text=u'Follow this link'),
Link(url='http://example.org/nofollow.html', text=u'Dont follow this one', nofollow=True),
Link(url='http://example.org/nofollow2.html', text=u'Choose to follow or not'),
Link(url='http://google.com/something', text=u'External link not to follow', nofollow=True),
])
def test_matches(self):
@ -467,6 +473,9 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
<div>
<p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
</div>
<div>
<p><a href="http://google.com/something" rel="external nofollow">External link not to follow</a></p>
</div>
</body>
</html>
"""
@ -478,7 +487,8 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
[Link(url='http://example.com/about.html', text=u'About us', fragment='', nofollow=False),
Link(url='http://example.com/follow.html', text=u'Follow this link', fragment='', nofollow=False),
Link(url='http://example.com/nofollow.html', text=u'Dont follow this one', fragment='', nofollow=True),
Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False)]
Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False),
Link(url='http://google.com/something', text=u'External link not to follow', nofollow=True)]
)
response = XmlResponse("http://example.com/index.xhtml", body=xhtml)
@ -488,7 +498,8 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
[Link(url='http://example.com/about.html', text=u'About us', fragment='', nofollow=False),
Link(url='http://example.com/follow.html', text=u'Follow this link', fragment='', nofollow=False),
Link(url='http://example.com/nofollow.html', text=u'Dont follow this one', fragment='', nofollow=True),
Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False)]
Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False),
Link(url='http://google.com/something', text=u'External link not to follow', nofollow=True)]
)
def test_link_wrong_href(self):