mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-24 19:44:33 +00:00
Merge pull request #1214 from rgtk/link-rel
[MRG+1] Support link rel attribute with multiple values
This commit is contained in:
commit
aa31811cfd
@ -9,7 +9,7 @@ import lxml.etree as etree
|
||||
|
||||
from scrapy.selector import Selector
|
||||
from scrapy.link import Link
|
||||
from scrapy.utils.misc import arg_to_iter
|
||||
from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
|
||||
from scrapy.utils.python import unique as unique_list
|
||||
from scrapy.linkextractors import FilteringLinkExtractor
|
||||
from scrapy.utils.response import get_base_url
|
||||
@ -62,7 +62,7 @@ class LxmlParserLinkExtractor(object):
|
||||
# to fix relative links after process_value
|
||||
url = urljoin(response_url, url)
|
||||
link = Link(url, _collect_string_content(el) or u'',
|
||||
nofollow=True if el.get('rel') == 'nofollow' else False)
|
||||
nofollow=rel_has_nofollow(el.get('rel')))
|
||||
links.append(link)
|
||||
|
||||
return unique_list(links, key=lambda link: link.url) \
|
||||
|
@ -9,7 +9,7 @@ from w3lib.url import safe_url_string
|
||||
from scrapy.selector import Selector
|
||||
from scrapy.link import Link
|
||||
from scrapy.linkextractors import FilteringLinkExtractor
|
||||
from scrapy.utils.misc import arg_to_iter
|
||||
from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
|
||||
from scrapy.utils.python import unique as unique_list, to_unicode
|
||||
from scrapy.utils.response import get_base_url
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
@ -80,7 +80,7 @@ class BaseSgmlLinkExtractor(SGMLParser):
|
||||
if self.scan_attr(attr):
|
||||
url = self.process_value(value)
|
||||
if url is not None:
|
||||
link = Link(url=url, nofollow=True if dict(attrs).get('rel') == 'nofollow' else False)
|
||||
link = Link(url=url, nofollow=rel_has_nofollow(dict(attrs).get('rel')))
|
||||
self.links.append(link)
|
||||
self.current_link = link
|
||||
|
||||
|
@ -112,3 +112,8 @@ def md5sum(file):
|
||||
break
|
||||
m.update(d)
|
||||
return m.hexdigest()
|
||||
|
||||
def rel_has_nofollow(rel):
    """Return True if a link's ``rel`` attribute contains the nofollow type.

    ``rel`` is the raw attribute value, or None when the attribute is
    absent.  The value is treated as a whitespace-separated list of link
    types, so ``"external nofollow"`` matches while ``"nofollowx"`` does
    not.  Link types are compared case-insensitively, so ``"NoFollow"``
    also matches.
    """
    if rel is None:
        return False
    # Lower-case before tokenizing: HTML link-type keywords are
    # ASCII case-insensitive.
    return 'nofollow' in rel.lower().split()
|
||||
|
||||
|
@ -96,12 +96,14 @@ class LinkExtractorTestCase(unittest.TestCase):
|
||||
html = """
|
||||
<a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
|
||||
<a href="about.html">About us</a>
|
||||
<a href="http://google.com/something" rel="external nofollow">Something</a>
|
||||
"""
|
||||
response = HtmlResponse("http://example.org/page.html", body=html)
|
||||
lx = SgmlLinkExtractor()
|
||||
self.assertEqual([link for link in lx.extract_links(response)], [
|
||||
Link(url='http://example.org/page.html?action=print', text=u'Printer-friendly page', nofollow=True),
|
||||
Link(url='http://example.org/about.html', text=u'About us', nofollow=False),
|
||||
Link(url='http://google.com/something', text=u'Something', nofollow=True),
|
||||
])
|
||||
|
||||
|
||||
@ -205,6 +207,9 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
|
||||
<div>
|
||||
<p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
|
||||
</div>
|
||||
<div>
|
||||
<p><a href="http://google.com/something" rel="external nofollow">External link not to follow</a></p>
|
||||
</div>
|
||||
</body></html>"""
|
||||
response = HtmlResponse("http://example.org/somepage/index.html", body=html)
|
||||
|
||||
@ -214,6 +219,7 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
|
||||
Link(url='http://example.org/follow.html', text=u'Follow this link'),
|
||||
Link(url='http://example.org/nofollow.html', text=u'Dont follow this one', nofollow=True),
|
||||
Link(url='http://example.org/nofollow2.html', text=u'Choose to follow or not'),
|
||||
Link(url='http://google.com/something', text=u'External link not to follow', nofollow=True),
|
||||
])
|
||||
|
||||
def test_matches(self):
|
||||
@ -467,6 +473,9 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
|
||||
<div>
|
||||
<p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
|
||||
</div>
|
||||
<div>
|
||||
<p><a href="http://google.com/something" rel="external nofollow">External link not to follow</a></p>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
@ -478,7 +487,8 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
|
||||
[Link(url='http://example.com/about.html', text=u'About us', fragment='', nofollow=False),
|
||||
Link(url='http://example.com/follow.html', text=u'Follow this link', fragment='', nofollow=False),
|
||||
Link(url='http://example.com/nofollow.html', text=u'Dont follow this one', fragment='', nofollow=True),
|
||||
Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False)]
|
||||
Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False),
|
||||
Link(url='http://google.com/something', text=u'External link not to follow', nofollow=True)]
|
||||
)
|
||||
|
||||
response = XmlResponse("http://example.com/index.xhtml", body=xhtml)
|
||||
@ -488,7 +498,8 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
|
||||
[Link(url='http://example.com/about.html', text=u'About us', fragment='', nofollow=False),
|
||||
Link(url='http://example.com/follow.html', text=u'Follow this link', fragment='', nofollow=False),
|
||||
Link(url='http://example.com/nofollow.html', text=u'Dont follow this one', fragment='', nofollow=True),
|
||||
Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False)]
|
||||
Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False),
|
||||
Link(url='http://google.com/something', text=u'External link not to follow', nofollow=True)]
|
||||
)
|
||||
|
||||
def test_link_wrong_href(self):
|
||||
|
Loading…
x
Reference in New Issue
Block a user