import re
import unittest
from warnings import catch_warnings

import pytest

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import HtmlResponse, XmlResponse
from scrapy.link import Link
from scrapy.linkextractors import FilteringLinkExtractor
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from tests import get_testdata


# a hack to skip base class tests in pytest
class Base:

    class LinkExtractorTestCase(unittest.TestCase):
        # Subclasses must set the concrete extractor class under test.
        extractor_cls = None
        # Whether the extractor percent-escapes whitespace in URLs.
        escapes_whitespace = False

        def setUp(self):
            body = get_testdata('link_extractor', 'sgml_linkextractor.html')
            self.response = HtmlResponse(url='http://example.com/index', body=body)

        def test_urls_type(self):
            ''' Test that the resulting urls are str objects '''
            lx = self.extractor_cls()
            self.assertTrue(all(isinstance(link.url, str)
                                for link in lx.extract_links(self.response)))

        def test_extract_all_links(self):
            lx = self.extractor_cls()
            if self.escapes_whitespace:
                page4_url = 'http://example.com/page%204.html'
            else:
                page4_url = 'http://example.com/page 4.html'

            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                Link(url='http://example.com/sample3.html#foo',
                     text='sample 3 repetition with fragment'),
                Link(url='http://www.google.com/something', text=u''),
                Link(url='http://example.com/innertag.html', text=u'inner tag'),
                Link(url=page4_url, text=u'href with whitespaces'),
            ])

        def test_extract_filter_allow(self):
            lx = self.extractor_cls(allow=('sample', ))
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                Link(url='http://example.com/sample3.html#foo',
                     text='sample 3 repetition with fragment'),
            ])

        def test_extract_filter_allow_with_duplicates(self):
            lx = self.extractor_cls(allow=('sample', ), unique=False)
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                Link(url='http://example.com/sample3.html',
                     text=u'sample 3 repetition'),
                Link(url='http://example.com/sample3.html#foo',
                     text='sample 3 repetition with fragment'),
            ])

        def test_extract_filter_allow_with_duplicates_canonicalize(self):
            lx = self.extractor_cls(allow=('sample', ), unique=False,
                                    canonicalize=True)
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                Link(url='http://example.com/sample3.html',
                     text=u'sample 3 repetition'),
                Link(url='http://example.com/sample3.html',
                     text='sample 3 repetition with fragment'),
            ])

        def test_extract_filter_allow_no_duplicates_canonicalize(self):
            lx = self.extractor_cls(allow=('sample',), unique=True,
                                    canonicalize=True)
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            ])

        def test_extract_filter_allow_and_deny(self):
            lx = self.extractor_cls(allow=('sample', ), deny=('3', ))
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
            ])

        def test_extract_filter_allowed_domains(self):
            lx = self.extractor_cls(allow_domains=('google.com', ))
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://www.google.com/something', text=u''),
            ])

        def test_extraction_using_single_values(self):
            '''Test the extractor's behaviour among different situations'''
            lx = self.extractor_cls(allow='sample')
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                Link(url='http://example.com/sample3.html#foo',
                     text='sample 3 repetition with fragment'),
            ])

            lx = self.extractor_cls(allow='sample', deny='3')
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
            ])

            lx = self.extractor_cls(allow_domains='google.com')
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://www.google.com/something', text=u''),
            ])

            lx = self.extractor_cls(deny_domains='example.com')
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://www.google.com/something', text=u''),
            ])

        def test_nofollow(self):
            '''Test the extractor's behaviour for links with rel="nofollow"'''
            html = b"""<html><head><title>Page title<title>
            <body>
            <div class='links'>
            <p><a href="/about.html">About us</a></p>
            </div>
            <div>
            <p><a href="/follow.html">Follow this link</a></p>
            </div>
            <div>
            <p><a href="/nofollow.html" rel="nofollow">Dont follow this one</a></p>
            </div>
            <div>
            <p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
            </div>
            <div>
            <p><a href="http://google.com/something" rel="external nofollow">External link not to follow</a></p>
            </div>
            </body></html>"""
            response = HtmlResponse("http://example.org/somepage/index.html", body=html)
            lx = self.extractor_cls()
            self.assertEqual(lx.extract_links(response), [
                Link(url='http://example.org/about.html', text=u'About us'),
                Link(url='http://example.org/follow.html', text=u'Follow this link'),
                Link(url='http://example.org/nofollow.html',
                     text=u'Dont follow this one', nofollow=True),
                Link(url='http://example.org/nofollow2.html',
                     text=u'Choose to follow or not'),
                Link(url='http://google.com/something',
                     text=u'External link not to follow', nofollow=True),
            ])

        def test_matches(self):
            url1 = 'http://lotsofstuff.com/stuff1/index'
            url2 = 'http://evenmorestuff.com/uglystuff/index'

            lx = self.extractor_cls(allow=(r'stuff1', ))
            self.assertEqual(lx.matches(url1), True)
            self.assertEqual(lx.matches(url2), False)

            lx = self.extractor_cls(deny=(r'uglystuff', ))
            self.assertEqual(lx.matches(url1), True)
            self.assertEqual(lx.matches(url2), False)

            lx = self.extractor_cls(allow_domains=('evenmorestuff.com', ))
            self.assertEqual(lx.matches(url1), False)
            self.assertEqual(lx.matches(url2), True)

            lx = self.extractor_cls(deny_domains=('lotsofstuff.com', ))
            self.assertEqual(lx.matches(url1), False)
            self.assertEqual(lx.matches(url2), True)

            lx = self.extractor_cls(allow=('blah1',), deny=('blah2',),
                                    allow_domains=('blah1.com',),
                                    deny_domains=('blah2.com',))
            self.assertEqual(lx.matches('http://blah1.com/blah1'), True)
            self.assertEqual(lx.matches('http://blah1.com/blah2'), False)
            self.assertEqual(lx.matches('http://blah2.com/blah1'), False)
            self.assertEqual(lx.matches('http://blah2.com/blah2'), False)

        def test_restrict_xpaths(self):
            lx = self.extractor_cls(restrict_xpaths=('//div[@id="subwrapper"]', ))
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
            ])

        def test_restrict_xpaths_encoding(self):
            """Test restrict_xpaths with encodings"""
            html = b"""<html><head><title>Page title<title>
            <body><p><a href="item/12.html">Item 12</a></p>
            <div class='links'>
            <p><a href="/about.html">About us\xa3</a></p>
            </div>
            <div>
            <p><a href="/nofollow.html">This shouldn't be followed</a></p>
            </div>
            </body></html>"""
            response = HtmlResponse("http://example.org/somepage/index.html",
                                    body=html, encoding='windows-1252')
            lx = self.extractor_cls(restrict_xpaths="//div[@class='links']")
            self.assertEqual(lx.extract_links(response),
                             [Link(url='http://example.org/about.html',
                                   text=u'About us\xa3')])

        def test_restrict_xpaths_with_html_entities(self):
            # entities must be resolved before the URL is percent-escaped
            html = b'<html><body><p><a href="/&hearts;/you?c=&euro;">text</a></p></body></html>'
            response = HtmlResponse("http://example.org/somepage/index.html",
                                    body=html, encoding='iso8859-15')
            links = self.extractor_cls(restrict_xpaths='//p').extract_links(response)
            self.assertEqual(links,
                             [Link(url='http://example.org/%E2%99%A5/you?c=%E2%82%AC',
                                   text=u'text')])

        def test_restrict_xpaths_concat_in_handle_data(self):
            """html entities cause SGMLParser to call handle_data hook twice"""
            body = b"""<html><body><div><a href="/foo">&gt;\xbe\xa9&lt;\xb6\xab</a></body></html>"""
            response = HtmlResponse("http://example.org", body=body, encoding='gb18030')
            lx = self.extractor_cls(restrict_xpaths="//div")
            self.assertEqual(lx.extract_links(response),
                             [Link(url='http://example.org/foo',
                                   text=u'>\u4eac<\u4e1c',
                                   fragment='', nofollow=False)])

        def test_restrict_css(self):
            lx = self.extractor_cls(restrict_css=('#subwrapper a',))
            self.assertEqual(lx.extract_links(self.response), [
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
            ])

        def test_restrict_css_and_restrict_xpaths_together(self):
            lx = self.extractor_cls(restrict_xpaths=('//div[@id="subwrapper"]', ),
                                    restrict_css=('#subwrapper + a', ))
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            ])

        def test_area_tag_with_unicode_present(self):
            body = b"""<html><body>\xbe\xa9<map><area href="http://example.org/foo" /></map></body></html>"""
            response = HtmlResponse("http://example.org", body=body, encoding='utf-8')
            lx = self.extractor_cls()
            # repeated extraction must be idempotent (no stale parser state)
            lx.extract_links(response)
            lx.extract_links(response)
            lx.extract_links(response)
            self.assertEqual(lx.extract_links(response),
                             [Link(url='http://example.org/foo', text=u'',
                                   fragment='', nofollow=False)])

        def test_encoded_url(self):
            body = b"""<html><body><div><a href="?page=2">BinB</a></body></html>"""
            response = HtmlResponse("http://known.fm/AC%2FDC/", body=body,
                                    encoding='utf8')
            lx = self.extractor_cls()
            self.assertEqual(lx.extract_links(response), [
                Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB',
                     fragment='', nofollow=False),
            ])

        def test_encoded_url_in_restricted_xpath(self):
            body = b"""<html><body><div><a href="?page=2">BinB</a></body></html>"""
            response = HtmlResponse("http://known.fm/AC%2FDC/", body=body,
                                    encoding='utf8')
            lx = self.extractor_cls(restrict_xpaths="//div")
            self.assertEqual(lx.extract_links(response), [
                Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB',
                     fragment='', nofollow=False),
            ])

        def test_ignored_extensions(self):
            # jpg is ignored by default
            html = b"""<a href="page.html">asd</a> and <a href="photo.jpg">"""
            response = HtmlResponse("http://example.org/", body=html)
            lx = self.extractor_cls()
            self.assertEqual(lx.extract_links(response), [
                Link(url='http://example.org/page.html', text=u'asd'),
            ])

            # override denied extensions
            lx = self.extractor_cls(deny_extensions=['html'])
            self.assertEqual(lx.extract_links(response), [
                Link(url='http://example.org/photo.jpg'),
            ])

        def test_process_value(self):
            """Test the process_value argument: extract URLs out of javascript: hrefs"""
            html = b"""
            <a href="javascript:goToPage('../other/page.html','photo','width=600,height=540,scrollbars'); return false">Link text</a>
            <a href="/about.html">About us</a>
            """
            response = HtmlResponse("http://example.org/somepage/index.html",
                                    body=html, encoding='windows-1252')

            def process_value(value):
                m = re.search(r"javascript:goToPage\('(.*?)'", value)
                if m:
                    return m.group(1)

            lx = self.extractor_cls(process_value=process_value)
            self.assertEqual(lx.extract_links(response),
                             [Link(url='http://example.org/other/page.html',
                                   text='Link text')])

        def test_base_url_with_restrict_xpaths(self):
            html = b"""<html><head><title>Page title<title><base href="http://otherdomain.com/base/" />
            <body><p><a href="item/12.html">Item 12</a></p>
            </body></html>"""
            response = HtmlResponse("http://example.org/somepage/index.html", body=html)
            lx = self.extractor_cls(restrict_xpaths="//p")
            self.assertEqual(lx.extract_links(response),
                             [Link(url='http://otherdomain.com/base/item/12.html',
                                   text='Item 12')])

        def test_attrs(self):
            lx = self.extractor_cls(attrs="href")
            if self.escapes_whitespace:
                page4_url = 'http://example.com/page%204.html'
            else:
                page4_url = 'http://example.com/page 4.html'

            self.assertEqual(lx.extract_links(self.response), [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                Link(url='http://example.com/sample3.html#foo',
                     text='sample 3 repetition with fragment'),
                Link(url='http://www.google.com/something', text=u''),
                Link(url='http://example.com/innertag.html', text=u'inner tag'),
                Link(url=page4_url, text=u'href with whitespaces'),
            ])

            lx = self.extractor_cls(attrs=("href", "src"),
                                    tags=("a", "area", "img"),
                                    deny_extensions=())
            self.assertEqual(lx.extract_links(self.response), [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample2.jpg', text=u''),
                Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                Link(url='http://example.com/sample3.html#foo',
                     text='sample 3 repetition with fragment'),
                Link(url='http://www.google.com/something', text=u''),
                Link(url='http://example.com/innertag.html', text=u'inner tag'),
                Link(url=page4_url, text=u'href with whitespaces'),
            ])

            lx = self.extractor_cls(attrs=None)
            self.assertEqual(lx.extract_links(self.response), [])

        def test_tags(self):
            html = b"""<html><area href="sample1.html"></area><a href="sample2.html">sample 2</a><img src="sample2.jpg"/></html>"""
            response = HtmlResponse("http://example.com/index.html", body=html)

            lx = self.extractor_cls(tags=None)
            self.assertEqual(lx.extract_links(response), [])

            lx = self.extractor_cls()
            self.assertEqual(lx.extract_links(response), [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
            ])

            lx = self.extractor_cls(tags="area")
            self.assertEqual(lx.extract_links(response), [
                Link(url='http://example.com/sample1.html', text=u''),
            ])

            lx = self.extractor_cls(tags="a")
            self.assertEqual(lx.extract_links(response), [
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
            ])

            lx = self.extractor_cls(tags=("a", "img"), attrs=("href", "src"),
                                    deny_extensions=())
            self.assertEqual(lx.extract_links(response), [
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample2.jpg', text=u''),
            ])

        def test_tags_attrs(self):
            html = b"""
            <html><body>
            <div id="item1" data-url="get?id=1"><a href="#">Item 1</a></div>
            <div id="item2" data-url="get?id=2"><a href="#">Item 2</a></div>
            </body></html>
            """
            response = HtmlResponse("http://example.com/index.html", body=html)

            lx = self.extractor_cls(tags='div', attrs='data-url')
            self.assertEqual(lx.extract_links(response), [
                Link(url='http://example.com/get?id=1', text=u'Item 1',
                     fragment='', nofollow=False),
                Link(url='http://example.com/get?id=2', text=u'Item 2',
                     fragment='', nofollow=False),
            ])

            lx = self.extractor_cls(tags=('div',), attrs=('data-url',))
            self.assertEqual(lx.extract_links(response), [
                Link(url='http://example.com/get?id=1', text=u'Item 1',
                     fragment='', nofollow=False),
                Link(url='http://example.com/get?id=2', text=u'Item 2',
                     fragment='', nofollow=False),
            ])

        def test_xhtml(self):
            xhtml = b"""<?xml version="1.0"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>XHTML document title</title>
</head>
<body>
<div class='links'>
<p><a href="/about.html">About us</a></p>
</div>
<div>
<p><a href="/follow.html">Follow this link</a></p>
</div>
<div>
<p><a href="/nofollow.html" rel="nofollow">Dont follow this one</a></p>
</div>
<div>
<p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
</div>
<div>
<p><a href="http://google.com/something" rel="external nofollow">External link not to follow</a></p>
</div>
</body>
</html>
"""

            response = HtmlResponse("http://example.com/index.xhtml", body=xhtml)
            lx = self.extractor_cls()
            self.assertEqual(
                lx.extract_links(response),
                [Link(url='http://example.com/about.html', text=u'About us',
                      fragment='', nofollow=False),
                 Link(url='http://example.com/follow.html', text=u'Follow this link',
                      fragment='', nofollow=False),
                 Link(url='http://example.com/nofollow.html',
                      text=u'Dont follow this one', fragment='', nofollow=True),
                 Link(url='http://example.com/nofollow2.html',
                      text=u'Choose to follow or not', fragment='', nofollow=False),
                 Link(url='http://google.com/something',
                      text=u'External link not to follow', nofollow=True)]
            )

            response = XmlResponse("http://example.com/index.xhtml", body=xhtml)
            lx = self.extractor_cls()
            self.assertEqual(
                lx.extract_links(response),
                [Link(url='http://example.com/about.html', text=u'About us',
                      fragment='', nofollow=False),
                 Link(url='http://example.com/follow.html', text=u'Follow this link',
                      fragment='', nofollow=False),
                 Link(url='http://example.com/nofollow.html',
                      text=u'Dont follow this one', fragment='', nofollow=True),
                 Link(url='http://example.com/nofollow2.html',
                      text=u'Choose to follow or not', fragment='', nofollow=False),
                 Link(url='http://google.com/something',
                      text=u'External link not to follow', nofollow=True)]
            )

        def test_link_wrong_href(self):
            # the second href is not a valid URL and must be skipped
            html = b"""
            <a href="http://example.org/item1.html">Item 1</a>
            <a href="http://[example.org/item2.html">Item 2</a>
            <a href="http://example.org/item3.html">Item 3</a>
            """
            response = HtmlResponse("http://example.org/index.html", body=html)
            lx = self.extractor_cls()
            self.assertEqual([link for link in lx.extract_links(response)], [
                Link(url='http://example.org/item1.html', text=u'Item 1',
                     nofollow=False),
                Link(url='http://example.org/item3.html', text=u'Item 3',
                     nofollow=False),
            ])

        def test_ftp_links(self):
            body = b"""
            <html><body>
            <div><a href="ftp://www.external.com/">An Item</a></div>
            </body></html>"""
            response = HtmlResponse("http://www.example.com/index.html",
                                    body=body, encoding='utf8')
            lx = self.extractor_cls()
            self.assertEqual(lx.extract_links(response), [
                Link(url='ftp://www.external.com/', text=u'An Item',
                     fragment='', nofollow=False),
            ])


class LxmlLinkExtractorTestCase(Base.LinkExtractorTestCase):
    extractor_cls = LxmlLinkExtractor

    def test_link_wrong_href(self):
        # the second href is not a valid URL and must be skipped
        html = b"""
        <a href="http://example.org/item1.html">Item 1</a>
        <a href="http://[example.org/item2.html">Item 2</a>
        <a href="http://example.org/item3.html">Item 3</a>
        """
        response = HtmlResponse("http://example.org/index.html", body=html)
        lx = self.extractor_cls()
        self.assertEqual([link for link in lx.extract_links(response)], [
            Link(url='http://example.org/item1.html', text=u'Item 1',
                 nofollow=False),
            Link(url='http://example.org/item3.html', text=u'Item 3',
                 nofollow=False),
        ])

    def test_link_restrict_text(self):
        html = b"""
        <a href="http://example.org/item1.html">Pic of a cat</a>
        <a href="http://example.org/item2.html">Pic of a dog</a>
        <a href="http://example.org/item3.html">Pic of a cow</a>
        """
        response = HtmlResponse("http://example.org/index.html", body=html)

        # Simple text inclusion test
        lx = self.extractor_cls(restrict_text='dog')
        self.assertEqual([link for link in lx.extract_links(response)], [
            Link(url='http://example.org/item2.html', text=u'Pic of a dog',
                 nofollow=False),
        ])

        # Unique regex test
        lx = self.extractor_cls(restrict_text=r'of.*dog')
        self.assertEqual([link for link in lx.extract_links(response)], [
            Link(url='http://example.org/item2.html', text=u'Pic of a dog',
                 nofollow=False),
        ])

        # Multiple regex test
        lx = self.extractor_cls(restrict_text=[r'of.*dog', r'of.*cat'])
        self.assertEqual([link for link in lx.extract_links(response)], [
            Link(url='http://example.org/item1.html', text=u'Pic of a cat',
                 nofollow=False),
            Link(url='http://example.org/item2.html', text=u'Pic of a dog',
                 nofollow=False),
        ])

    @pytest.mark.xfail
    def test_restrict_xpaths_with_html_entities(self):
        super(LxmlLinkExtractorTestCase, self).test_restrict_xpaths_with_html_entities()

    def test_filteringlinkextractor_deprecation_warning(self):
        """Make sure the FilteringLinkExtractor deprecation warning is not
        issued for LxmlLinkExtractor"""
        with catch_warnings(record=True) as warnings:
            LxmlLinkExtractor()
            self.assertEqual(len(warnings), 0)

            class SubclassedLxmlLinkExtractor(LxmlLinkExtractor):
                pass

            SubclassedLxmlLinkExtractor()
            self.assertEqual(len(warnings), 0)


class FilteringLinkExtractorTest(unittest.TestCase):

    def test_deprecation_warning(self):
        # FilteringLinkExtractor takes 10 positional arguments; their values
        # are irrelevant here, only the warning on instantiation matters.
        args = [None] * 10
        with catch_warnings(record=True) as warnings:
            FilteringLinkExtractor(*args)
            self.assertEqual(len(warnings), 1)
            self.assertEqual(warnings[0].category, ScrapyDeprecationWarning)

        with catch_warnings(record=True) as warnings:
            class SubclassedFilteringLinkExtractor(FilteringLinkExtractor):
                pass

            SubclassedFilteringLinkExtractor(*args)
            self.assertEqual(len(warnings), 1)
            self.assertEqual(warnings[0].category, ScrapyDeprecationWarning)