import re
import unittest
from warnings import catch_warnings

import pytest

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import HtmlResponse, XmlResponse
from scrapy.link import Link
from scrapy.linkextractors import FilteringLinkExtractor
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from tests import get_testdata


# a hack to skip base class tests in pytest
class Base:
    class LinkExtractorTestCase(unittest.TestCase):
        # Concrete subclasses must set this to the LinkExtractor class under test.
        extractor_cls = None
        # Whether the extractor percent-escapes whitespace in hrefs
        # (affects the expected URL for "page 4.html" below).
        escapes_whitespace = False

        def setUp(self):
            # Shared fixture: an HtmlResponse built from the canned
            # sgml_linkextractor.html test page.
            body = get_testdata('link_extractor', 'sgml_linkextractor.html')
            self.response = HtmlResponse(url='http://example.com/index', body=body)

        def test_urls_type(self):
            ''' Test that the resulting urls are str objects '''
            lx = self.extractor_cls()
            self.assertTrue(all(isinstance(link.url, str)
                                for link in lx.extract_links(self.response)))

        def test_extract_all_links(self):
            # With no filters, every link on the fixture page is returned
            # (duplicates collapsed by default uniqueness).
            lx = self.extractor_cls()
            if self.escapes_whitespace:
                # e.g. lxml-based extractors escape the space in the href
                page4_url = 'http://example.com/page%204.html'
            else:
                page4_url = 'http://example.com/page 4.html'
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                Link(url='http://example.com/sample3.html#foo',
                     text='sample 3 repetition with fragment'),
                Link(url='http://www.google.com/something', text=u''),
                Link(url='http://example.com/innertag.html', text=u'inner tag'),
                Link(url=page4_url, text=u'href with whitespaces'),
            ])

        def test_extract_filter_allow(self):
            # allow= keeps only URLs matching the given pattern(s).
            lx = self.extractor_cls(allow=('sample', ))
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                Link(url='http://example.com/sample3.html#foo',
                     text='sample 3 repetition with fragment')
            ])

        def test_extract_filter_allow_with_duplicates(self):
            # unique=False keeps repeated occurrences of the same URL.
            lx = self.extractor_cls(allow=('sample', ), unique=False)
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 repetition'),
                Link(url='http://example.com/sample3.html#foo',
                     text='sample 3 repetition with fragment')
            ])

        def test_extract_filter_allow_with_duplicates_canonicalize(self):
            # canonicalize=True strips the fragment, so the "#foo" link is
            # reported with the canonical (fragment-free) URL.
            lx = self.extractor_cls(allow=('sample', ), unique=False, canonicalize=True)
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 repetition'),
                Link(url='http://example.com/sample3.html',
                     text='sample 3 repetition with fragment')
            ])

        def test_extract_filter_allow_no_duplicates_canonicalize(self):
            # unique=True + canonicalize=True: canonicalized duplicates
            # (including the fragment variant) collapse to a single link.
            lx = self.extractor_cls(allow=('sample',), unique=True, canonicalize=True)
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            ])

        def test_extract_filter_allow_and_deny(self):
            # deny= is applied after allow=: "sample3" links are excluded.
            lx = self.extractor_cls(allow=('sample', ), deny=('3', ))
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
            ])

        def test_extract_filter_allowed_domains(self):
            # allow_domains= restricts results to the given domain(s).
            lx = self.extractor_cls(allow_domains=('google.com', ))
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://www.google.com/something', text=u''),
            ])

        def test_extraction_using_single_values(self):
            '''Test the extractor's behaviour among different situations'''
            # allow/deny/allow_domains/deny_domains accept a single string
            # as well as a tuple of strings.
            lx = self.extractor_cls(allow='sample')
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                Link(url='http://example.com/sample3.html#foo',
                     text='sample 3 repetition with fragment')
            ])

            lx = self.extractor_cls(allow='sample', deny='3')
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
            ])

            lx = self.extractor_cls(allow_domains='google.com')
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://www.google.com/something', text=u''),
            ])

            lx = self.extractor_cls(deny_domains='example.com')
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://www.google.com/something', text=u''),
            ])

        def test_nofollow(self):
            '''Test the extractor's behaviour for links with rel="nofollow"'''
            html = b"""