import re
import unittest

import pytest

from scrapy.http import HtmlResponse, XmlResponse
from scrapy.link import Link
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from tests import get_testdata


# a hack to skip base class tests in pytest
class Base:

    class LinkExtractorTestCase(unittest.TestCase):
        extractor_cls = None

        def setUp(self):
            body = get_testdata('link_extractor', 'sgml_linkextractor.html')
            self.response = HtmlResponse(url='http://example.com/index', body=body)

        def test_urls_type(self):
            '''Test that the resulting urls are str objects'''
            lx = self.extractor_cls()
            self.assertTrue(all(isinstance(link.url, str)
                                for link in lx.extract_links(self.response)))

        def test_extract_all_links(self):
            lx = self.extractor_cls()
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                Link(url='http://www.google.com/something', text=u''),
                Link(url='http://example.com/innertag.html', text=u'inner tag'),
            ])

        def test_extract_filter_allow(self):
            lx = self.extractor_cls(allow=('sample', ))
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            ])

        def test_extract_filter_allow_with_duplicates(self):
            lx = self.extractor_cls(allow=('sample', ), unique=False)
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 repetition'),
            ])

        def test_extract_filter_allow_and_deny(self):
            lx = self.extractor_cls(allow=('sample', ), deny=('3', ))
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
            ])

        def test_extract_filter_allowed_domains(self):
            lx = self.extractor_cls(allow_domains=('google.com', ))
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://www.google.com/something', text=u''),
            ])

        def test_extraction_using_single_values(self):
            '''Test the extractor's behaviour among different situations'''

            lx = self.extractor_cls(allow='sample')
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
                Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            ])

            lx = self.extractor_cls(allow='sample', deny='3')
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://example.com/sample1.html', text=u''),
                Link(url='http://example.com/sample2.html', text=u'sample 2'),
            ])

            lx = self.extractor_cls(allow_domains='google.com')
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://www.google.com/something', text=u''),
            ])

            lx = self.extractor_cls(deny_domains='example.com')
            self.assertEqual([link for link in lx.extract_links(self.response)], [
                Link(url='http://www.google.com/something', text=u''),
            ])

        def test_nofollow(self):
            '''Test the extractor's behaviour for links with rel="nofollow"'''
            html = b"""