2009-05-04 13:43:37 +00:00
|
|
|
import re
|
2008-06-28 23:37:28 +00:00
|
|
|
import unittest
|
2014-01-11 15:12:54 +06:00
|
|
|
from scrapy.contrib.linkextractors.regex import RegexLinkExtractor
|
2014-07-10 13:04:21 +02:00
|
|
|
from scrapy.http import HtmlResponse, XmlResponse
|
2009-05-18 19:19:37 -03:00
|
|
|
from scrapy.link import Link
|
2014-01-11 14:30:27 +06:00
|
|
|
from scrapy.contrib.linkextractors.htmlparser import HtmlParserLinkExtractor
|
2009-05-18 19:19:37 -03:00
|
|
|
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor, BaseSgmlLinkExtractor
|
2014-01-23 17:22:24 +01:00
|
|
|
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
|
2014-07-30 16:53:28 -03:00
|
|
|
from tests import get_testdata
|
2008-06-28 23:37:28 +00:00
|
|
|
|
2013-01-29 18:08:32 -02:00
|
|
|
|
2008-06-28 23:37:28 +00:00
|
|
|
class LinkExtractorTestCase(unittest.TestCase):
    """Checks of ``BaseSgmlLinkExtractor`` (plus one ``SgmlLinkExtractor`` case) on small documents."""

    def test_basic(self):
        """Anchor hrefs are resolved against the response URL; the <img> is not reported."""
        markup = """<html><head><title>Page title<title>
<body><p><a href="item/12.html">Item 12</a></p>
<p><a href="/about.html">About us</a></p>
<img src="/logo.png" alt="Company logo (not a link)" />
<p><a href="../othercat.html">Other category</a></p>
<p><a href="/">>></a></p>
<p><a href="/" /></p>
</body></html>"""
        page = HtmlResponse("http://example.org/somepage/index.html", body=markup)

        extractor = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
        expected = [
            Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
            Link(url='http://example.org/about.html', text='About us'),
            Link(url='http://example.org/othercat.html', text='Other category'),
            Link(url='http://example.org/', text='>>'),
            Link(url='http://example.org/', text=''),
        ]
        self.assertEqual(extractor.extract_links(page), expected)

    def test_base_url(self):
        """A <base href> replaces the response URL as the root for relative links."""
        extractor = BaseSgmlLinkExtractor()  # default: tag=a, attr=href

        # fully qualified base url on another domain
        markup = """<html><head><title>Page title<title><base href="http://otherdomain.com/base/" />
<body><p><a href="item/12.html">Item 12</a></p>
</body></html>"""
        page = HtmlResponse("http://example.org/somepage/index.html", body=markup)
        expected = [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')]
        self.assertEqual(extractor.extract_links(page), expected)

        # base url is an absolute path and relative to host
        markup = """<html><head><title>Page title<title><base href="/" />
<body><p><a href="item/12.html">Item 12</a></p></body></html>"""
        page = HtmlResponse("https://example.org/somepage/index.html", body=markup)
        expected = [Link(url='https://example.org/item/12.html', text='Item 12')]
        self.assertEqual(extractor.extract_links(page), expected)

        # base url has no scheme
        markup = """<html><head><title>Page title<title><base href="//noschemedomain.com/path/to/" />
<body><p><a href="item/12.html">Item 12</a></p></body></html>"""
        page = HtmlResponse("https://example.org/somepage/index.html", body=markup)
        expected = [Link(url='https://noschemedomain.com/path/to/item/12.html', text='Item 12')]
        self.assertEqual(extractor.extract_links(page), expected)

    def test_link_text_wrong_encoding(self):
        """A byte that is invalid for the declared encoding shows up as U+FFFD in the link text."""
        markup = """<body><p><a href="item/12.html">Wrong: \xed</a></p></body></html>"""
        page = HtmlResponse("http://www.example.com", body=markup, encoding='utf-8')
        extractor = BaseSgmlLinkExtractor()
        expected = [
            Link(url='http://www.example.com/item/12.html', text=u'Wrong: \ufffd'),
        ]
        self.assertEqual(extractor.extract_links(page), expected)

    def test_extraction_encoding(self):
        """The no-encoding fixture gives the same links with or without a charset header;
        the latin-1 fixture gives latin-1 percent-escapes in its URLs."""
        raw = get_testdata('link_extractor', 'linkextractor_noenc.html')
        utf8_page = HtmlResponse(url='http://example.com/utf8', body=raw,
                                 headers={'Content-Type': ['text/html; charset=utf-8']})
        noenc_page = HtmlResponse(url='http://example.com/noenc', body=raw)
        raw = get_testdata('link_extractor', 'linkextractor_latin1.html')
        latin1_page = HtmlResponse(url='http://example.com/latin1', body=raw)

        extractor = BaseSgmlLinkExtractor()

        utf8_links = [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')),
        ]
        self.assertEqual(extractor.extract_links(utf8_page), utf8_links)

        noenc_links = [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')),
        ]
        self.assertEqual(extractor.extract_links(noenc_page), noenc_links)

        latin1_links = [
            Link(url='http://example.com/sample_%F1.html', text=''),
            Link(url='http://example.com/sample_%E1.html', text='sample \xe1 text'.decode('latin1')),
        ]
        self.assertEqual(extractor.extract_links(latin1_page), latin1_links)

    def test_matches(self):
        """An extractor built without arguments reports both sample URLs as matching."""
        first_url = 'http://lotsofstuff.com/stuff1/index'
        second_url = 'http://evenmorestuff.com/uglystuff/index'

        extractor = BaseSgmlLinkExtractor()
        self.assertEqual(extractor.matches(first_url), True)
        self.assertEqual(extractor.matches(second_url), True)

    def test_link_nofollow(self):
        """rel="nofollow" on an anchor is reflected in ``Link.nofollow``."""
        markup = """
<a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
<a href="about.html">About us</a>
"""
        page = HtmlResponse("http://example.org/page.html", body=markup)
        extractor = SgmlLinkExtractor()
        expected = [
            Link(url='http://example.org/page.html?action=print', text=u'Printer-friendly page', nofollow=True),
            Link(url='http://example.org/about.html', text=u'About us', nofollow=False),
        ]
        self.assertEqual(list(extractor.extract_links(page)), expected)
|
|
|
|
|
2012-11-10 16:25:41 +01:00
|
|
|
|
2009-05-18 19:19:37 -03:00
|
|
|
class SgmlLinkExtractorTestCase(unittest.TestCase):
    """Link-extraction tests parameterised by ``extractor_cls``.

    Subclasses override ``extractor_cls`` to run the same tests against a
    different extractor implementation.
    """

    # Extractor class under test; tests build instances via self.extractor_cls(...).
    extractor_cls = SgmlLinkExtractor

    def setUp(self):
        """Load the shared HTML fixture into ``self.response``."""
        body = get_testdata('link_extractor', 'sgml_linkextractor.html')
        self.response = HtmlResponse(url='http://example.com/index', body=body)

    def test_urls_type(self):
        """Test that the resulting urls are regular strings and not unicode objects"""
        lx = self.extractor_cls()
        self.assertTrue(all(isinstance(link.url, str) for link in lx.extract_links(self.response)))

    def test_extraction(self):
        """Test the extractor's behaviour with allow/deny/unique/allow_domains given as tuples"""
        # no filters: every link of the fixture, duplicates removed
        lx = self.extractor_cls()
        self.assertEqual(list(lx.extract_links(self.response)), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = self.extractor_cls(allow=('sample', ))
        self.assertEqual(list(lx.extract_links(self.response)), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        ])

        # unique=False keeps the second anchor pointing at sample3.html
        lx = self.extractor_cls(allow=('sample', ), unique=False)
        self.assertEqual(list(lx.extract_links(self.response)), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 repetition'),
        ])

        lx = self.extractor_cls(allow=('sample', ))
        self.assertEqual(list(lx.extract_links(self.response)), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        ])

        lx = self.extractor_cls(allow=('sample', ), deny=('3', ))
        self.assertEqual(list(lx.extract_links(self.response)), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ])

        lx = self.extractor_cls(allow_domains=('google.com', ))
        self.assertEqual(list(lx.extract_links(self.response)), [
            Link(url='http://www.google.com/something', text=u''),
        ])

    def test_extraction_using_single_values(self):
        """Test that allow/deny/allow_domains/deny_domains also accept a single string"""
        lx = self.extractor_cls(allow='sample')
        self.assertEqual(list(lx.extract_links(self.response)), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        ])

        lx = self.extractor_cls(allow='sample', deny='3')
        self.assertEqual(list(lx.extract_links(self.response)), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ])

        lx = self.extractor_cls(allow_domains='google.com')
        self.assertEqual(list(lx.extract_links(self.response)), [
            Link(url='http://www.google.com/something', text=u''),
        ])

        lx = self.extractor_cls(deny_domains='example.com')
        self.assertEqual(list(lx.extract_links(self.response)), [
            Link(url='http://www.google.com/something', text=u''),
        ])

    def test_nofollow(self):
        """Test the extractor's behaviour for links with rel="nofollow" """
        html = """<html><head><title>Page title<title>
<body>
<div class='links'>
<p><a href="/about.html">About us</a></p>
</div>
<div>
<p><a href="/follow.html">Follow this link</a></p>
</div>
<div>
<p><a href="/nofollow.html" rel="nofollow">Dont follow this one</a></p>
</div>
<div>
<p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
</div>
</body></html>"""
        response = HtmlResponse("http://example.org/somepage/index.html", body=html)

        lx = self.extractor_cls()
        # only rel="nofollow" sets the flag; rel="blah" leaves it at its default
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.org/about.html', text=u'About us'),
            Link(url='http://example.org/follow.html', text=u'Follow this link'),
            Link(url='http://example.org/nofollow.html', text=u'Dont follow this one', nofollow=True),
            Link(url='http://example.org/nofollow2.html', text=u'Choose to follow or not'),
        ])

    def test_matches(self):
        """Test ``matches()`` against allow/deny patterns and allow/deny domains"""
        url1 = 'http://lotsofstuff.com/stuff1/index'
        url2 = 'http://evenmorestuff.com/uglystuff/index'

        lx = self.extractor_cls(allow=(r'stuff1', ))
        self.assertEqual(lx.matches(url1), True)
        self.assertEqual(lx.matches(url2), False)

        lx = self.extractor_cls(deny=(r'uglystuff', ))
        self.assertEqual(lx.matches(url1), True)
        self.assertEqual(lx.matches(url2), False)

        lx = self.extractor_cls(allow_domains=('evenmorestuff.com', ))
        self.assertEqual(lx.matches(url1), False)
        self.assertEqual(lx.matches(url2), True)

        lx = self.extractor_cls(deny_domains=('lotsofstuff.com', ))
        self.assertEqual(lx.matches(url1), False)
        self.assertEqual(lx.matches(url2), True)

        # all four filters combined: only the allowed pattern on the allowed domain matches
        lx = self.extractor_cls(allow=('blah1',), deny=('blah2',),
                                allow_domains=('blah1.com',),
                                deny_domains=('blah2.com',))
        self.assertEqual(lx.matches('http://blah1.com/blah1'), True)
        self.assertEqual(lx.matches('http://blah1.com/blah2'), False)
        self.assertEqual(lx.matches('http://blah2.com/blah1'), False)
        self.assertEqual(lx.matches('http://blah2.com/blah2'), False)

    def test_restrict_xpaths(self):
        """Only links inside the region selected by restrict_xpaths are reported"""
        lx = self.extractor_cls(restrict_xpaths=('//div[@id="subwrapper"]', ))
        self.assertEqual(list(lx.extract_links(self.response)), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ])

    def test_restrict_xpaths_encoding(self):
        """Test restrict_xpaths with encodings"""
        html = """<html><head><title>Page title<title>
<body><p><a href="item/12.html">Item 12</a></p>
<div class='links'>
<p><a href="/about.html">About us\xa3</a></p>
</div>
<div>
<p><a href="/nofollow.html">This shouldn't be followed</a></p>
</div>
</body></html>"""
        response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='windows-1252')

        lx = self.extractor_cls(restrict_xpaths="//div[@class='links']")
        self.assertEqual(lx.extract_links(response),
                         [Link(url='http://example.org/about.html', text=u'About us\xa3')])

    def test_restrict_xpaths_with_html_entities(self):
        """Entities in an href end up as UTF-8 percent-escapes in the extracted URL"""
        # The entities are written out as entities: the test is about entity
        # handling, and literal non-ASCII characters are not valid in a Python 2
        # byte string without a source-encoding declaration.
        html = '<html><body><p><a href="/&hearts;/you?c=&euro;">text</a></p></body></html>'
        response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='iso8859-15')
        # NOTE(review): this test hard-codes SgmlLinkExtractor instead of
        # self.extractor_cls, so subclasses re-run it against the Sgml extractor.
        # Left as is because other extractors' handling of this input is unverified.
        links = SgmlLinkExtractor(restrict_xpaths='//p').extract_links(response)
        self.assertEqual(links,
                         [Link(url='http://example.org/%E2%99%A5/you?c=%E2%82%AC', text=u'text')])

    def test_restrict_xpaths_concat_in_handle_data(self):
        """html entities cause SGMLParser to call handle_data hook twice"""
        # &gt; / &lt; are spelled as entities so the link text really contains
        # the entities the docstring refers to; the expected text is unchanged.
        body = """<html><body><div><a href="/foo">&gt;\xbe\xa9&lt;\xb6\xab</a></body></html>"""
        response = HtmlResponse("http://example.org", body=body, encoding='gb18030')
        lx = self.extractor_cls(restrict_xpaths="//div")
        self.assertEqual(lx.extract_links(response),
                         [Link(url='http://example.org/foo', text=u'>\u4eac<\u4e1c',
                               fragment='', nofollow=False)])

    def test_restrict_css(self):
        """Only links selected by the restrict_css selector are reported"""
        lx = self.extractor_cls(restrict_css=('#subwrapper a',))
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample2.html', text=u'sample 2')
        ])

    def test_restrict_css_and_restrict_xpaths_together(self):
        """Links from the restrict_xpaths region and the restrict_css selection are both reported"""
        lx = self.extractor_cls(restrict_xpaths=('//div[@id="subwrapper"]', ),
                                restrict_css=('#subwrapper + a', ))
        self.assertEqual(list(lx.extract_links(self.response)), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        ])

    def test_area_tag_with_unicode_present(self):
        """An <area> link is extracted from a body that also contains non-ASCII bytes"""
        body = """<html><body>\xbe\xa9<map><area href="http://example.org/foo" /></map></body></html>"""
        response = HtmlResponse("http://example.org", body=body, encoding='utf-8')
        lx = self.extractor_cls()
        # NOTE(review): the same extractor is run three times before the asserted
        # call -- presumably to catch state leaking between calls; confirm.
        for _ in range(3):
            lx.extract_links(response)
        self.assertEqual(lx.extract_links(response),
                         [Link(url='http://example.org/foo', text=u'',
                               fragment='', nofollow=False)])

    def test_encoded_url(self):
        """A percent-escaped slash in the response URL is kept when resolving a relative link"""
        body = """<html><body><div><a href="?page=2">BinB</a></body></html>"""
        response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
        lx = self.extractor_cls()
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB', fragment='', nofollow=False),
        ])

    def test_encoded_url_in_restricted_xpath(self):
        """Same as test_encoded_url, with extraction limited by restrict_xpaths"""
        body = """<html><body><div><a href="?page=2">BinB</a></body></html>"""
        response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
        lx = self.extractor_cls(restrict_xpaths="//div")
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB', fragment='', nofollow=False),
        ])

    def test_deny_extensions(self):
        """The .jpg link is dropped by default and with an explicit deny_extensions"""
        html = """<a href="page.html">asd</a> and <a href="photo.jpg">"""
        response = HtmlResponse("http://example.org/", body=html)
        lx = self.extractor_cls()
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.org/page.html', text=u'asd'),
        ])

        # use the class under test (not a hard-coded extractor) so subclasses
        # exercise their own implementation
        lx = self.extractor_cls(deny_extensions="jpg")
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.org/page.html', text=u'asd'),
        ])

    def test_process_value(self):
        """Test that process_value can rewrite an attribute value before it becomes a link"""
        html = """
<a href="javascript:goToPage('../other/page.html','photo','width=600,height=540,scrollbars'); return false">Link text</a>
<a href="/about.html">About us</a>
"""
        response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='windows-1252')

        def process_value(value):
            # Pull the first argument out of the javascript call. Values that do
            # not match fall through and return None; the expectation below shows
            # that the plain /about.html link is then not reported.
            m = re.search(r"javascript:goToPage\('(.*?)'", value)
            if m:
                return m.group(1)

        lx = self.extractor_cls(process_value=process_value)
        self.assertEqual(lx.extract_links(response),
                         [Link(url='http://example.org/other/page.html', text='Link text')])

    def test_base_url_with_restrict_xpaths(self):
        """<base href> is still honoured when extraction is limited by restrict_xpaths"""
        html = """<html><head><title>Page title<title><base href="http://otherdomain.com/base/" />
<body><p><a href="item/12.html">Item 12</a></p>
</body></html>"""
        response = HtmlResponse("http://example.org/somepage/index.html", body=html)
        lx = self.extractor_cls(restrict_xpaths="//p")
        self.assertEqual(lx.extract_links(response),
                         [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])

    def test_attrs(self):
        """Test the ``attrs`` argument given as a string, a tuple and None"""
        lx = self.extractor_cls(attrs="href")
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        # deny_extensions=() so that the .jpg taken from <img src> is not filtered out
        lx = self.extractor_cls(attrs=("href", "src"), tags=("a", "area", "img"), deny_extensions=())
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample2.jpg', text=u''),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = self.extractor_cls(attrs=None)
        self.assertEqual(lx.extract_links(self.response), [])

        # the <a> carries "ref" rather than "href", so only the <area> is reported
        html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
        response = HtmlResponse("http://example.com/index.html", body=html)
        # ("href") is a parenthesised string, not a one-element tuple; built through
        # the class under test so subclasses exercise their own implementation
        lx = self.extractor_cls(attrs=("href"))
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample1.html', text=u''),
        ])

    def test_tags(self):
        """Test the ``tags`` argument given as None, default, a string and a tuple"""
        html = """<html><area href="sample1.html"></area><a href="sample2.html">sample 2</a><img src="sample2.jpg"/></html>"""
        response = HtmlResponse("http://example.com/index.html", body=html)

        lx = self.extractor_cls(tags=None)
        self.assertEqual(lx.extract_links(response), [])

        lx = self.extractor_cls()
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ])

        lx = self.extractor_cls(tags="area")
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample1.html', text=u''),
        ])

        lx = self.extractor_cls(tags="a")
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ])

        lx = self.extractor_cls(tags=("a", "img"), attrs=("href", "src"), deny_extensions=())
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample2.jpg', text=u''),
        ])

    def test_tags_attrs(self):
        """Links can be taken from a custom tag/attribute pair (div / data-url)"""
        html = """
<html><body>
<div id="item1" data-url="get?id=1"><a href="#">Item 1</a></div>
<div id="item2" data-url="get?id=2"><a href="#">Item 2</a></div>
</body></html>
"""
        response = HtmlResponse("http://example.com/index.html", body=html)

        expected = [
            Link(url='http://example.com/get?id=1', text=u'Item 1', fragment='', nofollow=False),
            Link(url='http://example.com/get?id=2', text=u'Item 2', fragment='', nofollow=False)
        ]

        # string and tuple forms must give the same result
        lx = self.extractor_cls(tags='div', attrs='data-url')
        self.assertEqual(lx.extract_links(response), expected)

        lx = self.extractor_cls(tags=('div',), attrs=('data-url',))
        self.assertEqual(lx.extract_links(response), expected)

    def test_xhtml(self):
        """The same XHTML body gives the same links as an HtmlResponse and as an XmlResponse"""
        xhtml = """
<?xml version="1.0"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>XHTML document title</title>
</head>
<body>
<div class='links'>
<p><a href="/about.html">About us</a></p>
</div>
<div>
<p><a href="/follow.html">Follow this link</a></p>
</div>
<div>
<p><a href="/nofollow.html" rel="nofollow">Dont follow this one</a></p>
</div>
<div>
<p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
</div>
</body>
</html>
"""
        expected = [
            Link(url='http://example.com/about.html', text=u'About us', fragment='', nofollow=False),
            Link(url='http://example.com/follow.html', text=u'Follow this link', fragment='', nofollow=False),
            Link(url='http://example.com/nofollow.html', text=u'Dont follow this one', fragment='', nofollow=True),
            Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False),
        ]

        response = HtmlResponse("http://example.com/index.xhtml", body=xhtml)
        lx = self.extractor_cls()
        self.assertEqual(lx.extract_links(response), expected)

        response = XmlResponse("http://example.com/index.xhtml", body=xhtml)
        lx = self.extractor_cls()
        self.assertEqual(lx.extract_links(response), expected)
|
2014-01-23 17:22:24 +01:00
|
|
|
|
|
|
|
|
|
|
|
class LxmlLinkExtractorTestCase(SgmlLinkExtractorTestCase):
    """Runs the inherited ``SgmlLinkExtractorTestCase`` tests with ``LxmlLinkExtractor``."""

    # Picked up by the inherited tests through self.extractor_cls.
    extractor_cls = LxmlLinkExtractor
|
|
|
|
|
2014-03-20 03:13:47 +02:00
|
|
|
|
2014-01-11 14:30:27 +06:00
|
|
|
class HtmlParserLinkExtractorTestCase(unittest.TestCase):
    """Runs ``HtmlParserLinkExtractor`` over the shared sgml_linkextractor.html fixture."""

    def setUp(self):
        """Build ``self.response`` from the fixture file."""
        fixture = get_testdata('link_extractor', 'sgml_linkextractor.html')
        self.response = HtmlResponse(url='http://example.com/index', body=fixture)

    def test_extraction(self):
        """Links reported by an extractor created with default arguments."""
        # Default arguments
        extractor = HtmlParserLinkExtractor()
        expected = [
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 repetition'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ]
        self.assertEqual(extractor.extract_links(self.response), expected)
|
2014-01-11 14:30:27 +06:00
|
|
|
|
|
|
|
|
2014-01-11 15:12:54 +06:00
|
|
|
class RegexLinkExtractorTestCase(unittest.TestCase):
    """Runs ``RegexLinkExtractor`` over the shared sgml_linkextractor.html fixture."""

    def setUp(self):
        """Build ``self.response`` from the fixture file."""
        fixture = get_testdata('link_extractor', 'sgml_linkextractor.html')
        self.response = HtmlResponse(url='http://example.com/index', body=fixture)

    def test_extraction(self):
        """Links reported by an extractor created with default arguments."""
        # Default arguments
        extractor = RegexLinkExtractor()
        expected = [
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ]
        self.assertEqual(extractor.extract_links(self.response), expected)
|
2014-01-11 15:12:54 +06:00
|
|
|
|
|
|
|
|
2008-06-28 23:37:28 +00:00
|
|
|
# Run the test cases in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()
|