"""
response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
lx = self.extractor_cls()
self.assertEqual(lx.extract_links(response), [
Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB', fragment='', nofollow=False),
])
def test_encoded_url_in_restricted_xpath(self):
    """Extract links under ``restrict_xpaths="//div"`` from a page whose URL contains ``%2F``.

    The expected link keeps the ``AC%2FDC`` path segment of the page URL
    unchanged.
    """
    body = """
BinB"""
    page_url = "http://known.fm/AC%2FDC/"
    response = HtmlResponse(page_url, body=body, encoding='utf8')
    extractor = self.extractor_cls(restrict_xpaths="//div")
    extracted = extractor.extract_links(response)
    expected_links = [
        Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB', fragment='', nofollow=False),
    ]
    self.assertEqual(extracted, expected_links)
def test_deny_extensions(self):
    """Check extraction with default arguments and with ``deny_extensions="jpg"``.

    In both configurations only the ``page.html`` link is expected.
    """
    html = """
asd and
"""
    response = HtmlResponse("http://example.org/", body=html)
    expected = [
        Link(url='http://example.org/page.html', text=u'asd'),
    ]

    lx = self.extractor_cls()
    self.assertEqual(lx.extract_links(response), expected)

    # Build the extractor through ``self.extractor_cls`` instead of naming
    # SgmlLinkExtractor directly, so that subclasses overriding
    # ``extractor_cls`` exercise their own extractor here as well.
    lx = self.extractor_cls(deny_extensions="jpg")
    self.assertEqual(lx.extract_links(response), expected)
def test_process_value(self):
    """Test that a ``process_value`` callable is used when extracting links.

    The callable pulls the target out of ``javascript:goToPage('...')``
    values and returns None for anything else; exactly one link is
    expected from the response.
    """
    html = """
Link text
About us
"""
    response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='windows-1252')

    def process_value(value):
        # Raw string: ``\(`` is a regex escape, not a Python string escape,
        # so a non-raw literal triggers an invalid-escape warning.
        m = re.search(r"javascript:goToPage\('(.*?)'", value)
        if m:
            return m.group(1)
        return None

    lx = self.extractor_cls(process_value=process_value)
    self.assertEqual(lx.extract_links(response),
                     [Link(url='http://example.org/other/page.html', text='Link text')])
def test_base_url_with_restrict_xpaths(self):
    """Check the absolute URL produced for a link selected with ``restrict_xpaths="//p"``.

    The expected URL lives under ``http://otherdomain.com/base/`` rather
    than under the response URL -- presumably because the markup declares
    a base URL (not visible here; confirm against the fixture).
    """
    html = """
Page title
Item 12
"""
    response = HtmlResponse("http://example.org/somepage/index.html", body=html)
    extractor = self.extractor_cls(restrict_xpaths="//p")
    extracted = extractor.extract_links(response)
    wanted = [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')]
    self.assertEqual(extracted, wanted)
def test_attrs(self):
    """Exercise the ``attrs`` argument, alone and combined with ``tags``.

    The first three checks run against ``self.response`` (prepared outside
    this method); the last one uses a small inline document.
    """
    lx = self.extractor_cls(attrs="href")
    self.assertEqual(lx.extract_links(self.response), [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ])

    # With "src"/"img" included and no denied extensions, sample2.jpg is
    # expected in addition to the links above.
    lx = self.extractor_cls(attrs=("href", "src"), tags=("a", "area", "img"), deny_extensions=())
    self.assertEqual(lx.extract_links(self.response), [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample2.jpg', text=u''),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ])

    # attrs=None: no links are expected at all.
    lx = self.extractor_cls(attrs=None)
    self.assertEqual(lx.extract_links(self.response), [])

    html = """
sample text 2"""
    response = HtmlResponse("http://example.com/index.html", body=html)
    # Go through ``self.extractor_cls`` (not a hard-coded SgmlLinkExtractor)
    # so subclasses overriding ``extractor_cls`` test their own extractor.
    # Note that ``("href")`` is a plain string, not a one-element tuple.
    lx = self.extractor_cls(attrs=("href"))
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.com/sample1.html', text=u''),
    ])
def test_tags(self):
    """Check which links are extracted for different ``tags`` settings.

    Each case pairs the extractor keyword arguments with the links
    expected from the same response; cases run in the listed order.
    """
    html = """
sample 2
"""
    response = HtmlResponse("http://example.com/index.html", body=html)

    cases = [
        # tags=None: nothing is expected
        (dict(tags=None), []),
        # default arguments
        (dict(), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ]),
        (dict(tags="area"), [
            Link(url='http://example.com/sample1.html', text=u''),
        ]),
        (dict(tags="a"), [
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ]),
        (dict(tags=("a", "img"), attrs=("href", "src"), deny_extensions=()), [
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample2.jpg', text=u''),
        ]),
    ]

    for extractor_kwargs, expected_links in cases:
        lx = self.extractor_cls(**extractor_kwargs)
        self.assertEqual(lx.extract_links(response), expected_links)
def test_tags_attrs(self):
    """Extract links from ``data-url`` attributes of ``div`` tags.

    ``tags``/``attrs`` are passed once as plain strings and once as
    one-element tuples; both forms are expected to yield the same links.
    """
    html = """
"""
    response = HtmlResponse("http://example.com/index.html", body=html)
    expected_links = [
        Link(url='http://example.com/get?id=1', text=u'Item 1', fragment='', nofollow=False),
        Link(url='http://example.com/get?id=2', text=u'Item 2', fragment='', nofollow=False)
    ]

    argument_forms = (
        ('div', 'data-url'),
        (('div',), ('data-url',)),
    )
    for tags_arg, attrs_arg in argument_forms:
        lx = self.extractor_cls(tags=tags_arg, attrs=attrs_arg)
        self.assertEqual(lx.extract_links(response), expected_links)
def test_xhtml(self):
    """Extract links from the same XHTML markup wrapped as HTML and as XML.

    The markup is fed through ``HtmlResponse`` first and ``XmlResponse``
    second; both are expected to produce identical links, including the
    ``nofollow`` flags.
    """
    xhtml = """
XHTML document title
"""
    expected_links = [
        Link(url='http://example.com/about.html', text=u'About us', fragment='', nofollow=False),
        Link(url='http://example.com/follow.html', text=u'Follow this link', fragment='', nofollow=False),
        Link(url='http://example.com/nofollow.html', text=u'Dont follow this one', fragment='', nofollow=True),
        Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False),
    ]

    for response_cls in (HtmlResponse, XmlResponse):
        response = response_cls("http://example.com/index.xhtml", body=xhtml)
        lx = self.extractor_cls()
        self.assertEqual(lx.extract_links(response), expected_links)
class LxmlLinkExtractorTestCase(SgmlLinkExtractorTestCase):
    """Re-run the tests inherited from SgmlLinkExtractorTestCase with LxmlLinkExtractor as ``extractor_cls``."""

    extractor_cls = LxmlLinkExtractor
class HtmlParserLinkExtractorTestCase(unittest.TestCase):
    """Tests for HtmlParserLinkExtractor using the ``sgml_linkextractor.html`` test data."""

    def setUp(self):
        """Build ``self.response`` from the test data file."""
        self.response = HtmlResponse(
            url='http://example.com/index',
            body=get_testdata('link_extractor', 'sgml_linkextractor.html'),
        )

    def test_extraction(self):
        """Extract links with an extractor constructed with default arguments."""
        # Default arguments
        extractor = HtmlParserLinkExtractor()
        extracted = extractor.extract_links(self.response)
        expected_links = [
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 repetition'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ]
        self.assertEqual(extracted, expected_links)
class RegexLinkExtractorTestCase(unittest.TestCase):
    """Tests for RegexLinkExtractor using the ``sgml_linkextractor.html`` test data."""

    def setUp(self):
        """Build ``self.response`` from the test data file."""
        self.response = HtmlResponse(
            url='http://example.com/index',
            body=get_testdata('link_extractor', 'sgml_linkextractor.html'),
        )

    def test_extraction(self):
        """Extract links with an extractor constructed with default arguments."""
        # Default arguments
        extractor = RegexLinkExtractor()
        extracted = extractor.extract_links(self.response)
        expected_links = [
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ]
        self.assertEqual(extracted, expected_links)
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()