2010-10-27 08:37:02 -02:00
|
|
|
import re
|
2013-10-14 10:35:02 -02:00
|
|
|
import warnings
|
2010-10-27 08:37:02 -02:00
|
|
|
import weakref
|
|
|
|
from twisted.trial import unittest
|
2013-10-14 10:35:02 -02:00
|
|
|
from scrapy.exceptions import ScrapyDeprecationWarning
|
2010-10-27 08:37:02 -02:00
|
|
|
from scrapy.http import TextResponse, HtmlResponse, XmlResponse
|
2013-10-14 10:35:02 -02:00
|
|
|
from scrapy.selector import Selector
|
|
|
|
from scrapy.selector.lxmlsel import XmlXPathSelector, HtmlXPathSelector, XPathSelector
|
2013-10-10 19:02:55 -02:00
|
|
|
|
2010-10-27 08:37:02 -02:00
|
|
|
|
2013-10-14 10:35:02 -02:00
|
|
|
class SelectorTestCase(unittest.TestCase):
    """Tests for the unified selector API.

    Covers XPath/CSS selection, repr output, regex extraction, XML
    namespaces, encoding handling and a few lxml-specific behaviours.
    Every test builds its selector through ``self.sscls``.
    """

    # Selector class under test; all tests instantiate it via self.sscls.
    sscls = Selector

    def test_simple_selection(self):
        """Simple selector tests"""
        body = "<p><input name='a'value='1'/><input name='b'value='2'/></p>"
        response = TextResponse(url="http://example.com", body=body)
        sel = self.sscls(response)

        inputs = sel.xpath('//input')
        self.assertEqual(2, len(inputs))
        for node in inputs:
            # Each item of a selection must itself be a selector.
            self.assertIsInstance(node, self.sscls)

        # Extracting the list equals extracting each element in turn.
        self.assertEqual(sel.xpath('//input').extract(),
                         [x.extract() for x in sel.xpath('//input')])

        self.assertEqual(
            [x.extract() for x in sel.xpath("//input[@name='a']/@name")],
            [u'a'])
        self.assertEqual(
            [x.extract() for x in sel.xpath("number(concat(//input[@name='a']/@value, //input[@name='b']/@value))")],
            [u'12.0'])

        self.assertEqual(sel.xpath("concat('xpath', 'rules')").extract(),
                         [u'xpathrules'])
        self.assertEqual(
            [x.extract() for x in sel.xpath("concat(//input[@name='a']/@value, //input[@name='b']/@value)")],
            [u'12'])

    def test_representation_slice(self):
        """repr() of a selector shows only the first 40 characters of data."""
        body = u"<p><input name='{}' value='\xa9'/></p>".format(50 * 'b')
        response = TextResponse(url="http://example.com", body=body, encoding='utf8')
        sel = self.sscls(response)

        # A list comprehension (not map()) so the comparison against a list
        # does not depend on map() returning a list.
        self.assertEqual(
            [repr(node) for node in sel.xpath('//input/@name')],
            ["<Selector xpath='//input/@name' data=u'{}'>".format(40 * 'b')]
        )

    def test_representation_unicode_query(self):
        """repr() copes with a unicode XPath query and unicode data."""
        body = u"<p><input name='{}' value='\xa9'/></p>".format(50 * 'b')
        response = TextResponse(url="http://example.com", body=body, encoding='utf8')
        sel = self.sscls(response)
        self.assertEqual(
            [repr(node) for node in sel.xpath(u'//input[@value="\xa9"]/@value')],
            ["<Selector xpath=u'//input[@value=\"\\xa9\"]/@value' data=u'\\xa9'>"]
        )

    def test_select_unicode_query(self):
        """A unicode XPath query matches a non-ASCII attribute value."""
        body = u"<p><input name='\xa9' value='1'/></p>"
        response = TextResponse(url="http://example.com", body=body, encoding='utf8')
        sel = self.sscls(response)
        self.assertEqual(sel.xpath(u'//input[@name="\xa9"]/@value').extract(), [u'1'])

    def test_list_elements_type(self):
        """Test Selector returning the same type in selection methods"""
        text = '<p>test<p>'
        self.assertIsInstance(self.sscls(text=text).xpath("//p")[0], self.sscls)
        self.assertIsInstance(self.sscls(text=text).css("p")[0], self.sscls)

    def test_boolean_result(self):
        """Boolean XPath expressions are extracted as u'1' / u'0'."""
        body = "<p><input name='a'value='1'/><input name='b'value='2'/></p>"
        response = TextResponse(url="http://example.com", body=body)
        xs = self.sscls(response)
        self.assertEqual(xs.xpath("//input[@name='a']/@name='a'").extract(), [u'1'])
        self.assertEqual(xs.xpath("//input[@name='a']/@name='n'").extract(), [u'0'])

    def test_differences_parsing_xml_vs_html(self):
        """Test that XML and HTML Selector's behave differently"""
        # some text which is parsed differently by XML and HTML flavors
        text = '<div><img src="a.jpg"><p>Hello</div>'

        hs = self.sscls(text=text, type='html')
        self.assertEqual(hs.xpath("//div").extract(),
                         [u'<div><img src="a.jpg"><p>Hello</p></div>'])

        xs = self.sscls(text=text, type='xml')
        self.assertEqual(xs.xpath("//div").extract(),
                         [u'<div><img src="a.jpg"><p>Hello</p></img></div>'])

    def test_flavor_detection(self):
        """The selector type is derived from the response class."""
        text = '<div><img src="a.jpg"><p>Hello</div>'

        sel = self.sscls(XmlResponse('http://example.com', body=text))
        self.assertEqual(sel.type, 'xml')
        self.assertEqual(sel.xpath("//div").extract(),
                         [u'<div><img src="a.jpg"><p>Hello</p></img></div>'])

        sel = self.sscls(HtmlResponse('http://example.com', body=text))
        self.assertEqual(sel.type, 'html')
        self.assertEqual(sel.xpath("//div").extract(),
                         [u'<div><img src="a.jpg"><p>Hello</p></div>'])

    def test_nested_selectors(self):
        """Nested selector tests"""
        body = """<body>
<div class='one'>
<ul>
<li>one</li><li>two</li>
</ul>
</div>
<div class='two'>
<ul>
<li>four</li><li>five</li><li>six</li>
</ul>
</div>
</body>"""

        response = HtmlResponse(url="http://example.com", body=body)
        x = self.sscls(response)
        divtwo = x.xpath('//div[@class="two"]')

        # An absolute path ignores the context node and searches the document.
        self.assertEqual(divtwo.xpath("//li").extract(),
                         ["<li>one</li>", "<li>two</li>", "<li>four</li>", "<li>five</li>", "<li>six</li>"])
        # Relative paths are evaluated against the selected <div>.
        self.assertEqual(divtwo.xpath("./ul/li").extract(),
                         ["<li>four</li>", "<li>five</li>", "<li>six</li>"])
        self.assertEqual(divtwo.xpath(".//li").extract(),
                         ["<li>four</li>", "<li>five</li>", "<li>six</li>"])
        self.assertEqual(divtwo.xpath("./li").extract(), [])

    def test_mixed_nested_selectors(self):
        """XPath and CSS selections can be chained in either order."""
        body = '''<body>
<div id=1>not<span>me</span></div>
<div class="dos"><p>text</p><a href='#'>foo</a></div>
</body>'''
        sel = self.sscls(text=body)
        self.assertEqual(sel.xpath('//div[@id="1"]').css('span::text').extract(), [u'me'])
        self.assertEqual(sel.css('#1').xpath('./span/text()').extract(), [u'me'])

    def test_dont_strip(self):
        """Extracted text nodes keep their trailing whitespace."""
        sel = self.sscls(text='<div>fff: <a href="#">zzz</a></div>')
        self.assertEqual(sel.xpath("//text()").extract(), [u'fff: ', u'zzz'])

    def test_namespaces_simple(self):
        """A registered namespace prefix can be used in XPath queries."""
        body = """
<test xmlns:somens="http://scrapy.org">
<somens:a id="foo">take this</a>
<a id="bar">found</a>
</test>
"""

        response = XmlResponse(url="http://example.com", body=body)
        x = self.sscls(response)

        x.register_namespace("somens", "http://scrapy.org")
        self.assertEqual(x.xpath("//somens:a/text()").extract(),
                         [u'take this'])

    def test_namespaces_multiple(self):
        """Several registered prefixes work together, including on attributes."""
        body = """<?xml version="1.0" encoding="UTF-8"?>
<BrowseNode xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05"
xmlns:b="http://somens.com"
xmlns:p="http://www.scrapy.org/product" >
<b:Operation>hello</b:Operation>
<TestTag b:att="value"><Other>value</Other></TestTag>
<p:SecondTestTag><material>iron</material><price>90</price><p:name>Dried Rose</p:name></p:SecondTestTag>
</BrowseNode>
"""
        response = XmlResponse(url="http://example.com", body=body)
        x = self.sscls(response)
        # The document's default namespace is registered under an explicit
        # prefix so it can be referenced from XPath.
        x.register_namespace("xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05")
        x.register_namespace("p", "http://www.scrapy.org/product")
        x.register_namespace("b", "http://somens.com")
        self.assertEqual(len(x.xpath("//xmlns:TestTag")), 1)
        self.assertEqual(x.xpath("//b:Operation/text()").extract()[0], 'hello')
        self.assertEqual(x.xpath("//xmlns:TestTag/@b:att").extract()[0], 'value')
        self.assertEqual(x.xpath("//p:SecondTestTag/xmlns:price/text()").extract()[0], '90')
        self.assertEqual(x.xpath("//p:SecondTestTag").xpath("./xmlns:price/text()")[0].extract(), '90')
        self.assertEqual(x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], 'iron')

    def test_re(self):
        """.re() accepts both compiled patterns and pattern strings."""
        body = """<div>Name: Mary
<ul>
<li>Name: John</li>
<li>Age: 10</li>
<li>Name: Paul</li>
<li>Age: 20</li>
</ul>
Age: 20
</div>"""
        response = HtmlResponse(url="http://example.com", body=body)
        x = self.sscls(response)

        # Raw strings keep the regex escapes literal.
        name_re = re.compile(r"Name: (\w+)")
        self.assertEqual(x.xpath("//ul/li").re(name_re),
                         ["John", "Paul"])
        self.assertEqual(x.xpath("//ul/li").re(r"Age: (\d+)"),
                         ["10", "20"])

    def test_re_intl(self):
        """.re() matches non-ASCII word characters in a UTF-8 body."""
        body = """<div>Evento: cumplea\xc3\xb1os</div>"""
        response = HtmlResponse(url="http://example.com", body=body, encoding='utf-8')
        x = self.sscls(response)
        self.assertEqual(x.xpath("//div").re(r"Evento: (\w+)"), [u'cumplea\xf1os'])

    def test_selector_over_text(self):
        """Selectors can be built from text, in html (default) or xml mode."""
        hs = self.sscls(text='<root>lala</root>')
        self.assertEqual(hs.extract(), u'<html><body><root>lala</root></body></html>')

        xs = self.sscls(text='<root>lala</root>', type='xml')
        self.assertEqual(xs.extract(), u'<root>lala</root>')
        self.assertEqual(xs.xpath('.').extract(), [u'<root>lala</root>'])

    def test_invalid_xpath(self):
        """An invalid XPath raises ValueError mentioning the expression."""
        response = XmlResponse(url="http://example.com", body="<html></html>")
        x = self.sscls(response)
        xpath = "//test[@foo='bar]"
        try:
            x.xpath(xpath)
        except ValueError as e:
            self.assertIn(xpath, str(e),
                          "Exception message does not contain invalid xpath")
        except Exception:
            self.fail("A invalid XPath does not raise ValueError")
        else:
            self.fail("A invalid XPath does not raise an exception")

    def test_http_header_encoding_precedence(self):
        """The HTTP header charset wins over the <meta> charset."""
        # u'\xa3' = pound symbol in unicode
        # u'\xc2\xa3' = pound symbol in utf-8
        # u'\xa3' = pound symbol in latin-1 (iso-8859-1)

        meta = u'<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">'
        head = u'<head>' + meta + u'</head>'
        body_content = u'<span id="blank">\xa3</span>'
        body = u'<body>' + body_content + u'</body>'
        html = u'<html>' + head + body + u'</html>'
        encoding = 'utf-8'
        html_utf8 = html.encode(encoding)

        # The header says utf-8 (the real encoding); the meta tag says latin-1.
        headers = {'Content-Type': ['text/html; charset=utf-8']}
        response = HtmlResponse(url="http://example.com", headers=headers, body=html_utf8)
        x = self.sscls(response)
        self.assertEqual(x.xpath("//span[@id='blank']/text()").extract(),
                         [u'\xa3'])

    def test_empty_bodies(self):
        """Selecting on an empty body must not raise."""
        # shouldn't raise errors
        r1 = TextResponse('http://www.example.com', body='')
        self.sscls(r1).xpath('//text()').extract()

    def test_null_bytes(self):
        """Selecting on a body containing a NUL byte must not raise."""
        # shouldn't raise errors
        r1 = TextResponse('http://www.example.com',
                          body='<root>pre\x00post</root>',
                          encoding='utf-8')
        self.sscls(r1).xpath('//text()').extract()

    def test_badly_encoded_body(self):
        """Selecting on a body with invalid UTF-8 must not raise."""
        # \xe9 alone isn't valid utf8 sequence
        r1 = TextResponse('http://www.example.com',
                          body='<html><p>an Jos\xe9 de</p><html>',
                          encoding='utf-8')
        self.sscls(r1).xpath('//text()').extract()

    def test_select_on_unevaluable_nodes(self):
        """Sub-selecting on text or attribute results yields an empty list."""
        r = self.sscls(text=u'<span class="big">some text</span>')
        # Text node
        x1 = r.xpath('//text()')
        self.assertEqual(x1.extract(), [u'some text'])
        self.assertEqual(x1.xpath('.//b').extract(), [])
        # Tag attribute
        x1 = r.xpath('//span/@class')
        self.assertEqual(x1.extract(), [u'big'])
        self.assertEqual(x1.xpath('.//text()').extract(), [])

    def test_select_on_text_nodes(self):
        """Single-query XPath can navigate from text nodes to siblings."""
        r = self.sscls(text=u'<div><b>Options:</b>opt1</div><div><b>Other</b>opt2</div>')
        x1 = r.xpath("//div/descendant::text()[preceding-sibling::b[contains(text(), 'Options')]]")
        self.assertEqual(x1.extract(), [u'opt1'])

        x1 = r.xpath("//div/descendant::text()/preceding-sibling::b[contains(text(), 'Options')]")
        self.assertEqual(x1.extract(), [u'<b>Options:</b>'])

    def test_nested_select_on_text_nodes(self):
        """Same navigation as above, split across two chained selections."""
        # FIXME: does not work with lxml backend [upstream]
        r = self.sscls(text=u'<div><b>Options:</b>opt1</div><div><b>Other</b>opt2</div>')
        x1 = r.xpath("//div/descendant::text()")
        x2 = x1.xpath("./preceding-sibling::b[contains(text(), 'Options')]")
        self.assertEqual(x2.extract(), [u'<b>Options:</b>'])
    # trial skips a test method that carries a ``skip`` attribute.
    test_nested_select_on_text_nodes.skip = "Text nodes lost parent node reference in lxml"

    def test_weakref_slots(self):
        """Check that classes are using slots and are weak-referenceable"""
        x = self.sscls()
        weakref.ref(x)  # raises TypeError if the class is not weak-referenceable
        self.assertFalse(hasattr(x, '__dict__'),
                         "%s does not use __slots__" % x.__class__.__name__)

    def test_remove_namespaces(self):
        """remove_namespaces() makes namespaced elements selectable by bare name."""
        xml = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en-US" xmlns:media="http://search.yahoo.com/mrss/">
<link type="text/html">
<link type="application/atom+xml">
</feed>
"""
        sel = self.sscls(XmlResponse("http://example.com/feed.atom", body=xml))
        self.assertEqual(len(sel.xpath("//link")), 0)
        sel.remove_namespaces()
        self.assertEqual(len(sel.xpath("//link")), 2)

    def test_remove_attributes_namespaces(self):
        """remove_namespaces() also strips namespaces from attributes."""
        xml = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns:atom="http://www.w3.org/2005/Atom" xml:lang="en-US" xmlns:media="http://search.yahoo.com/mrss/">
<link atom:type="text/html">
<link atom:type="application/atom+xml">
</feed>
"""
        sel = self.sscls(XmlResponse("http://example.com/feed.atom", body=xml))
        self.assertEqual(len(sel.xpath("//link/@type")), 0)
        sel.remove_namespaces()
        self.assertEqual(len(sel.xpath("//link/@type")), 2)

    def test_smart_strings(self):
        """Lxml smart strings return values"""

        class SmartStringsSelector(Selector):
            _lxml_smart_strings = True

        body = """<body>
<div class='one'>
<ul>
<li>one</li><li>two</li>
</ul>
</div>
<div class='two'>
<ul>
<li>four</li><li>five</li><li>six</li>
</ul>
</div>
</body>"""

        response = HtmlResponse(url="http://example.com", body=body)

        # .getparent() is available for text nodes and attributes
        # only when smart_strings are on
        x = self.sscls(response)
        li_text = x.xpath('//li/text()')
        self.assertFalse(any(hasattr(e._root, 'getparent') for e in li_text))
        div_class = x.xpath('//div/@class')
        self.assertFalse(any(hasattr(e._root, 'getparent') for e in div_class))

        x = SmartStringsSelector(response)
        li_text = x.xpath('//li/text()')
        self.assertTrue(all(hasattr(e._root, 'getparent') for e in li_text))
        div_class = x.xpath('//div/@class')
        self.assertTrue(all(hasattr(e._root, 'getparent') for e in div_class))

    def test_xml_entity_expansion(self):
        """An external entity reference must be left unexpanded."""
        malicious_xml = (
            '<?xml version="1.0" encoding="ISO-8859-1"?>'
            '<!DOCTYPE foo [ <!ELEMENT foo ANY > <!ENTITY xxe SYSTEM '
            '"file:///etc/passwd" >]><foo>&xxe;</foo>'
        )

        response = XmlResponse('http://example.com', body=malicious_xml)
        sel = self.sscls(response=response)

        # The entity reference survives verbatim; the file is not inlined.
        self.assertEqual(sel.extract(), '<foo>&xxe;</foo>')
|
|
|
|
|
2013-10-14 10:35:02 -02:00
|
|
|
|
|
|
|
class DeprecatedXpathSelectorTest(unittest.TestCase):
    """Tests for the deprecated ``XPathSelector`` family of classes.

    Checks the warnings issued when these classes are subclassed or
    instantiated, their relationship to ``Selector``, and that the old
    ``select()`` API still works while ``css()`` raises ``RuntimeError``.
    """

    # Markup shared by every test in this case.
    text = '<div><img src="a.jpg"><p>Hello</div>'

    def _assert_deprecation_warnings(self, cls, bases):
        """Subclass and instantiate *cls*, asserting on the recorded warnings.

        *cls* is the deprecated class under test. *bases* is an iterable of
        classes that *cls*, an instance of *cls*, and an instance of a
        user-defined subclass of *cls* must all be recognised as.
        """
        with warnings.catch_warnings(record=True) as w:
            class UserClass(cls):
                pass

            # subclassing must issue a warning
            self.assertEqual(len(w), 1, str(cls))
            self.assertIn('scrapy.Selector', str(w[0].message))

            # subclass instance doesn't issue a warning
            usel = UserClass(text=self.text)
            self.assertEqual(len(w), 1)

            # class instance must issue a warning
            sel = cls(text=self.text)
            self.assertEqual(len(w), 2, str((cls, [x.message for x in w])))
            self.assertIn('scrapy.Selector', str(w[1].message))

            # subclass and instance checks
            for base in bases:
                self.assertTrue(issubclass(cls, base))
                self.assertIsInstance(sel, base)
                self.assertIsInstance(usel, base)

    def test_warnings_xpathselector(self):
        """XPathSelector warns on subclassing/instantiation and is a Selector."""
        self._assert_deprecation_warnings(XPathSelector, (Selector,))

    def test_warnings_xmlxpathselector(self):
        """XmlXPathSelector warns and is both a Selector and an XPathSelector."""
        self._assert_deprecation_warnings(XmlXPathSelector,
                                          (Selector, XPathSelector))

    def test_warnings_htmlxpathselector(self):
        """HtmlXPathSelector warns and is both a Selector and an XPathSelector."""
        self._assert_deprecation_warnings(HtmlXPathSelector,
                                          (Selector, XPathSelector))

    def test_xpathselector(self):
        """XPathSelector parses as HTML; select() works, css() raises."""
        # Warnings are recorded (and discarded) so they do not leak out.
        with warnings.catch_warnings(record=True):
            hs = XPathSelector(text=self.text)
            self.assertEqual(hs.select("//div").extract(),
                             [u'<div><img src="a.jpg"><p>Hello</p></div>'])
            self.assertRaises(RuntimeError, hs.css, 'div')

    def test_htmlxpathselector(self):
        """HtmlXPathSelector parses as HTML; select() works, css() raises."""
        with warnings.catch_warnings(record=True):
            hs = HtmlXPathSelector(text=self.text)
            self.assertEqual(hs.select("//div").extract(),
                             [u'<div><img src="a.jpg"><p>Hello</p></div>'])
            self.assertRaises(RuntimeError, hs.css, 'div')

    def test_xmlxpathselector(self):
        """XmlXPathSelector parses as XML; select() works, css() raises."""
        with warnings.catch_warnings(record=True):
            xs = XmlXPathSelector(text=self.text)
            self.assertEqual(xs.select("//div").extract(),
                             [u'<div><img src="a.jpg"><p>Hello</p></img></div>'])
            self.assertRaises(RuntimeError, xs.css, 'div')
|
2013-11-24 04:41:14 +01:00
|
|
|
|
|
|
|
|
|
|
|
class ExsltTestCase(unittest.TestCase):
    """Tests for the EXSLT ``re:`` and ``set:`` XPath extension functions."""

    # Selector class under test; tests instantiate it via self.sscls.
    sscls = Selector

    def test_regexp(self):
        """EXSLT regular expression tests"""
        body = """
<p><input name='a' value='1'/><input name='b' value='2'/></p>
<div class="links">
<a href="/first.html">first link</a>
<a href="/second.html">second link</a>
<a href="http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml">EXSLT match example</a>
</div>
"""
        response = TextResponse(url="http://example.com", body=body)
        sel = self.sscls(response)

        # re:test()
        # XPath strings containing regex escapes are raw strings so the
        # backslashes reach the XPath engine unchanged.
        self.assertEqual(
            sel.xpath(
                '//input[re:test(@name, "[A-Z]+", "i")]').extract(),
            [x.extract() for x in sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]')])
        self.assertEqual(
            [x.extract()
             for x in sel.xpath(
                 r'//a[re:test(@href, "\.html$")]/text()')],
            [u'first link', u'second link'])
        self.assertEqual(
            [x.extract()
             for x in sel.xpath(
                 '//a[re:test(@href, "first")]/text()')],
            [u'first link'])
        self.assertEqual(
            [x.extract()
             for x in sel.xpath(
                 '//a[re:test(@href, "second")]/text()')],
            [u'second link'])

        # re:match() is rather special: it returns a node-set of <match> nodes
        # [u'<match>http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml</match>',
        #  u'<match>http</match>',
        #  u'<match>www.bayes.co.uk</match>',
        #  u'<match></match>',
        #  u'<match>/xml/index.xml?/xml/utils/rechecker.xml</match>']
        self.assertEqual(
            sel.xpath(r're:match(//a[re:test(@href, "\.xml$")]/@href,'
                      r'"(\w+):\/\/([^/:]+)(:\d*)?([^# ]*)")/text()').extract(),
            [u'http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml',
             u'http',
             u'www.bayes.co.uk',
             u'',
             u'/xml/index.xml?/xml/utils/rechecker.xml'])

        # re:replace()
        self.assertEqual(
            sel.xpath(r're:replace(//a[re:test(@href, "\.xml$")]/@href,'
                      r'"(\w+)://(.+)(\.xml)", "","https://\2.html")').extract(),
            [u'https://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.html'])

    def test_set(self):
        """EXSLT set manipulation tests"""
        # microdata example from http://schema.org/Event
        body = """
<div itemscope itemtype="http://schema.org/Event">
<a itemprop="url" href="nba-miami-philidelphia-game3.html">
NBA Eastern Conference First Round Playoff Tickets:
<span itemprop="name"> Miami Heat at Philadelphia 76ers - Game 3 (Home Game 1) </span>
</a>

<meta itemprop="startDate" content="2016-04-21T20:00">
Thu, 04/21/16
8:00 p.m.

<div itemprop="location" itemscope itemtype="http://schema.org/Place">
<a itemprop="url" href="wells-fargo-center.html">
Wells Fargo Center
</a>
<div itemprop="address" itemscope itemtype="http://schema.org/PostalAddress">
<span itemprop="addressLocality">Philadelphia</span>,
<span itemprop="addressRegion">PA</span>
</div>
</div>

<div itemprop="offers" itemscope itemtype="http://schema.org/AggregateOffer">
Priced from: <span itemprop="lowPrice">$35</span>
<span itemprop="offerCount">1938</span> tickets left
</div>
</div>
"""
        response = TextResponse(url="http://example.com", body=body)
        sel = self.sscls(response)

        # Every itemprop under the Event, in document order.
        self.assertEqual(
            sel.xpath('''//div[@itemtype="http://schema.org/Event"]
//@itemprop''').extract(),
            [u'url',
             u'name',
             u'startDate',
             u'location',
             u'url',
             u'address',
             u'addressLocality',
             u'addressRegion',
             u'offers',
             u'lowPrice',
             u'offerCount']
        )

        # Subtract the itemprops that sit inside nested itemscope elements.
        self.assertEqual(sel.xpath('''
set:difference(//div[@itemtype="http://schema.org/Event"]
//@itemprop,
//div[@itemtype="http://schema.org/Event"]
//*[@itemscope]/*/@itemprop)''').extract(),
                         [u'url', u'name', u'startDate', u'location', u'offers'])
|