import re
import warnings
import weakref
import six
from twisted.trial import unittest
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import TextResponse, HtmlResponse, XmlResponse
from scrapy.selector import Selector
from scrapy.selector.lxmlsel import XmlXPathSelector, HtmlXPathSelector, XPathSelector
# Test case collecting checks of the selector API (xpath/css/re/extract)
# against small inline fixtures.
class SelectorTestCase(unittest.TestCase):
# Selector class under test; the test methods build selectors via self.sscls.
sscls = Selector
def test_simple_selection(self):
"""Simple selector tests"""
# NOTE(review): the fixture literal below is split across two physical lines
# inside a plain double-quoted string and holds none of the <input> elements
# the assertions query for — the fixture text looks damaged; confirm against
# version control.
body = "
"
response = TextResponse(url="http://example.com", body=body)
sel = self.sscls(response)
# Two matches are expected, each an instance of the class under test.
xl = sel.xpath('//input')
self.assertEqual(2, len(xl))
for x in xl:
assert isinstance(x, self.sscls)
# extract() on the result list must equal extracting element by element.
self.assertEqual(sel.xpath('//input').extract(),
[x.extract() for x in sel.xpath('//input')])
self.assertEqual([x.extract() for x in sel.xpath("//input[@name='a']/@name")],
[u'a'])
# XPath function results (number, string) are extracted as unicode strings.
self.assertEqual([x.extract() for x in sel.xpath("number(concat(//input[@name='a']/@value, //input[@name='b']/@value))")],
[u'12.0'])
self.assertEqual(sel.xpath("concat('xpath', 'rules')").extract(),
[u'xpathrules'])
self.assertEqual([x.extract() for x in sel.xpath("concat(//input[@name='a']/@value, //input[@name='b']/@value)")],
[u'12'])
def test_representation_slice(self):
# Compares repr() of the selectors matched by //input/@name with an expected
# string built from 40 'b' characters, while the fixture is built from 50.
# NOTE(review): both the fixture literal and the expected repr literal look
# damaged (the fixture is split across two lines, the expected template is
# empty) — confirm against version control.
body = u"
".format(50 * 'b')
response = TextResponse(url="http://example.com", body=body, encoding='utf8')
sel = self.sscls(response)
# NOTE(review): on Python 3 map() returns an iterator, which never compares
# equal to a list; this probably needs list(map(...)) — confirm the targeted
# Python versions.
self.assertEqual(
map(repr, sel.xpath('//input/@name')),
["".format(40 * 'b')]
)
def test_representation_unicode_query(self):
# Compares repr() of the selectors matched by a unicode XPath query (the
# query contains the non-ASCII character u'\xa9') with an expected string.
# NOTE(review): the fixture literal is split across two lines and the
# expected repr literal is empty — both look damaged; confirm against
# version control.
body = u"
".format(50 * 'b')
response = TextResponse(url="http://example.com", body=body, encoding='utf8')
sel = self.sscls(response)
# NOTE(review): on Python 3 map() returns an iterator, which never compares
# equal to a list; this probably needs list(map(...)) — confirm the targeted
# Python versions.
self.assertEqual(
map(repr, sel.xpath(u'//input[@value="\xa9"]/@value')),
[""]
)
def test_extract_first(self):
    """extract_first() must agree with indexing into extract(), or give None.

    Each positive case pairs a query with a reference query and the index
    of the element in the reference extract() list it should equal.
    """
    # NOTE(review): the fixture body is an empty string although the queries
    # expect <ul>/<li> markup — the literal may have lost its content; verify.
    body = ''
    response = TextResponse(url="http://example.com", body=body)
    sel = self.sscls(response)

    cases = (
        ('//ul/li/text()', '//ul/li/text()', 0),
        ('//ul/li[@id="1"]/text()', '//ul/li[@id="1"]/text()', 0),
        ('//ul/li[2]/text()', '//ul/li/text()', 1),
    )
    for query, reference_query, index in cases:
        first = sel.xpath(query).extract_first()
        reference = sel.xpath(reference_query).extract()
        self.assertEqual(first, reference[index])

    # A query that matches nothing yields None rather than raising.
    missing = sel.xpath('/ul/li[@id="doesnt-exist"]/text()').extract_first()
    self.assertEqual(missing, None)
def test_re_first(self):
    """re_first() must return the first regex match, or None without one.

    Each positive case pairs a query with a reference query and the index
    of the element in the reference re() list it should equal. The regex
    patterns are raw strings so the backslash classes are not treated as
    (invalid) string escape sequences; their runtime values are unchanged.
    """
    # NOTE(review): the fixture body is an empty string although the queries
    # expect <ul>/<li> markup — the literal may have lost its content; verify.
    body = ''
    response = TextResponse(url="http://example.com", body=body)
    sel = self.sscls(response)

    digit = r'\d'
    cases = (
        ('//ul/li/text()', '//ul/li/text()', 0),
        ('//ul/li[@id="1"]/text()', '//ul/li[@id="1"]/text()', 0),
        ('//ul/li[2]/text()', '//ul/li/text()', 1),
    )
    for query, reference_query, index in cases:
        self.assertEqual(sel.xpath(query).re_first(digit),
                         sel.xpath(reference_query).re(digit)[index])

    # Queries that select nothing yield None rather than raising.
    self.assertEqual(sel.xpath('/ul/li/text()').re_first(r'\w+'), None)
    self.assertEqual(
        sel.xpath('/ul/li[@id="doesnt-exist"]/text()').re_first(digit), None)
def test_select_unicode_query(self):
# Runs an XPath query given as a unicode string containing the non-ASCII
# character u'\xa9' and expects the matched @value to be u'1'.
# NOTE(review): the fixture literal below is split across two physical lines
# and contains no <input> element — it looks damaged; confirm against
# version control.
body = u"
"
response = TextResponse(url="http://example.com", body=body, encoding='utf8')
sel = self.sscls(response)
self.assertEqual(sel.xpath(u'//input[@name="\xa9"]/@value').extract(), [u'1'])
def test_list_elements_type(self):
"""Test Selector returning the same type in selection methods"""
# NOTE(review): the fixture literal below is split across two physical lines
# inside a single-quoted string and has no <p> markup left — it looks
# damaged; confirm against version control.
text = 'test
'
# Elements of both xpath() and css() results must be self.sscls instances.
assert isinstance(self.sscls(text=text).xpath("//p")[0], self.sscls)
assert isinstance(self.sscls(text=text).css("p")[0], self.sscls)
def test_boolean_result(self):
# XPath boolean expressions are expected to extract as u'1' (true) and
# u'0' (false).
# NOTE(review): the fixture literal below is split across two physical lines
# and contains no <input> element — it looks damaged; confirm against
# version control.
body = "
"
response = TextResponse(url="http://example.com", body=body)
xs = self.sscls(response)
# NOTE(review): assertEquals is a deprecated alias of assertEqual, which the
# rest of this file uses; consider switching once the fixture is restored.
self.assertEquals(xs.xpath("//input[@name='a']/@name='a'").extract(), [u'1'])
self.assertEquals(xs.xpath("//input[@name='a']/@name='n'").extract(), [u'0'])
def test_differences_parsing_xml_vs_html(self):
"""Test that XML and HTML Selector's behave differently"""
# NOTE(review): the fixture and both expected literals in this test are split
# across physical lines inside single-quoted strings, and as shown the two
# expected values are identical — the markup looks damaged; confirm against
# version control.
# some text which is parsed differently by XML and HTML flavors
text = 'Hello
'
# Same text parsed with the 'html' flavor...
hs = self.sscls(text=text, type='html')
self.assertEqual(hs.xpath("//div").extract(),
[u'Hello
'])
# ...and with the 'xml' flavor.
xs = self.sscls(text=text, type='xml')
self.assertEqual(xs.xpath("//div").extract(),
[u'Hello
'])
def test_flavor_detection(self):
# The selector type is expected to follow the response class: 'xml' for an
# XmlResponse and 'html' for an HtmlResponse.
# NOTE(review): the fixture and expected literals in this test are split
# across physical lines inside single-quoted strings — the markup looks
# damaged; confirm against version control.
text = 'Hello
'
sel = self.sscls(XmlResponse('http://example.com', body=text))
self.assertEqual(sel.type, 'xml')
self.assertEqual(sel.xpath("//div").extract(),
[u'Hello
'])
# Same body wrapped in an HtmlResponse.
sel = self.sscls(HtmlResponse('http://example.com', body=text))
self.assertEqual(sel.type, 'html')
self.assertEqual(sel.xpath("//div").extract(),
[u'Hello
'])
def test_nested_selectors(self):
    """Check absolute versus relative XPath queries on a nested selection.

    Starting from the div with class "two", an absolute ``//li`` query is
    expected to return five items, ``./ul/li`` and ``.//li`` three items,
    and ``./li`` none.
    """
    # NOTE(review): the fixture body holds only a newline and the expected
    # values carry no tags — the markup may have been lost; verify.
    body = """
"""
    response = HtmlResponse(url="http://example.com", body=body)
    root = self.sscls(response)
    second_div = root.xpath('//div[@class="two"]')

    inner_items = ["four ", "five ", "six "]
    every_item = ["one ", "two "] + inner_items

    absolute = second_div.xpath("//li").extract()
    self.assertEqual(absolute, every_item)

    for relative_query in ("./ul/li", ".//li"):
        self.assertEqual(second_div.xpath(relative_query).extract(),
                         inner_items)

    direct_children = second_div.xpath("./li").extract()
    self.assertEqual(direct_children, [])
def test_mixed_nested_selectors(self):
    """Chain xpath() into css() and css() into xpath() on one document.

    Both chains are expected to extract the single text value u'me'.
    """
    body = '''
notme
'''
    sel = self.sscls(text=body)

    xpath_then_css = sel.xpath('//div[@id="1"]').css('span::text')
    self.assertEqual(xpath_then_css.extract(), [u'me'])

    css_then_xpath = sel.css('#1').xpath('./span/text()')
    self.assertEqual(css_then_xpath.extract(), [u'me'])
def test_dont_strip(self):
    """Text nodes are expected back unstripped (note the trailing space)."""
    # NOTE(review): the fixture text is an empty string although two text
    # nodes are expected — the literal may have lost its content; verify.
    sel = self.sscls(text='')
    text_nodes = sel.xpath("//text()").extract()
    self.assertEqual(text_nodes, [u'fff: ', u'zzz'])
def test_namespaces_simple(self):
    """Query an element through a namespace prefix registered on the selector."""
    body = """
take this
found
"""
    response = XmlResponse(url="http://example.com", body=body)
    selector = self.sscls(response)
    # Bind the "somens" prefix so it can be used inside the XPath below.
    selector.register_namespace("somens", "http://scrapy.org")

    matched_text = selector.xpath("//somens:a/text()").extract()
    self.assertEqual(matched_text, [u'take this'])
def test_namespaces_multiple(self):
    """Register three namespace prefixes and query elements and attributes.

    Covers prefixed element names, a prefixed attribute, and a nested
    xpath() call that reuses a registered prefix.
    """
    body = """
hello
value
iron 90 Dried Rose
"""
    response = XmlResponse(url="http://example.com", body=body)
    x = self.sscls(response)

    prefixes = (
        ("xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05"),
        ("p", "http://www.scrapy.org/product"),
        ("b", "http://somens.com"),
    )
    for prefix, uri in prefixes:
        x.register_namespace(prefix, uri)

    self.assertEqual(len(x.xpath("//xmlns:TestTag")), 1)

    first_match_cases = (
        ("//b:Operation/text()", 'hello'),
        ("//xmlns:TestTag/@b:att", 'value'),
        ("//p:SecondTestTag/xmlns:price/text()", '90'),
    )
    for query, expected in first_match_cases:
        self.assertEqual(x.xpath(query).extract()[0], expected)

    # The prefix must also resolve in a query run on a nested selection.
    nested_price = x.xpath("//p:SecondTestTag").xpath("./xmlns:price/text()")
    self.assertEqual(nested_price[0].extract(), '90')

    material = x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()
    self.assertEqual(material[0], 'iron')
def test_re(self):
    """Apply a compiled pattern and a string pattern through .re().

    The patterns are written as raw strings so the backslash classes are
    not treated as (invalid) string escape sequences; their runtime values
    are unchanged.
    """
    body = """Name: Mary
Name: John
Age: 10
Name: Paul
Age: 20
Age: 20
"""
    response = HtmlResponse(url="http://example.com", body=body)
    x = self.sscls(response)

    # .re() accepts a pre-compiled pattern object...
    name_re = re.compile(r"Name: (\w+)")
    self.assertEqual(x.xpath("//ul/li").re(name_re),
                     ["John", "Paul"])
    # ...as well as a plain pattern string.
    self.assertEqual(x.xpath("//ul/li").re(r"Age: (\d+)"),
                     ["10", "20"])
def test_re_intl(self):
    """Match a word containing a non-ASCII character through .re().

    The body holds the UTF-8 byte sequence for n-with-tilde and the response
    is declared as utf-8; the expected match is the decoded unicode text.
    The pattern is a raw string so its backslash class is not treated as an
    (invalid) string escape; the body literal keeps its byte escapes.
    """
    body = """Evento: cumplea\xc3\xb1os
"""
    response = HtmlResponse(url="http://example.com", body=body, encoding='utf-8')
    x = self.sscls(response)
    self.assertEqual(x.xpath("//div").re(r"Evento: (\w+)"), [u'cumplea\xf1os'])
def test_selector_over_text(self):
    """Build selectors straight from text, in default and 'xml' flavors."""
    source_text = 'lala '

    default_sel = self.sscls(text=source_text)
    self.assertEqual(default_sel.extract(), u'lala ')

    xml_sel = self.sscls(text=source_text, type='xml')
    self.assertEqual(xml_sel.extract(), u'lala ')
    # Selecting the context node itself returns a one-element list.
    context_node = xml_sel.xpath('.').extract()
    self.assertEqual(context_node, [u'lala '])
def test_invalid_xpath(self):
    """Test invalid xpath raises ValueError with the invalid xpath.

    The query text is regex-escaped so it is matched literally against the
    exception message.
    """
    response = XmlResponse(url="http://example.com", body="")
    x = self.sscls(response)
    xpath = "//test[@foo='bar]"
    # six dispatches to assertRaisesRegex (Python 3) or assertRaisesRegexp
    # (Python 2), avoiding the deprecated alias on Python 3.
    six.assertRaisesRegex(self, ValueError, re.escape(xpath), x.xpath, xpath)
def test_invalid_xpath_unicode(self):
    """Test *Unicode* invalid xpath raises ValueError with the invalid xpath.

    On Python 2 the message is matched against the unicode_escape encoding
    of the query; on Python 3 against the query itself.
    """
    response = XmlResponse(url="http://example.com", body="")
    x = self.sscls(response)
    xpath = u"//test[@foo='\u0431ar]"
    encoded = xpath if six.PY3 else xpath.encode('unicode_escape')
    # six dispatches to assertRaisesRegex (Python 3) or assertRaisesRegexp
    # (Python 2), avoiding the deprecated alias on Python 3.
    six.assertRaisesRegex(self, ValueError, re.escape(encoded), x.xpath, xpath)
def test_http_header_encoding_precedence(self):
    """Extract a pound sign from a UTF-8 body sent with a UTF-8 charset header.

    The document is assembled from unicode pieces, encoded as UTF-8 and
    served with a Content-Type header declaring charset=utf-8; the text
    selected from the span with id 'blank' must be the pound sign.
    """
    # u'\xa3' = pound symbol in unicode
    # u'\xc2\xa3' = pound symbol in utf-8
    # u'\xa3' = pound symbol in latin-1 (iso-8859-1)
    # NOTE(review): the markup pieces below are empty or whitespace-only
    # although the query expects a <span id="blank"> — the literals may have
    # lost their tags; verify.
    meta = u' '
    head = u'' + meta + u''
    body_content = u'\xa3 '
    body = u'' + body_content + u''
    html = u'' + head + body + u''
    encoding = 'utf-8'
    html_utf8 = html.encode(encoding)

    headers = {'Content-Type': ['text/html; charset=utf-8']}
    response = HtmlResponse(url="http://example.com", headers=headers, body=html_utf8)
    x = self.sscls(response)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(x.xpath("//span[@id='blank']/text()").extract(),
                     [u'\xa3'])
def test_empty_bodies(self):
    """Querying a response with an empty body must not raise."""
    empty_response = TextResponse('http://www.example.com', body='')
    selector = self.sscls(empty_response)
    # No assertion on the result: the test passes if nothing is raised.
    selector.xpath('//text()').extract()
def test_null_bytes(self):
    """Querying a body that contains a NUL byte must not raise."""
    response = TextResponse(
        'http://www.example.com',
        body='pre\x00post ',
        encoding='utf-8'
    )
    selector = self.sscls(response)
    # No assertion on the result: the test passes if nothing is raised.
    selector.xpath('//text()').extract()
def test_badly_encoded_body(self):
# Querying a body that is not valid in its declared encoding must not raise;
# there is no assertion on the result.
# NOTE(review): the body literal below is split across two physical lines
# inside a single-quoted string — it looks damaged; confirm against version
# control.
# \xe9 alone isn't valid utf8 sequence
r1 = TextResponse('http://www.example.com', \
body='an Jos\xe9 de
', \
encoding='utf-8')
self.sscls(r1).xpath('//text()').extract()
def test_select_on_unevaluable_nodes(self):
    """Sub-queries on text and attribute results must return empty lists.

    Uses assertEqual throughout instead of the deprecated assertEquals
    alias, matching the other tests in this file.
    """
    # NOTE(review): the fixture text carries no <span class="big"> markup
    # although the queries expect it — the literal may have lost its tags;
    # verify.
    r = self.sscls(text=u'some text ')

    # Text node
    text_nodes = r.xpath('//text()')
    self.assertEqual(text_nodes.extract(), [u'some text'])
    self.assertEqual(text_nodes.xpath('.//b').extract(), [])

    # Tag attribute
    class_attrs = r.xpath('//span/@class')
    self.assertEqual(class_attrs.extract(), [u'big'])
    self.assertEqual(class_attrs.xpath('.//text()').extract(), [])
def test_select_on_text_nodes(self):
# Uses the preceding-sibling axis in a single XPath: first as a predicate on
# text nodes, then as a step leading from text nodes to a <b> sibling.
# NOTE(review): the fixture literal below is split across three physical
# lines inside a single-quoted string and has no <div>/<b> markup left — it
# looks damaged; confirm against version control.
r = self.sscls(text=u'Options: opt1
Other opt2
')
# NOTE(review): assertEquals is a deprecated alias of assertEqual, which the
# rest of this file uses; consider switching once the fixture is restored.
x1 = r.xpath("//div/descendant::text()[preceding-sibling::b[contains(text(), 'Options')]]")
self.assertEquals(x1.extract(), [u'opt1'])
x1 = r.xpath("//div/descendant::text()/preceding-sibling::b[contains(text(), 'Options')]")
self.assertEquals(x1.extract(), [u'Options: '])
def test_nested_select_on_text_nodes(self):
# Same preceding-sibling lookup as a single-query test would do, but split
# into two chained xpath() calls starting from the text nodes.
# NOTE(review): the fixture literal below is split across three physical
# lines inside a single-quoted string — it looks damaged; confirm against
# version control.
# FIXME: does not work with lxml backend [upstream]
r = self.sscls(text=u'Options: opt1
Other opt2
')
x1 = r.xpath("//div/descendant::text()")
x2 = x1.xpath("./preceding-sibling::b[contains(text(), 'Options')]")
self.assertEquals(x2.extract(), [u'Options: '])
# Setting the .skip attribute marks the test as skipped for the trial runner,
# with the given reason.
test_nested_select_on_text_nodes.skip = "Text nodes lost parent node reference in lxml"
def test_weakref_slots(self):
    """Selectors must be weak-referenceable and must not carry a __dict__."""
    instance = self.sscls()
    # Creating the reference is the check: it fails for objects that do not
    # support weak references.
    weakref.ref(instance)
    has_instance_dict = hasattr(instance, '__dict__')
    assert not has_instance_dict, (
        "%s does not use __slots__" % instance.__class__.__name__)
def test_remove_namespaces(self):
    """//link must match nothing before remove_namespaces() and two after."""
    xml = """
"""
    response = XmlResponse("http://example.com/feed.atom", body=xml)
    sel = self.sscls(response)

    before = sel.xpath("//link")
    self.assertEqual(len(before), 0)

    sel.remove_namespaces()

    after = sel.xpath("//link")
    self.assertEqual(len(after), 2)
def test_remove_attributes_namespaces(self):
    """//link/@type must match nothing before remove_namespaces(), two after."""
    xml = """
"""
    response = XmlResponse("http://example.com/feed.atom", body=xml)
    sel = self.sscls(response)
    attribute_query = "//link/@type"

    before = sel.xpath(attribute_query)
    self.assertEqual(len(before), 0)

    sel.remove_namespaces()

    after = sel.xpath(attribute_query)
    self.assertEqual(len(after), 2)
def test_smart_strings(self):
    """Check getparent() availability with smart strings off and on.

    With the class under test, text and attribute results must not expose
    ``getparent`` on their root; with a Selector subclass that sets
    ``_lxml_smart_strings = True`` every such result must expose it.
    """
    class SmartStringsSelector(Selector):
        _lxml_smart_strings = True

    body = """
"""
    response = HtmlResponse(url="http://example.com", body=body)
    queries = ('//li/text()', '//div/@class')

    def exposes_getparent(selector):
        return hasattr(selector._root, 'getparent')

    # .getparent() is available for text nodes and attributes
    # only when smart_strings are on
    plain = self.sscls(response)
    for query in queries:
        matches = plain.xpath(query)
        self.assertFalse(any(exposes_getparent(m) for m in matches))

    smart = SmartStringsSelector(response)
    for query in queries:
        matches = smart.xpath(query)
        self.assertTrue(all(exposes_getparent(m) for m in matches))
def test_xml_entity_expansion(self):
    """The &xxe; entity reference is expected to come back unexpanded."""
    # NOTE(review): the document literal appears to have lost its DOCTYPE and
    # element markup — verify the fixture.
    malicious_xml = (
        ''
        ' ]>&xxe; '
    )
    response = XmlResponse('http://example.com', body=malicious_xml)
    sel = self.sscls(response=response)
    extracted = sel.extract()
    self.assertEqual(extracted, '&xxe; ')
# Test case for the XPathSelector, XmlXPathSelector and HtmlXPathSelector
# classes: the warnings they emit and their select()/css() behaviour.
class DeprecatedXpathSelectorTest(unittest.TestCase):
# Fixture text shared by the tests in this class via self.text.
# NOTE(review): the literal below is split across two physical lines inside a
# single-quoted string — it looks damaged; confirm against version control.
text = 'Hello
'
def test_warnings_xpathselector(self):
    """Check which uses of XPathSelector emit a warning.

    Defining a subclass and instantiating the class directly must each
    record one warning mentioning 'scrapy.Selector'; instantiating the
    user subclass must not. The class and both instances must still be
    Selector (sub)types.
    """
    cls = XPathSelector
    with warnings.catch_warnings(record=True) as caught:
        # subclassing must issue a warning
        class UserClass(cls):
            pass

        self.assertEqual(len(caught), 1, str(cls))
        self.assertIn('scrapy.Selector', str(caught[0].message))

        # subclass instance doesn't issue a warning
        usel = UserClass(text=self.text)
        self.assertEqual(len(caught), 1)

        # class instance must issue a warning
        sel = cls(text=self.text)
        recorded = str((cls, [entry.message for entry in caught]))
        self.assertEqual(len(caught), 2, recorded)
        self.assertIn('scrapy.Selector', str(caught[1].message))

        # subclass and instance checks
        self.assertTrue(issubclass(cls, Selector))
        for instance in (sel, usel):
            self.assertTrue(isinstance(instance, Selector))
def test_warnings_xmlxpathselector(self):
    """Check which uses of XmlXPathSelector emit a warning.

    Defining a subclass and instantiating the class directly must each
    record one warning mentioning 'scrapy.Selector'; instantiating the
    user subclass must not. The class and both instances must still be
    Selector and XPathSelector (sub)types.
    """
    cls = XmlXPathSelector
    with warnings.catch_warnings(record=True) as caught:
        # subclassing must issue a warning
        class UserClass(cls):
            pass

        self.assertEqual(len(caught), 1, str(cls))
        self.assertIn('scrapy.Selector', str(caught[0].message))

        # subclass instance doesn't issue a warning
        usel = UserClass(text=self.text)
        self.assertEqual(len(caught), 1)

        # class instance must issue a warning
        sel = cls(text=self.text)
        recorded = str((cls, [entry.message for entry in caught]))
        self.assertEqual(len(caught), 2, recorded)
        self.assertIn('scrapy.Selector', str(caught[1].message))

        # subclass and instance checks
        bases = (Selector, XPathSelector)
        for base in bases:
            self.assertTrue(issubclass(cls, base))
        for base in bases:
            for instance in (sel, usel):
                self.assertTrue(isinstance(instance, base))
def test_warnings_htmlxpathselector(self):
    """Check which uses of HtmlXPathSelector emit a warning.

    Defining a subclass and instantiating the class directly must each
    record one warning mentioning 'scrapy.Selector'; instantiating the
    user subclass must not. The class and both instances must still be
    Selector and XPathSelector (sub)types.
    """
    cls = HtmlXPathSelector
    with warnings.catch_warnings(record=True) as caught:
        # subclassing must issue a warning
        class UserClass(cls):
            pass

        self.assertEqual(len(caught), 1, str(cls))
        self.assertIn('scrapy.Selector', str(caught[0].message))

        # subclass instance doesn't issue a warning
        usel = UserClass(text=self.text)
        self.assertEqual(len(caught), 1)

        # class instance must issue a warning
        sel = cls(text=self.text)
        recorded = str((cls, [entry.message for entry in caught]))
        self.assertEqual(len(caught), 2, recorded)
        self.assertIn('scrapy.Selector', str(caught[1].message))

        # subclass and instance checks
        bases = (Selector, XPathSelector)
        for base in bases:
            self.assertTrue(issubclass(cls, base))
        for base in bases:
            for instance in (sel, usel):
                self.assertTrue(isinstance(instance, base))
def test_xpathselector(self):
# XPathSelector must answer select() queries and raise RuntimeError from css().
# record=True collects any warnings raised while the instance is created
# into a list instead of emitting them.
# NOTE(review): the expected literal below is split across two physical lines
# inside a single-quoted string — it looks damaged; confirm against version
# control.
with warnings.catch_warnings(record=True):
hs = XPathSelector(text=self.text)
self.assertEqual(hs.select("//div").extract(),
[u'Hello
'])
self.assertRaises(RuntimeError, hs.css, 'div')
def test_htmlxpathselector(self):
# HtmlXPathSelector must answer select() queries and raise RuntimeError from
# css().
# record=True collects any warnings raised while the instance is created
# into a list instead of emitting them.
# NOTE(review): the expected literal below is split across two physical lines
# inside a single-quoted string — it looks damaged; confirm against version
# control.
with warnings.catch_warnings(record=True):
hs = HtmlXPathSelector(text=self.text)
self.assertEqual(hs.select("//div").extract(),
[u'Hello
'])
self.assertRaises(RuntimeError, hs.css, 'div')
def test_xmlxpathselector(self):
# XmlXPathSelector must answer select() queries and raise RuntimeError from
# css().
# record=True collects any warnings raised while the instance is created
# into a list instead of emitting them.
# NOTE(review): the expected literal below is split across two physical lines
# inside a single-quoted string — it looks damaged; confirm against version
# control.
with warnings.catch_warnings(record=True):
xs = XmlXPathSelector(text=self.text)
self.assertEqual(xs.select("//div").extract(),
[u'Hello
'])
self.assertRaises(RuntimeError, xs.css, 'div')
# Test case for the EXSLT extensions usable in XPath queries: the re:
# (regular expression) and set: (set manipulation) function prefixes.
class ExsltTestCase(unittest.TestCase):
# Selector class under test; the test methods build selectors via self.sscls.
sscls = Selector
def test_regexp(self):
    """EXSLT regular expression tests.

    Exercises re:test(), re:match() and re:replace() inside XPath queries.
    Query literals that embed regex backslashes are raw strings, so the
    backslashes are not treated as (invalid) string escape sequences; the
    runtime value of every query is unchanged.
    """
    # NOTE(review): the fixture body holds only a newline although the
    # queries expect <input> and <a> elements — the markup may have been
    # lost; verify.
    body = """
"""
    response = TextResponse(url="http://example.com", body=body)
    sel = self.sscls(response)

    # re:test()
    self.assertEqual(
        sel.xpath(
            '//input[re:test(@name, "[A-Z]+", "i")]').extract(),
        [x.extract() for x in sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]')])
    self.assertEqual(
        [x.extract()
         for x in sel.xpath(
             r'//a[re:test(@href, "\.html$")]/text()')],
        [u'first link', u'second link'])
    self.assertEqual(
        [x.extract()
         for x in sel.xpath(
             '//a[re:test(@href, "first")]/text()')],
        [u'first link'])
    self.assertEqual(
        [x.extract()
         for x in sel.xpath(
             '//a[re:test(@href, "second")]/text()')],
        [u'second link'])

    # re:match() is rather special: it returns a node-set of nodes
    #[u'http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml ',
    #u'http ',
    #u'www.bayes.co.uk ',
    #u' ',
    #u'/xml/index.xml?/xml/utils/rechecker.xml ']
    self.assertEqual(
        sel.xpath(r're:match(//a[re:test(@href, "\.xml$")]/@href,'
                  r'"(\w+):\/\/([^/:]+)(:\d*)?([^# ]*)")/text()').extract(),
        [u'http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml',
         u'http',
         u'www.bayes.co.uk',
         u'',
         u'/xml/index.xml?/xml/utils/rechecker.xml'])

    # re:replace()
    # In the raw literal the group reference is written with one backslash;
    # the original non-raw literal needed two to produce the same text.
    self.assertEqual(
        sel.xpath(r're:replace(//a[re:test(@href, "\.xml$")]/@href,'
                  r'"(\w+)://(.+)(\.xml)", "","https://\2.html")').extract(),
        [u'https://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.html'])
def test_set(self):
    """EXSLT set manipulation tests.

    First collects every @itemprop under the Event item, then uses
    set:difference() to drop those that belong to nested item scopes.
    """
    # microdata example from http://schema.org/Event
    # NOTE(review): the fixture body holds only a newline although the
    # queries expect microdata markup — it may have been lost; verify.
    body = """
"""
    response = TextResponse(url="http://example.com", body=body)
    sel = self.sscls(response)

    # Continuation lines of the query literals stay at column 0 so the query
    # text is exactly what it was.
    every_itemprop = sel.xpath('''//div[@itemtype="http://schema.org/Event"]
//@itemprop''').extract()
    self.assertEqual(
        every_itemprop,
        [u'url',
         u'name',
         u'startDate',
         u'location',
         u'url',
         u'address',
         u'addressLocality',
         u'addressRegion',
         u'offers',
         u'lowPrice',
         u'offerCount']
    )

    difference_query = '''
set:difference(//div[@itemtype="http://schema.org/Event"]
//@itemprop,
//div[@itemtype="http://schema.org/Event"]
//*[@itemscope]/*/@itemprop)'''
    own_itemprops = sel.xpath(difference_query).extract()
    self.assertEqual(
        own_itemprops,
        [u'url', u'name', u'startDate', u'location', u'offers'])