diff --git a/scrapy/trunk/scrapy/tests/test_xpath.py b/scrapy/trunk/scrapy/tests/test_xpath.py
index 481ea4293..e58aad374 100644
--- a/scrapy/trunk/scrapy/tests/test_xpath.py
+++ b/scrapy/trunk/scrapy/tests/test_xpath.py
@@ -5,8 +5,8 @@ import unittest
 import libxml2
 
 from scrapy.http import Response
-from scrapy.xpath.selector import XPathSelector
-from scrapy.xpath.constructors import xmlDoc_from_xml
+from scrapy.xpath.selector import XPathSelector, XmlXPathSelector, HtmlXPathSelector
+#from scrapy.xpath.constructors import xmlDoc_from_xml, xmlDoc_from_html
 from scrapy.xpath.iterator import XMLNodeIterator
 
 class XPathTestCase(unittest.TestCase):
@@ -23,7 +23,7 @@ class XPathTestCase(unittest.TestCase):
         """Simple selector tests"""
         body = "<p><input name='a' value='1'/><input name='b' value='2'/></p>"
" response = Response(domain="example.com", url="http://example.com", body=body) - xpath = XPathSelector(response) + xpath = HtmlXPathSelector(response) xl = xpath.x('//input') self.assertEqual(2, len(xl)) @@ -59,7 +59,7 @@ class XPathTestCase(unittest.TestCase): """ response = Response(domain="example.com", url="http://example.com", body=body) - x = XPathSelector(response) + x = HtmlXPathSelector(response) divtwo = x.x('//div[@class="two"]') self.assertEqual(divtwo.x("//li").extract(), @@ -84,7 +84,7 @@ class XPathTestCase(unittest.TestCase): """ response = Response(domain="example.com", url="http://example.com", body=body) - x = XPathSelector(response) + x = HtmlXPathSelector(response) name_re = re.compile("Name: (\w+)") self.assertEqual(x.x("//ul/li").re(name_re), @@ -92,6 +92,20 @@ class XPathTestCase(unittest.TestCase): self.assertEqual(x.x("//ul/li").re("Age: (\d+)"), ["10", "20"]) + def test_selector_over_text(self): + hxs = HtmlXPathSelector(text='lala') + self.assertEqual(hxs.extract(), + u'lala') + + xxs = XmlXPathSelector(text='lala') + self.assertEqual(xxs.extract(), + u'lala') + + xxs = XmlXPathSelector(text='lala') + self.assertEqual(xxs.x('.').extract(), + [u'lala']) + + def test_selector_namespaces_simple(self): body = """ @@ -101,7 +115,7 @@ class XPathTestCase(unittest.TestCase): """ response = Response(domain="example.com", url="http://example.com", body=body) - x = XPathSelector(response, constructor=xmlDoc_from_xml) + x = XmlXPathSelector(response) x.register_namespace("somens", "http://scrapy.org") self.assertEqual(x.x("//somens:a").extract(), @@ -119,7 +133,7 @@ class XPathTestCase(unittest.TestCase): """ response = Response(domain="example.com", url="http://example.com", body=body) - x = XPathSelector(response, constructor=xmlDoc_from_xml) + x = XmlXPathSelector(response) x.register_namespace("xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05") x.register_namespace("p", "http://www.scrapy.org/product") @@ -146,10 +160,19 @@ class XPathTestCase(unittest.TestCase): headers = {'Content-Type': ['text/html; charset=utf-8']} response = Response(domain="example.com", url="http://example.com", headers=headers, body=html_utf8) - x = XPathSelector(response) + x = HtmlXPathSelector(response) self.assertEquals(x.x("//span[@id='blank']/text()").extract(), [u'\xa3']) + def test_null_bytes(self): + hxs = HtmlXPathSelector(text='la\x00la') + self.assertEqual(hxs.extract(), + u'lala') + + xxs = XmlXPathSelector(text='la\x00la') + self.assertEqual(xxs.extract(), + u'lala') + def test_iterator(self): body = """ diff --git a/scrapy/trunk/scrapy/xpath/constructors.py b/scrapy/trunk/scrapy/xpath/constructors.py index 95eedf5d8..e6ca7fdcc 100644 --- a/scrapy/trunk/scrapy/xpath/constructors.py +++ b/scrapy/trunk/scrapy/xpath/constructors.py @@ -24,5 +24,8 @@ def xmlDoc_from_html(response): def xmlDoc_from_xml(response): """Return libxml2 doc for XMLs""" - return libxml2.readDoc(response.body.to_string('utf-8'), response.url, 'utf-8', xml_parser_options) - + try: + lxdoc = libxml2.readDoc(response.body.to_string('utf-8'), response.url, 'utf-8', xml_parser_options) + except TypeError: # libxml2 doesn't parse text with null bytes + lxdoc = libxml2.readDoc(response.body.to_string('utf-8').replace("\x00", ""), response.url, 'utf-8', xml_parser_options) + return lxdoc diff --git a/scrapy/trunk/scrapy/xpath/iterator.py b/scrapy/trunk/scrapy/xpath/iterator.py index c216cad22..5d483914e 100644 --- a/scrapy/trunk/scrapy/xpath/iterator.py +++ 
@@ -4,8 +4,8 @@ from cStringIO import StringIO
 
 import libxml2
 
-from scrapy.xpath.constructors import xml_parser_options, xmlDoc_from_xml
-from scrapy.xpath.selector import XPathSelector
+from scrapy.xpath.constructors import xml_parser_options
+from scrapy.xpath.selector import XmlXPathSelector
 
 class XMLNodeIterator(object):
     """XMLNodeIterator provides a way to iterate over all nodes of the same
@@ -63,7 +63,7 @@ class XMLNodeSAXParser():
         if name == self.requested_nodename:
             self.inside_requested_node = False
             string = ''.join([self.xml_declaration, self.buffer.getvalue()])
-            selector = XPathSelector(text=string, constructor=xmlDoc_from_xml).x('/' + self.requested_nodename)[0]
+            selector = XmlXPathSelector(text=string).x('/' + self.requested_nodename)[0]
             self.selectors.append(selector)
 
     def characters(self, data):
diff --git a/scrapy/trunk/scrapy/xpath/selector.py b/scrapy/trunk/scrapy/xpath/selector.py
index 85d0d8762..70e007457 100644
--- a/scrapy/trunk/scrapy/xpath/selector.py
+++ b/scrapy/trunk/scrapy/xpath/selector.py
@@ -2,22 +2,20 @@ import libxml2
 
 from scrapy.http import Response
 from scrapy.xpath.extension import Libxml2Document
-from scrapy.xpath.constructors import xmlDoc_from_html
+from scrapy.xpath.constructors import xmlDoc_from_html, xmlDoc_from_xml
 from scrapy.utils.python import flatten
 from scrapy.utils.misc import extract_regex
 
 class XPathSelector(object):
-    """Provides an easy way for selecting document parts using XPaths and
-    regexs, it also supports nested queries.
-
-    Usage example (untested code):
-
-    x = XPathSelector(response)
-    i = ScrapedItem()
-    i.assign("name", x.x("//h2/text()"))
-    i.assign("features", x.x("//div[@class='features']).x("./span/text()")
-    """
+    """The XPathSelector class provides a convenient way for selecting document
+    parts using XPaths and regexs, with support for nested queries.
+
+    Although this is not an abstract class, you usually instantiate one of its
+    children:
+
+    - XmlXPathSelector (for XML content)
+    - HtmlXPathSelector (for HTML content)
+    """
 
     def __init__(self, response=None, text=None, node=None, parent=None, expr=None, constructor=xmlDoc_from_html):
         if parent:
@@ -36,6 +34,8 @@ class XPathSelector(object):
         self.expr = expr
 
     def x(self, xpath):
+        """Perform the given XPath query on the current XPathSelector and
+        return a XPathSelectorList of the result"""
         if hasattr(self.xmlNode, 'xpathEval'):
             self.doc.xpathContext.setContextNode(self.xmlNode)
             xpath_result = self.doc.xpathContext.xpathEval(xpath)
@@ -47,13 +47,20 @@ class XPathSelector(object):
             return XPathSelectorList([])
 
     def re(self, regex):
+        """Return a list of unicode strings by applying the regex over all
+        current XPath selections, and flattening the results"""
         return extract_regex(regex, self.extract(), 'utf-8')
 
-    def extract(self, **kwargs):
+    def extract(self):
+        """Return a unicode string of the content referenced by the XPathSelector"""
         if isinstance(self.xmlNode, basestring):
             text = unicode(self.xmlNode, 'utf-8', errors='ignore')
-        elif hasattr(self.xmlNode, 'xpathEval'):
-            if isinstance(self.xmlNode, libxml2.xmlAttr):
+        elif hasattr(self.xmlNode, 'serialize'):
+            if isinstance(self.xmlNode, libxml2.xmlDoc):
+                data = self.xmlNode.getRootElement().serialize('utf-8')
+                text = unicode(data, 'utf-8', errors='ignore') if data else u''
+            elif isinstance(self.xmlNode, libxml2.xmlAttr):
+                # serialization doesn't work sometimes for xmlAttr types
                 text = unicode(self.xmlNode.content, errors='ignore')
             else:
                 data = self.xmlNode.serialize('utf-8')
@@ -66,6 +73,7 @@ class XPathSelector(object):
         return text
 
     def register_namespace(self, prefix, uri):
+        """Register namespace so that it can be used in XPath queries"""
         self.doc.xpathContext.xpathRegisterNs(prefix, uri)
 
     def __str__(self):
@@ -75,13 +83,29 @@ class XPathSelector(object):
 
 
 class XPathSelectorList(list):
-
-    def extract(self, **kwargs):
-        return [x.extract(**kwargs) if isinstance(x, XPathSelector) else x for x in self]
+    """List of XPathSelector objects"""
 
     def x(self, xpath):
+        """Perform the given XPath query on each XPathSelector of the list and
+        return a new (flattened) XPathSelectorList of the results"""
         return XPathSelectorList(flatten([x.x(xpath) for x in self]))
 
     def re(self, regex):
+        """Perform the re() method on each XPathSelector of the list, and
+        return the result as a flattened list of unicode strings"""
         return flatten([x.re(regex) for x in self])
 
+    def extract(self):
+        """Return a list of unicode strings with the content referenced by each
+        XPathSelector of the list"""
+        return [x.extract() if isinstance(x, XPathSelector) else x for x in self]
+
+class XmlXPathSelector(XPathSelector):
+    """XPathSelector for XML content"""
+    def __init__(self, response=None, text=None):
+        XPathSelector.__init__(self, response=response, text=text, constructor=xmlDoc_from_xml)
+
+class HtmlXPathSelector(XPathSelector):
+    """XPathSelector for HTML content"""
+    def __init__(self, response=None, text=None):
+        XPathSelector.__init__(self, response=response, text=text, constructor=xmlDoc_from_html)
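
Usage sketch (not part of the patch): a minimal example of how the new XmlXPathSelector and HtmlXPathSelector classes are meant to be used. The class names, the Response arguments and the x()/re()/extract() calls come from this diff and its tests; the HTML body and the results shown in comments are illustrative, not taken verbatim from the test suite.

    from scrapy.http import Response
    from scrapy.xpath.selector import HtmlXPathSelector, XmlXPathSelector

    # HTML content: build the selector from a Response object
    body = "<ul><li>Name: John</li><li>Name: Paul</li></ul>"
    response = Response(domain="example.com", url="http://example.com", body=body)
    hxs = HtmlXPathSelector(response)
    hxs.x("//ul/li/text()").extract()   # [u'Name: John', u'Name: Paul']
    hxs.x("//ul/li").re("Name: (\w+)")  # [u'John', u'Paul']

    # XML content: selectors can also be built from raw text
    xxs = XmlXPathSelector(text='<root>lala</root>')
    xxs.x('/root/text()').extract()     # [u'lala']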