mirror of https://github.com/scrapy/scrapy.git
commit 155ea08ea1 (parent 1abb1af0c6)

    use sel name for Selector's instances in docs, internals and shell
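The change is a pure rename: every conventional ``ss`` variable bound to a ``Selector`` becomes ``sel``, with no behavioural difference. A minimal sketch of the convention after this commit (the spider class and URL below are illustrative, not taken from the diff):

    from scrapy.spider import BaseSpider
    from scrapy.selector import Selector

    class ExampleSpider(BaseSpider):
        """Illustrative spider; only the variable name changes in this commit."""
        name = 'example'
        start_urls = ['http://www.example.com/']

        def parse(self, response):
            # old convention: ss = Selector(response)
            sel = Selector(response)  # new convention
            for href in sel.xpath('//a/@href').extract():
                self.log('found link: %s' % href)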
@@ -143,12 +143,12 @@ Finally, here's the spider code::
     rules = [Rule(SgmlLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]
 
     def parse_torrent(self, response):
-        ss = Selector(response)
+        sel = Selector(response)
         torrent = TorrentItem()
         torrent['url'] = response.url
-        torrent['name'] = ss.xpath("//h1/text()").extract()
-        torrent['description'] = ss.xpath("//div[@id='description']").extract()
-        torrent['size'] = ss.xpath("//div[@id='info-left']/p[2]/text()[2]").extract()
+        torrent['name'] = sel.xpath("//h1/text()").extract()
+        torrent['description'] = sel.xpath("//div[@id='description']").extract()
+        torrent['size'] = sel.xpath("//div[@id='info-left']/p[2]/text()[2]").extract()
         return torrent
 
 For brevity's sake, we intentionally left out the import statements. The
@@ -255,7 +255,7 @@ This is what the shell looks like::
 
     [s] Available Scrapy objects:
     [s] 2010-08-19 21:45:59-0300 [default] INFO: Spider closed (finished)
-    [s] ss <Selector (http://www.dmoz.org/Computers/Programming/Languages/Python/Books/) xpath=None>
+    [s] sel <Selector (http://www.dmoz.org/Computers/Programming/Languages/Python/Books/) xpath=None>
     [s] item Item()
     [s] request <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
     [s] response <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
@@ -271,25 +271,25 @@ After the shell loads, you will have the response fetched in a local
 ``response`` variable, so if you type ``response.body`` you will see the body
 of the response, or you can type ``response.headers`` to see its headers.
 
-The shell also pre-instantiate a selector for this response in variable ``ss``,
+The shell also pre-instantiate a selector for this response in variable ``sel``,
 the selector automatically chooses the best parsing rules (XML vs HTML) based
 on response's type.
 
 So let's try it::
 
-    In [1]: ss.xpath('//title')
+    In [1]: sel.xpath('//title')
     Out[1]: [<Selector (title) xpath=//title>]
 
-    In [2]: ss.xpath('//title').extract()
+    In [2]: sel.xpath('//title').extract()
     Out[2]: [u'<title>Open Directory - Computers: Programming: Languages: Python: Books</title>']
 
-    In [3]: ss.xpath('//title/text()')
+    In [3]: sel.xpath('//title/text()')
     Out[3]: [<Selector (text) xpath=//title/text()>]
 
-    In [4]: ss.xpath('//title/text()').extract()
+    In [4]: sel.xpath('//title/text()').extract()
     Out[4]: [u'Open Directory - Computers: Programming: Languages: Python: Books']
 
-    In [5]: ss.xpath('//title/text()').re('(\w+):')
+    In [5]: sel.xpath('//title/text()').re('(\w+):')
     Out[5]: [u'Computers', u'Programming', u'Languages', u'Python']
 
 Extracting the data
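The auto-detection mentioned in this hunk is driven by the response class, exactly as the ``test_flavor_detection`` change further down in this diff shows; a short sketch of the idea, outside the shell:

    from scrapy.http import HtmlResponse, XmlResponse
    from scrapy.selector import Selector

    text = '<div><img src="a.jpg"><p>Hello</div>'

    # The same markup parses differently depending on the response type:
    print Selector(XmlResponse('http://example.com', body=text)).contenttype   # 'xml'
    print Selector(HtmlResponse('http://example.com', body=text)).contenttype  # 'html'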
@@ -309,25 +309,25 @@ is inside a ``<ul>`` element, in fact the *second* ``<ul>`` element.
 So we can select each ``<li>`` element belonging to the sites list with this
 code::
 
-    ss.xpath('//ul/li')
+    sel.xpath('//ul/li')
 
 And from them, the sites descriptions::
 
-    ss.xpath('//ul/li/text()').extract()
+    sel.xpath('//ul/li/text()').extract()
 
 The sites titles::
 
-    ss.xpath('//ul/li/a/text()').extract()
+    sel.xpath('//ul/li/a/text()').extract()
 
 And the sites links::
 
-    ss.xpath('//ul/li/a/@href').extract()
+    sel.xpath('//ul/li/a/@href').extract()
 
 As we said before, each ``.xpath()`` call returns a list of selectors, so we can
 concatenate further ``.xpath()`` calls to dig deeper into a node. We are going to use
 that property here, so::
 
-    sites = ss.xpath('//ul/li')
+    sites = sel.xpath('//ul/li')
     for site in sites:
         title = site.xpath('a/text()').extract()
         link = site.xpath('a/@href').extract()
@@ -355,8 +355,8 @@ Let's add this code to our spider::
     ]
 
     def parse(self, response):
-        ss = Selector(response)
-        sites = ss.xpath('//ul/li')
+        sel = Selector(response)
+        sites = sel.xpath('//ul/li')
         for site in sites:
             title = site.xpath('a/text()').extract()
             link = site.xpath('a/@href').extract()
@@ -398,8 +398,8 @@ scraped so far, the final code for our Spider would be like this::
     ]
 
     def parse(self, response):
-        ss = Selector(response)
-        sites = ss.xpath('//ul/li')
+        sel = Selector(response)
+        sites = sel.xpath('//ul/li')
         items = []
         for site in sites:
             item = DmozItem()
@@ -146,10 +146,10 @@ that have that grey colour of the links,
 Finally, we can write our ``parse_category()`` method::
 
     def parse_category(self, response):
-        ss = Selector(response)
+        sel = Selector(response)
 
         # The path to website links in directory page
-        links = ss.xpath('//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font')
+        links = sel.xpath('//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font')
 
         for link in links:
             item = DirectoryItem()
@@ -62,13 +62,13 @@ body is what they're going to be "selecting"::
     class MySpider(BaseSpider):
         # ...
         def parse(self, response):
-            ss = Selector(response)
+            sel = Selector(response)
             # Using XPath query
-            print ss.xpath('//p')
+            print sel.xpath('//p')
             # Using CSS query
-            print ss.css('p')
+            print sel.css('p')
             # Nesting queries
-            print ss.xpath('//div[@foo="bar"]').css('span#bold')
+            print sel.xpath('//div[@foo="bar"]').css('span#bold')
 
 
 Using selectors
@@ -94,7 +94,7 @@ First, let's open the shell::
     scrapy shell http://doc.scrapy.org/en/latest/_static/selectors-sample1.html
 
 Then, after the shell loads, you'll have a selector already instantiated and
-ready to use in ``ss`` shell variable.
+ready to use in ``sel`` shell variable.
 
 Since we're dealing with HTML, the selector will automatically use an HTML parser.
 
@@ -104,7 +104,7 @@ So, by looking at the :ref:`HTML code <topics-selectors-htmlcode>` of that
 page, let's construct an XPath (using an HTML selector) for selecting the text
 inside the title tag::
 
-    >>> ss.xpath('//title/text()')
+    >>> sel.xpath('//title/text()')
     [<Selector (text) xpath=//title/text()>]
 
 As you can see, the ``.xpath()`` method returns an
@@ -114,45 +114,45 @@ selectors. This API can be used quickly for extracting nested data.
 To actually extract the textual data, you must call the selector ``.extract()``
 method, as follows::
 
-    >>> ss.xpath('//title/text()').extract()
+    >>> sel.xpath('//title/text()').extract()
     [u'Example website']
 
 Notice that CSS selectors can select text or attribute nodes using CSS3
 pseudo-elements::
 
-    >>> ss.css('title::text').extract()
+    >>> sel.css('title::text').extract()
     [u'Example website']
 
 Now we're going to get the base URL and some image links::
 
-    >>> ss.xpath('//base/@href').extract()
+    >>> sel.xpath('//base/@href').extract()
     [u'http://example.com/']
 
-    >>> ss.css('base::attr(href)').extract()
+    >>> sel.css('base::attr(href)').extract()
     [u'http://example.com/']
 
-    >>> ss.xpath('//a[contains(@href, "image")]/@href').extract()
+    >>> sel.xpath('//a[contains(@href, "image")]/@href').extract()
     [u'image1.html',
      u'image2.html',
     u'image3.html',
     u'image4.html',
     u'image5.html']
 
-    >>> ss.css('a[href*=image]::attr(href)').extract()
+    >>> sel.css('a[href*=image]::attr(href)').extract()
     [u'image1.html',
     u'image2.html',
     u'image3.html',
     u'image4.html',
     u'image5.html']
 
-    >>> ss.xpath('//a[contains(@href, "image")]/img/@src').extract()
+    >>> sel.xpath('//a[contains(@href, "image")]/img/@src').extract()
     [u'image1_thumb.jpg',
     u'image2_thumb.jpg',
    u'image3_thumb.jpg',
    u'image4_thumb.jpg',
    u'image5_thumb.jpg']
 
-    >>> ss.css('a[href*=image] img::attr(src)').extract()
+    >>> sel.css('a[href*=image] img::attr(src)').extract()
     [u'image1_thumb.jpg',
     u'image2_thumb.jpg',
     u'image3_thumb.jpg',
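The ``::text`` and ``::attr()`` pseudo-elements used in this hunk are Scrapy extensions to CSS3, and each CSS query above has an XPath twin that returns the same data. A sketch of that equivalence (the markup is a tiny stand-in for the selectors-sample1.html page, not the real file):

    from scrapy.http import HtmlResponse
    from scrapy.selector import Selector

    body = ('<html><head><base href="http://example.com/">'
            '<title>Example website</title></head></html>')
    sel = Selector(HtmlResponse('http://example.com', body=body))

    # CSS pseudo-elements and their XPath equivalents extract the same values:
    assert sel.css('title::text').extract() == sel.xpath('//title/text()').extract()
    assert sel.css('base::attr(href)').extract() == sel.xpath('//base/@href').extract()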
@@ -168,7 +168,7 @@ The selection methods (``.xpath()`` or ``.css()``) returns a list of selectors
 of the same type, so you can call the selection methods for those selectors
 too. Here's an example::
 
-    >>> links = ss.xpath('//a[contains(@href, "image")]')
+    >>> links = sel.xpath('//a[contains(@href, "image")]')
     >>> links.extract()
     [u'<a href="image1.html">Name: My image 1 <br><img src="image1_thumb.jpg"></a>',
     u'<a href="image2.html">Name: My image 2 <br><img src="image2_thumb.jpg"></a>',
@@ -197,7 +197,7 @@ can't construct nested ``.re()`` calls.
 Here's an example used to extract images names from the :ref:`HTML code
 <topics-selectors-htmlcode>` above::
 
-    >>> ss.xpath('//a[contains(@href, "image")]/text()').re(r'Name:\s*(.*)')
+    >>> sel.xpath('//a[contains(@href, "image")]/text()').re(r'Name:\s*(.*)')
     [u'My image 1',
     u'My image 2',
     u'My image 3',
@@ -216,7 +216,7 @@ with ``/``, that XPath will be absolute to the document and not relative to the
 For example, suppose you want to extract all ``<p>`` elements inside ``<div>``
 elements. First, you would get all ``<div>`` elements::
 
-    >>> divs = ss.xpath('//div')
+    >>> divs = sel.xpath('//div')
 
 At first, you may be tempted to use the following approach, which is wrong, as
 it actually extracts all ``<p>`` elements from the document, not only those
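The fix the surrounding docs go on to describe is making the inner query relative with a leading ``.``; a sketch of the wrong and right forms (illustrative markup):

    from scrapy.http import HtmlResponse
    from scrapy.selector import Selector

    body = '<html><body><div><p>inside</p></div><p>outside</p></body></html>'
    sel = Selector(HtmlResponse('http://example.com', body=body))

    divs = sel.xpath('//div')
    for d in divs:
        print d.xpath('//p').extract()   # wrong: absolute, matches ALL <p> in the document
        print d.xpath('.//p').extract()  # right: relative to each <div>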
@@ -83,7 +83,7 @@ Those objects are:
 * ``response`` - a :class:`~scrapy.http.Response` object containing the last
   fetched page
 
-* ``ss`` - a :class:`~scrapy.selector.Selector` object constructed
+* ``sel`` - a :class:`~scrapy.selector.Selector` object constructed
   with the last response fetched
 
 * ``settings`` - the current :ref:`Scrapy settings <topics-settings>`
@@ -111,7 +111,7 @@ list of available objects and useful shortcuts (you'll notice that these lines
 all start with the ``[s]`` prefix)::
 
     [s] Available objects
-    [s] ss <Selector (http://scrapy.org) xpath=None>
+    [s] sel <Selector (http://scrapy.org) xpath=None>
     [s] item Item()
     [s] request <http://scrapy.org>
     [s] response <http://scrapy.org>
@@ -126,12 +126,12 @@ all start with the ``[s]`` prefix)::
 
 After that, we can star playing with the objects::
 
-    >>> ss.xpath("//h2/text()").extract()[0]
+    >>> sel.xpath("//h2/text()").extract()[0]
     u'Welcome to Scrapy'
 
     >>> fetch("http://slashdot.org")
     [s] Available Scrapy objects:
-    [s] ss <Selector (http://slashdot.org) xpath=None>
+    [s] sel <Selector (http://slashdot.org) xpath=None>
     [s] item JobItem()
     [s] request <GET http://slashdot.org>
     [s] response <200 http://slashdot.org>
@@ -142,7 +142,7 @@ After that, we can star playing with the objects::
     [s] fetch(req_or_url) Fetch request (or URL) and update local objects
     [s] view(response)    View response in a browser
 
-    >>> ss.xpath("//h2/text()").extract()
+    >>> sel.xpath("//h2/text()").extract()
     [u'News for nerds, stuff that matters']
 
     >>> request = request.replace(method="POST")
@@ -180,7 +180,7 @@ When you run the spider, you will get something similar to this::
     2009-08-27 19:15:25-0300 [example.com] DEBUG: Crawled <http://www.example.com/> (referer: <None>)
     2009-08-27 19:15:26-0300 [example.com] DEBUG: Crawled <http://www.example.com/products.php> (referer: <http://www.example.com/>)
     [s] Available objects
-    [s] ss <Selector (http://www.example.com/products.php) xpath=None>
+    [s] sel <Selector (http://www.example.com/products.php) xpath=None>
     ...
 
     >>> response.url
@@ -188,7 +188,7 @@ When you run the spider, you will get something similar to this::
 
 Then, you can check if the extraction code is working::
 
-    >>> ss.xpath('//h1')
+    >>> sel.xpath('//h1')
     []
 
 Nope, it doesn't. So you can open the response in your web browser and see if
@@ -231,11 +231,11 @@ Another example returning multiples Requests and Items from a single callback::
     ]
 
     def parse(self, response):
-        ss = Selector(response)
-        for h3 in ss.xpath('//h3').extract():
+        sel = Selector(response)
+        for h3 in sel.xpath('//h3').extract():
             yield MyItem(title=h3)
 
-        for url in ss.xpath('//a/@href').extract():
+        for url in sel.xpath('//a/@href').extract():
             yield Request(url, callback=self.parse)
 
 .. module:: scrapy.contrib.spiders
@@ -334,11 +334,11 @@ Let's now take a look at an example CrawlSpider with rules::
     def parse_item(self, response):
         self.log('Hi, this is an item page! %s' % response.url)
 
-        ss = Selector(response)
+        sel = Selector(response)
         item = Item()
-        item['id'] = ss.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
-        item['name'] = ss.xpath('//td[@id="item_name"]/text()').extract()
-        item['description'] = ss.xpath('//td[@id="item_description"]/text()').extract()
+        item['id'] = sel.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
+        item['name'] = sel.xpath('//td[@id="item_name"]/text()').extract()
+        item['description'] = sel.xpath('//td[@id="item_description"]/text()').extract()
         return item
 
 
@@ -116,11 +116,11 @@ class SgmlLinkExtractor(BaseSgmlLinkExtractor):
     def extract_links(self, response):
         base_url = None
         if self.restrict_xpaths:
-            ss = Selector(response)
+            sel = Selector(response)
             base_url = get_base_url(response)
             body = u''.join(f
                             for x in self.restrict_xpaths
-                            for f in ss.xpath(x).extract()
+                            for f in sel.xpath(x).extract()
                             ).encode(response.encoding)
         else:
             body = response.body
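For context, ``restrict_xpaths`` is the only path through ``extract_links()`` that builds a Selector at all. A hedged usage sketch (the XPath and markup are illustrative, not from the diff):

    from scrapy.http import HtmlResponse
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

    body = '<div id="content"><a href="page.html">in</a></div><a href="other.html">out</a>'
    response = HtmlResponse('http://example.com/', body=body)

    # Only links inside the restricted region are considered; internally this
    # branch now builds `sel = Selector(response)` as in the hunk above.
    lx = SgmlLinkExtractor(restrict_xpaths=['//div[@id="content"]'])
    print [l.url for l in lx.extract_links(response)]  # ['http://example.com/page.html']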
@@ -17,7 +17,7 @@ from scrapy.utils.misc import load_object
 from scrapy.utils.response import open_in_browser
 from scrapy.utils.console import start_python_console
 from scrapy.settings import Settings
-from scrapy.http import Request, Response, HtmlResponse, XmlResponse
+from scrapy.http import Request, Response
 from scrapy.exceptions import IgnoreRequest
 
 
@@ -95,7 +95,7 @@ class Shell(object):
         self.vars['spider'] = spider
         self.vars['request'] = request
         self.vars['response'] = response
-        self.vars['ss'] = Selector(response)
+        self.vars['sel'] = Selector(response)
         if self.inthread:
             self.vars['fetch'] = self.fetch
         self.vars['view'] = open_in_browser
@@ -31,7 +31,7 @@ class ShellTest(ProcessTest, SiteTest, unittest.TestCase):
 
     @defer.inlineCallbacks
     def test_response_selector_html(self):
-        xpath = 'ss.xpath("//p[@class=\'one\']/text()").extract()[0]'
+        xpath = 'sel.xpath("//p[@class=\'one\']/text()").extract()[0]'
         _, out, _ = yield self.execute([self.url('/html'), '-c', xpath])
         self.assertEqual(out.strip(), 'Works')
 
@@ -16,31 +16,31 @@ class SelectorTestCase(unittest.TestCase):
         """Simple selector tests"""
         body = "<p><input name='a'value='1'/><input name='b'value='2'/></p>"
         response = TextResponse(url="http://example.com", body=body)
-        ss = self.sscls(response)
+        sel = self.sscls(response)
 
-        xl = ss.xpath('//input')
+        xl = sel.xpath('//input')
         self.assertEqual(2, len(xl))
         for x in xl:
             assert isinstance(x, self.sscls)
 
-        self.assertEqual(ss.xpath('//input').extract(),
-                         [x.extract() for x in ss.xpath('//input')])
+        self.assertEqual(sel.xpath('//input').extract(),
+                         [x.extract() for x in sel.xpath('//input')])
 
-        self.assertEqual([x.extract() for x in ss.xpath("//input[@name='a']/@name")],
+        self.assertEqual([x.extract() for x in sel.xpath("//input[@name='a']/@name")],
                          [u'a'])
-        self.assertEqual([x.extract() for x in ss.xpath("number(concat(//input[@name='a']/@value, //input[@name='b']/@value))")],
+        self.assertEqual([x.extract() for x in sel.xpath("number(concat(//input[@name='a']/@value, //input[@name='b']/@value))")],
                          [u'12.0'])
 
-        self.assertEqual(ss.xpath("concat('xpath', 'rules')").extract(),
+        self.assertEqual(sel.xpath("concat('xpath', 'rules')").extract(),
                          [u'xpathrules'])
-        self.assertEqual([x.extract() for x in ss.xpath("concat(//input[@name='a']/@value, //input[@name='b']/@value)")],
+        self.assertEqual([x.extract() for x in sel.xpath("concat(//input[@name='a']/@value, //input[@name='b']/@value)")],
                          [u'12'])
 
     def test_select_unicode_query(self):
         body = u"<p><input name='\xa9' value='1'/></p>"
         response = TextResponse(url="http://example.com", body=body, encoding='utf8')
-        ss = self.sscls(response)
-        self.assertEqual(ss.xpath(u'//input[@name="\xa9"]/@value').extract(), [u'1'])
+        sel = self.sscls(response)
+        self.assertEqual(sel.xpath(u'//input[@name="\xa9"]/@value').extract(), [u'1'])
 
     def test_list_elements_type(self):
         """Test Selector returning the same type in selection methods"""
@@ -69,14 +69,14 @@ class SelectorTestCase(unittest.TestCase):
 
     def test_flavor_detection(self):
         text = '<div><img src="a.jpg"><p>Hello</div>'
-        ss = self.sscls(XmlResponse('http://example.com', body=text))
-        self.assertEqual(ss.contenttype, 'xml')
-        self.assertEqual(ss.xpath("//div").extract(),
+        sel = self.sscls(XmlResponse('http://example.com', body=text))
+        self.assertEqual(sel.contenttype, 'xml')
+        self.assertEqual(sel.xpath("//div").extract(),
                          [u'<div><img src="a.jpg"><p>Hello</p></img></div>'])
 
-        ss = self.sscls(HtmlResponse('http://example.com', body=text))
-        self.assertEqual(ss.contenttype, 'html')
-        self.assertEqual(ss.xpath("//div").extract(),
+        sel = self.sscls(HtmlResponse('http://example.com', body=text))
+        self.assertEqual(sel.contenttype, 'html')
+        self.assertEqual(sel.xpath("//div").extract(),
                          [u'<div><img src="a.jpg"><p>Hello</p></div>'])
 
     def test_nested_selectors(self):
@@ -110,13 +110,13 @@ class SelectorTestCase(unittest.TestCase):
         <div id=1>not<span>me</span></div>
         <div class="dos"><p>text</p><a href='#'>foo</a></div>
         </body>'''
-        ss = self.sscls(text=body)
-        self.assertEqual(ss.xpath('//div[@id="1"]').css('span::text').extract(), [u'me'])
-        self.assertEqual(ss.css('#1').xpath('./span/text()').extract(), [u'me'])
+        sel = self.sscls(text=body)
+        self.assertEqual(sel.xpath('//div[@id="1"]').css('span::text').extract(), [u'me'])
+        self.assertEqual(sel.css('#1').xpath('./span/text()').extract(), [u'me'])
 
     def test_dont_strip(self):
-        hxs = self.sscls(text='<div>fff: <a href="#">zzz</a></div>')
-        self.assertEqual(hxs.xpath("//text()").extract(), [u'fff: ', u'zzz'])
+        sel = self.sscls(text='<div>fff: <a href="#">zzz</a></div>')
+        self.assertEqual(sel.xpath("//text()").extract(), [u'fff: ', u'zzz'])
 
     def test_namespaces_simple(self):
         body = """
@@ -279,10 +279,10 @@ class SelectorTestCase(unittest.TestCase):
         <link type="application/atom+xml">
         </feed>
         """
-        xxs = self.sscls(XmlResponse("http://example.com/feed.atom", body=xml))
-        self.assertEqual(len(xxs.xpath("//link")), 0)
-        xxs.remove_namespaces()
-        self.assertEqual(len(xxs.xpath("//link")), 2)
+        sel = self.sscls(XmlResponse("http://example.com/feed.atom", body=xml))
+        self.assertEqual(len(sel.xpath("//link")), 0)
+        sel.remove_namespaces()
+        self.assertEqual(len(sel.xpath("//link")), 2)
 
     def test_remove_attributes_namespaces(self):
         xml = """<?xml version="1.0" encoding="UTF-8"?>
@@ -291,10 +291,10 @@ class SelectorTestCase(unittest.TestCase):
         <link atom:type="application/atom+xml">
         </feed>
         """
-        xxs = self.sscls(XmlResponse("http://example.com/feed.atom", body=xml))
-        self.assertEqual(len(xxs.xpath("//link/@type")), 0)
-        xxs.remove_namespaces()
-        self.assertEqual(len(xxs.xpath("//link/@type")), 2)
+        sel = self.sscls(XmlResponse("http://example.com/feed.atom", body=xml))
+        self.assertEqual(len(sel.xpath("//link/@type")), 0)
+        sel.remove_namespaces()
+        self.assertEqual(len(sel.xpath("//link/@type")), 2)
 
 
 class DeprecatedXpathSelectorTest(unittest.TestCase):
@@ -120,16 +120,16 @@ class CSSSelectorTest(unittest.TestCase):
 
     def setUp(self):
         self.htmlresponse = HtmlResponse('http://example.com', body=HTMLBODY)
-        self.ss = self.sscls(self.htmlresponse)
+        self.sel = self.sscls(self.htmlresponse)
 
     def x(self, *a, **kw):
-        return [v.strip() for v in self.ss.css(*a, **kw).extract() if v.strip()]
+        return [v.strip() for v in self.sel.css(*a, **kw).extract() if v.strip()]
 
     def test_selector_simple(self):
-        for x in self.ss.css('input'):
-            self.assertTrue(isinstance(x, self.ss.__class__), x)
-        self.assertEqual(self.ss.css('input').extract(),
-                         [x.extract() for x in self.ss.css('input')])
+        for x in self.sel.css('input'):
+            self.assertTrue(isinstance(x, self.sel.__class__), x)
+        self.assertEqual(self.sel.css('input').extract(),
+                         [x.extract() for x in self.sel.css('input')])
 
     def test_text_pseudo_element(self):
         self.assertEqual(self.x('#p-b2'), [u'<b id="p-b2">guy</b>'])
@@ -147,7 +147,7 @@ class CSSSelectorTest(unittest.TestCase):
         self.assertEqual(self.x('map[name="dummymap"] ::attr(shape)'), [u'circle', u'default'])
 
     def test_nested_selector(self):
-        self.assertEqual(self.ss.css('p').css('b::text').extract(),
+        self.assertEqual(self.sel.css('p').css('b::text').extract(),
                          [u'hi', u'guy'])
-        self.assertEqual(self.ss.css('div').css('area:last-child').extract(),
+        self.assertEqual(self.sel.css('div').css('area:last-child').extract(),
                          [u'<area shape="default" id="area-nohref">'])
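Taken together, the shell and test hunks describe the user-visible contract: the shell binds a ready-made selector to ``sel``. A minimal sketch of that binding outside the shell (URL and markup are illustrative; the binding mirrors Shell.populate_vars above):

    from scrapy.http import HtmlResponse
    from scrapy.selector import Selector

    response = HtmlResponse('http://example.com', body='<html><h2>Welcome to Scrapy</h2></html>')
    sel = Selector(response)  # what the shell now exposes as `sel` (formerly `ss`)
    print sel.xpath('//h2/text()').extract()  # [u'Welcome to Scrapy']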