mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 06:43:43 +00:00

use sel name for Selector's instances in docs, internals and shell

Daniel Graña 2013-10-15 15:58:36 -02:00
parent 1abb1af0c6
commit 155ea08ea1
11 changed files with 95 additions and 95 deletions

View File

@@ -143,12 +143,12 @@ Finally, here's the spider code::
         rules = [Rule(SgmlLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]
 
         def parse_torrent(self, response):
-            ss = Selector(response)
+            sel = Selector(response)
             torrent = TorrentItem()
             torrent['url'] = response.url
-            torrent['name'] = ss.xpath("//h1/text()").extract()
-            torrent['description'] = ss.xpath("//div[@id='description']").extract()
-            torrent['size'] = ss.xpath("//div[@id='info-left']/p[2]/text()[2]").extract()
+            torrent['name'] = sel.xpath("//h1/text()").extract()
+            torrent['description'] = sel.xpath("//div[@id='description']").extract()
+            torrent['size'] = sel.xpath("//div[@id='info-left']/p[2]/text()[2]").extract()
             return torrent
 
 For brevity's sake, we intentionally left out the import statements. The
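
For reference, a minimal sketch of the omitted imports as they would look in this era of Scrapy; the spider base class and the item module path are assumptions, since neither appears in the hunk::

    # Sketch of the omitted imports; the item module path is hypothetical.
    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from scrapy.selector import Selector
    from myproject.items import TorrentItem  # hypothetical project module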

View File

@@ -255,7 +255,7 @@ This is what the shell looks like::
     [s] Available Scrapy objects:
     [s]   2010-08-19 21:45:59-0300 [default] INFO: Spider closed (finished)
-    [s]   ss         <Selector (http://www.dmoz.org/Computers/Programming/Languages/Python/Books/) xpath=None>
+    [s]   sel        <Selector (http://www.dmoz.org/Computers/Programming/Languages/Python/Books/) xpath=None>
     [s]   item       Item()
     [s]   request    <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
     [s]   response   <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
@@ -271,25 +271,25 @@ After the shell loads, you will have the response fetched in a local
 ``response`` variable, so if you type ``response.body`` you will see the body
 of the response, or you can type ``response.headers`` to see its headers.
 
-The shell also pre-instantiates a selector for this response in the variable ``ss``;
+The shell also pre-instantiates a selector for this response in the variable ``sel``;
 the selector automatically chooses the best parsing rules (XML vs HTML) based
 on the response's type.
 
 So let's try it::
 
-    In [1]: ss.xpath('//title')
+    In [1]: sel.xpath('//title')
     Out[1]: [<Selector (title) xpath=//title>]
 
-    In [2]: ss.xpath('//title').extract()
+    In [2]: sel.xpath('//title').extract()
     Out[2]: [u'<title>Open Directory - Computers: Programming: Languages: Python: Books</title>']
 
-    In [3]: ss.xpath('//title/text()')
+    In [3]: sel.xpath('//title/text()')
     Out[3]: [<Selector (text) xpath=//title/text()>]
 
-    In [4]: ss.xpath('//title/text()').extract()
+    In [4]: sel.xpath('//title/text()').extract()
     Out[4]: [u'Open Directory - Computers: Programming: Languages: Python: Books']
 
-    In [5]: ss.xpath('//title/text()').re('(\w+):')
+    In [5]: sel.xpath('//title/text()').re('(\w+):')
     Out[5]: [u'Computers', u'Programming', u'Languages', u'Python']
 
 Extracting the data
@@ -309,25 +309,25 @@ is inside a ``<ul>`` element, in fact the *second* ``<ul>`` element.
 So we can select each ``<li>`` element belonging to the sites list with this
 code::
 
-    ss.xpath('//ul/li')
+    sel.xpath('//ul/li')
 
 And from them, the sites descriptions::
 
-    ss.xpath('//ul/li/text()').extract()
+    sel.xpath('//ul/li/text()').extract()
 
 The sites titles::
 
-    ss.xpath('//ul/li/a/text()').extract()
+    sel.xpath('//ul/li/a/text()').extract()
 
 And the sites links::
 
-    ss.xpath('//ul/li/a/@href').extract()
+    sel.xpath('//ul/li/a/@href').extract()
 
 As we said before, each ``.xpath()`` call returns a list of selectors, so we can
 concatenate further ``.xpath()`` calls to dig deeper into a node. We are going to use
 that property here, so::
 
-    sites = ss.xpath('//ul/li')
+    sites = sel.xpath('//ul/li')
     for site in sites:
         title = site.xpath('a/text()').extract()
         link = site.xpath('a/@href').extract()
@@ -355,8 +355,8 @@ Let's add this code to our spider::
         ]
 
         def parse(self, response):
-            ss = Selector(response)
-            sites = ss.xpath('//ul/li')
+            sel = Selector(response)
+            sites = sel.xpath('//ul/li')
             for site in sites:
                 title = site.xpath('a/text()').extract()
                 link = site.xpath('a/@href').extract()
@@ -398,8 +398,8 @@ scraped so far, the final code for our Spider would be like this::
         ]
 
         def parse(self, response):
-            ss = Selector(response)
-            sites = ss.xpath('//ul/li')
+            sel = Selector(response)
+            sites = sel.xpath('//ul/li')
             items = []
             for site in sites:
                 item = DmozItem()
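
The hunk ends just after the item is instantiated. Based on the loop shown in the previous hunk, the final ``parse()`` presumably continues by filling the item fields and collecting the items, along these lines (the ``title``/``link``/``desc`` field names are assumptions taken from the surrounding tutorial, not from this diff)::

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//ul/li')
        items = []
        for site in sites:
            item = DmozItem()
            # Assumed field assignments, mirroring the loop shown earlier
            item['title'] = site.xpath('a/text()').extract()
            item['link'] = site.xpath('a/@href').extract()
            item['desc'] = site.xpath('text()').extract()
            items.append(item)
        return items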

View File

@@ -146,10 +146,10 @@ that have that grey colour of the links,
 Finally, we can write our ``parse_category()`` method::
 
     def parse_category(self, response):
-        ss = Selector(response)
+        sel = Selector(response)
 
         # The path to website links in directory page
-        links = ss.xpath('//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font')
+        links = sel.xpath('//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font')
 
         for link in links:
             item = DirectoryItem()

View File

@@ -62,13 +62,13 @@ body is what they're going to be "selecting"::
     class MySpider(BaseSpider):
         # ...
         def parse(self, response):
-            ss = Selector(response)
+            sel = Selector(response)
 
             # Using XPath query
-            print ss.xpath('//p')
+            print sel.xpath('//p')
 
             # Using CSS query
-            print ss.css('p')
+            print sel.css('p')
 
             # Nesting queries
-            print ss.xpath('//div[@foo="bar"]').css('span#bold')
+            print sel.xpath('//div[@foo="bar"]').css('span#bold')
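
Note that a selector does not have to wrap a ``Response``: as the test suite later in this diff does with ``self.sscls(text=body)``, a ``Selector`` can presumably also be built straight from a string of markup via the ``text`` keyword. A minimal sketch::

    >>> from scrapy.selector import Selector
    >>> body = '<html><body><span>good</span></body></html>'
    >>> Selector(text=body).xpath('//span/text()').extract()
    [u'good']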
Using selectors
@@ -94,7 +94,7 @@ First, let's open the shell::
     scrapy shell http://doc.scrapy.org/en/latest/_static/selectors-sample1.html
 
 Then, after the shell loads, you'll have a selector already instantiated and
-ready to use in the ``ss`` shell variable.
+ready to use in the ``sel`` shell variable.
 
 Since we're dealing with HTML, the selector will automatically use an HTML parser.
@@ -104,7 +104,7 @@ So, by looking at the :ref:`HTML code <topics-selectors-htmlcode>` of that
 page, let's construct an XPath (using an HTML selector) for selecting the text
 inside the title tag::
 
-    >>> ss.xpath('//title/text()')
+    >>> sel.xpath('//title/text()')
     [<Selector (text) xpath=//title/text()>]
 
 As you can see, the ``.xpath()`` method returns an
@@ -114,45 +114,45 @@ selectors. This API can be used quickly for extracting nested data.
 To actually extract the textual data, you must call the selector ``.extract()``
 method, as follows::
 
-    >>> ss.xpath('//title/text()').extract()
+    >>> sel.xpath('//title/text()').extract()
     [u'Example website']
 
 Notice that CSS selectors can select text or attribute nodes using CSS3
 pseudo-elements::
 
-    >>> ss.css('title::text').extract()
+    >>> sel.css('title::text').extract()
     [u'Example website']
 
 Now we're going to get the base URL and some image links::
 
-    >>> ss.xpath('//base/@href').extract()
+    >>> sel.xpath('//base/@href').extract()
     [u'http://example.com/']
 
-    >>> ss.css('base::attr(href)').extract()
+    >>> sel.css('base::attr(href)').extract()
     [u'http://example.com/']
 
-    >>> ss.xpath('//a[contains(@href, "image")]/@href').extract()
+    >>> sel.xpath('//a[contains(@href, "image")]/@href').extract()
     [u'image1.html',
      u'image2.html',
     u'image3.html',
     u'image4.html',
     u'image5.html']
 
-    >>> ss.css('a[href*=image]::attr(href)').extract()
+    >>> sel.css('a[href*=image]::attr(href)').extract()
     [u'image1.html',
     u'image2.html',
     u'image3.html',
     u'image4.html',
     u'image5.html']
 
-    >>> ss.xpath('//a[contains(@href, "image")]/img/@src').extract()
+    >>> sel.xpath('//a[contains(@href, "image")]/img/@src').extract()
     [u'image1_thumb.jpg',
     u'image2_thumb.jpg',
    u'image3_thumb.jpg',
    u'image4_thumb.jpg',
    u'image5_thumb.jpg']
 
-    >>> ss.css('a[href*=image] img::attr(src)').extract()
+    >>> sel.css('a[href*=image] img::attr(src)').extract()
     [u'image1_thumb.jpg',
     u'image2_thumb.jpg',
    u'image3_thumb.jpg',
@@ -168,7 +168,7 @@ The selection methods (``.xpath()`` or ``.css()``) return a list of selectors
 of the same type, so you can call the selection methods for those selectors
 too. Here's an example::
 
-    >>> links = ss.xpath('//a[contains(@href, "image")]')
+    >>> links = sel.xpath('//a[contains(@href, "image")]')
     >>> links.extract()
     [u'<a href="image1.html">Name: My image 1 <br><img src="image1_thumb.jpg"></a>',
     u'<a href="image2.html">Name: My image 2 <br><img src="image2_thumb.jpg"></a>',
@@ -197,7 +197,7 @@ can't construct nested ``.re()`` calls.
 Here's an example used to extract image names from the :ref:`HTML code
 <topics-selectors-htmlcode>` above::
 
-    >>> ss.xpath('//a[contains(@href, "image")]/text()').re(r'Name:\s*(.*)')
+    >>> sel.xpath('//a[contains(@href, "image")]/text()').re(r'Name:\s*(.*)')
     [u'My image 1',
     u'My image 2',
    u'My image 3',
@@ -216,7 +216,7 @@ with ``/``, that XPath will be absolute to the document and not relative to the
 For example, suppose you want to extract all ``<p>`` elements inside ``<div>``
 elements. First, you would get all ``<div>`` elements::
 
-    >>> divs = ss.xpath('//div')
+    >>> divs = sel.xpath('//div')
 
 At first, you may be tempted to use the following approach, which is wrong, as
 it actually extracts all ``<p>`` elements from the document, not only those
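
The hunk cuts off mid-explanation; the docs presumably go on to contrast the wrong absolute query with a relative one. A sketch of that contrast, using the ``divs`` variable from above::

    >>> divs.xpath('//p')   # wrong: absolute, selects every <p> in the document
    >>> divs.xpath('.//p')  # relative: selects only <p> elements inside each <div>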

View File

@@ -83,7 +83,7 @@ Those objects are:
 * ``response`` - a :class:`~scrapy.http.Response` object containing the last
   fetched page
 
-* ``ss`` - a :class:`~scrapy.selector.Selector` object constructed
+* ``sel`` - a :class:`~scrapy.selector.Selector` object constructed
   with the last response fetched
 
 * ``settings`` - the current :ref:`Scrapy settings <topics-settings>`
@@ -111,7 +111,7 @@ list of available objects and useful shortcuts (you'll notice that these lines
 all start with the ``[s]`` prefix)::
 
     [s] Available objects
-    [s]   ss         <Selector (http://scrapy.org) xpath=None>
+    [s]   sel        <Selector (http://scrapy.org) xpath=None>
     [s]   item       Item()
     [s]   request    <http://scrapy.org>
     [s]   response   <http://scrapy.org>
@@ -126,12 +126,12 @@ all start with the ``[s]`` prefix)::
 
 After that, we can start playing with the objects::
 
-    >>> ss.xpath("//h2/text()").extract()[0]
+    >>> sel.xpath("//h2/text()").extract()[0]
     u'Welcome to Scrapy'
 
     >>> fetch("http://slashdot.org")
     [s] Available Scrapy objects:
-    [s]   ss         <Selector (http://slashdot.org) xpath=None>
+    [s]   sel        <Selector (http://slashdot.org) xpath=None>
     [s]   item       JobItem()
     [s]   request    <GET http://slashdot.org>
     [s]   response   <200 http://slashdot.org>
@@ -142,7 +142,7 @@ After that, we can start playing with the objects::
     [s]   fetch(req_or_url) Fetch request (or URL) and update local objects
     [s]   view(response)    View response in a browser
 
-    >>> ss.xpath("//h2/text()").extract()
+    >>> sel.xpath("//h2/text()").extract()
     [u'News for nerds, stuff that matters']
 
     >>> request = request.replace(method="POST")
@@ -180,7 +180,7 @@ When you run the spider, you will get something similar to this::
     2009-08-27 19:15:25-0300 [example.com] DEBUG: Crawled <http://www.example.com/> (referer: <None>)
     2009-08-27 19:15:26-0300 [example.com] DEBUG: Crawled <http://www.example.com/products.php> (referer: <http://www.example.com/>)
     [s] Available objects
-    [s]   ss         <Selector (http://www.example.com/products.php) xpath=None>
+    [s]   sel        <Selector (http://www.example.com/products.php) xpath=None>
     ...
 
     >>> response.url
@@ -188,7 +188,7 @@ When you run the spider, you will get something similar to this::
 Then, you can check if the extraction code is working::
 
-    >>> ss.xpath('//h1')
+    >>> sel.xpath('//h1')
     []
 
 Nope, it doesn't. So you can open the response in your web browser and see if
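
The context ends mid-sentence here; the ``view(response)`` shortcut listed in the earlier shell banner is the tool for exactly this check. A minimal sketch of its use::

    >>> view(response)   # opens the fetched response in the local web browser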

View File

@@ -231,11 +231,11 @@ Another example returning multiple Requests and Items from a single callback::
         ]
 
         def parse(self, response):
-            ss = Selector(response)
-            for h3 in ss.xpath('//h3').extract():
+            sel = Selector(response)
+            for h3 in sel.xpath('//h3').extract():
                 yield MyItem(title=h3)
 
-            for url in ss.xpath('//a/@href').extract():
+            for url in sel.xpath('//a/@href').extract():
                 yield Request(url, callback=self.parse)
 
 .. module:: scrapy.contrib.spiders
@@ -334,11 +334,11 @@ Let's now take a look at an example CrawlSpider with rules::
         def parse_item(self, response):
             self.log('Hi, this is an item page! %s' % response.url)
 
-            ss = Selector(response)
+            sel = Selector(response)
             item = Item()
-            item['id'] = ss.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
-            item['name'] = ss.xpath('//td[@id="item_name"]/text()').extract()
-            item['description'] = ss.xpath('//td[@id="item_description"]/text()').extract()
+            item['id'] = sel.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
+            item['name'] = sel.xpath('//td[@id="item_name"]/text()').extract()
+            item['description'] = sel.xpath('//td[@id="item_description"]/text()').extract()
             return item
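
Only the ``parse_item()`` half of the CrawlSpider example made it into this hunk; the ``rules`` attribute it refers to would sit above it in the class. A hedged sketch of what such rules typically look like (the URL patterns are illustrative, not from the diff)::

    rules = (
        # Follow links matching 'category.php' (no callback means follow the links)
        Rule(SgmlLinkExtractor(allow=('category\.php', ))),
        # Parse links matching 'item.php' with the spider's parse_item method
        Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
    )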

View File

@@ -116,11 +116,11 @@ class SgmlLinkExtractor(BaseSgmlLinkExtractor):
     def extract_links(self, response):
         base_url = None
         if self.restrict_xpaths:
-            ss = Selector(response)
+            sel = Selector(response)
             base_url = get_base_url(response)
             body = u''.join(f
                             for x in self.restrict_xpaths
-                            for f in ss.xpath(x).extract()
+                            for f in sel.xpath(x).extract()
                             ).encode(response.encoding)
         else:
             body = response.body
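
For context, ``restrict_xpaths`` is what triggers the ``Selector`` branch above: it narrows link extraction to the page regions matched by the given XPaths. A usage sketch (the XPath and the ``response`` object are illustrative)::

    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

    # Extract links only from the matched region of the page
    lx = SgmlLinkExtractor(restrict_xpaths=('//div[@id="content"]', ))
    links = lx.extract_links(response)  # response comes from a spider callback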

View File

@@ -17,7 +17,7 @@ from scrapy.utils.misc import load_object
 from scrapy.utils.response import open_in_browser
 from scrapy.utils.console import start_python_console
 from scrapy.settings import Settings
-from scrapy.http import Request, Response, HtmlResponse, XmlResponse
+from scrapy.http import Request, Response
 from scrapy.exceptions import IgnoreRequest
@@ -95,7 +95,7 @@ class Shell(object):
         self.vars['spider'] = spider
         self.vars['request'] = request
         self.vars['response'] = response
-        self.vars['ss'] = Selector(response)
+        self.vars['sel'] = Selector(response)
         if self.inthread:
             self.vars['fetch'] = self.fetch
             self.vars['view'] = open_in_browser

View File

@@ -31,7 +31,7 @@ class ShellTest(ProcessTest, SiteTest, unittest.TestCase):
 
     @defer.inlineCallbacks
     def test_response_selector_html(self):
-        xpath = 'ss.xpath("//p[@class=\'one\']/text()").extract()[0]'
+        xpath = 'sel.xpath("//p[@class=\'one\']/text()").extract()[0]'
         _, out, _ = yield self.execute([self.url('/html'), '-c', xpath])
         self.assertEqual(out.strip(), 'Works')

View File

@@ -16,31 +16,31 @@ class SelectorTestCase(unittest.TestCase):
         """Simple selector tests"""
         body = "<p><input name='a'value='1'/><input name='b'value='2'/></p>"
         response = TextResponse(url="http://example.com", body=body)
-        ss = self.sscls(response)
+        sel = self.sscls(response)
 
-        xl = ss.xpath('//input')
+        xl = sel.xpath('//input')
         self.assertEqual(2, len(xl))
         for x in xl:
             assert isinstance(x, self.sscls)
 
-        self.assertEqual(ss.xpath('//input').extract(),
-                         [x.extract() for x in ss.xpath('//input')])
+        self.assertEqual(sel.xpath('//input').extract(),
+                         [x.extract() for x in sel.xpath('//input')])
 
-        self.assertEqual([x.extract() for x in ss.xpath("//input[@name='a']/@name")],
+        self.assertEqual([x.extract() for x in sel.xpath("//input[@name='a']/@name")],
                          [u'a'])
-        self.assertEqual([x.extract() for x in ss.xpath("number(concat(//input[@name='a']/@value, //input[@name='b']/@value))")],
+        self.assertEqual([x.extract() for x in sel.xpath("number(concat(//input[@name='a']/@value, //input[@name='b']/@value))")],
                          [u'12.0'])
-        self.assertEqual(ss.xpath("concat('xpath', 'rules')").extract(),
+        self.assertEqual(sel.xpath("concat('xpath', 'rules')").extract(),
                          [u'xpathrules'])
-        self.assertEqual([x.extract() for x in ss.xpath("concat(//input[@name='a']/@value, //input[@name='b']/@value)")],
+        self.assertEqual([x.extract() for x in sel.xpath("concat(//input[@name='a']/@value, //input[@name='b']/@value)")],
                          [u'12'])
 
     def test_select_unicode_query(self):
         body = u"<p><input name='\xa9' value='1'/></p>"
         response = TextResponse(url="http://example.com", body=body, encoding='utf8')
-        ss = self.sscls(response)
-        self.assertEqual(ss.xpath(u'//input[@name="\xa9"]/@value').extract(), [u'1'])
+        sel = self.sscls(response)
+        self.assertEqual(sel.xpath(u'//input[@name="\xa9"]/@value').extract(), [u'1'])
 
     def test_list_elements_type(self):
         """Test Selector returning the same type in selection methods"""
@@ -69,14 +69,14 @@ class SelectorTestCase(unittest.TestCase):
 
     def test_flavor_detection(self):
         text = '<div><img src="a.jpg"><p>Hello</div>'
-        ss = self.sscls(XmlResponse('http://example.com', body=text))
-        self.assertEqual(ss.contenttype, 'xml')
-        self.assertEqual(ss.xpath("//div").extract(),
+        sel = self.sscls(XmlResponse('http://example.com', body=text))
+        self.assertEqual(sel.contenttype, 'xml')
+        self.assertEqual(sel.xpath("//div").extract(),
                          [u'<div><img src="a.jpg"><p>Hello</p></img></div>'])
 
-        ss = self.sscls(HtmlResponse('http://example.com', body=text))
-        self.assertEqual(ss.contenttype, 'html')
-        self.assertEqual(ss.xpath("//div").extract(),
+        sel = self.sscls(HtmlResponse('http://example.com', body=text))
+        self.assertEqual(sel.contenttype, 'html')
+        self.assertEqual(sel.xpath("//div").extract(),
                          [u'<div><img src="a.jpg"><p>Hello</p></div>'])
 
     def test_nested_selectors(self):
@@ -110,13 +110,13 @@ class SelectorTestCase(unittest.TestCase):
                   <div id=1>not<span>me</span></div>
                   <div class="dos"><p>text</p><a href='#'>foo</a></div>
                </body>'''
-        ss = self.sscls(text=body)
-        self.assertEqual(ss.xpath('//div[@id="1"]').css('span::text').extract(), [u'me'])
-        self.assertEqual(ss.css('#1').xpath('./span/text()').extract(), [u'me'])
+        sel = self.sscls(text=body)
+        self.assertEqual(sel.xpath('//div[@id="1"]').css('span::text').extract(), [u'me'])
+        self.assertEqual(sel.css('#1').xpath('./span/text()').extract(), [u'me'])
 
     def test_dont_strip(self):
-        hxs = self.sscls(text='<div>fff: <a href="#">zzz</a></div>')
-        self.assertEqual(hxs.xpath("//text()").extract(), [u'fff: ', u'zzz'])
+        sel = self.sscls(text='<div>fff: <a href="#">zzz</a></div>')
+        self.assertEqual(sel.xpath("//text()").extract(), [u'fff: ', u'zzz'])
 
     def test_namespaces_simple(self):
         body = """
@@ -279,10 +279,10 @@ class SelectorTestCase(unittest.TestCase):
             <link type="application/atom+xml">
         </feed>
         """
-        xxs = self.sscls(XmlResponse("http://example.com/feed.atom", body=xml))
-        self.assertEqual(len(xxs.xpath("//link")), 0)
-        xxs.remove_namespaces()
-        self.assertEqual(len(xxs.xpath("//link")), 2)
+        sel = self.sscls(XmlResponse("http://example.com/feed.atom", body=xml))
+        self.assertEqual(len(sel.xpath("//link")), 0)
+        sel.remove_namespaces()
+        self.assertEqual(len(sel.xpath("//link")), 2)
 
     def test_remove_attributes_namespaces(self):
         xml = """<?xml version="1.0" encoding="UTF-8"?>
@@ -291,10 +291,10 @@ class SelectorTestCase(unittest.TestCase):
             <link atom:type="application/atom+xml">
         </feed>
         """
-        xxs = self.sscls(XmlResponse("http://example.com/feed.atom", body=xml))
-        self.assertEqual(len(xxs.xpath("//link/@type")), 0)
-        xxs.remove_namespaces()
-        self.assertEqual(len(xxs.xpath("//link/@type")), 2)
+        sel = self.sscls(XmlResponse("http://example.com/feed.atom", body=xml))
+        self.assertEqual(len(sel.xpath("//link/@type")), 0)
+        sel.remove_namespaces()
+        self.assertEqual(len(sel.xpath("//link/@type")), 2)
 
 
 class DeprecatedXpathSelectorTest(unittest.TestCase):

View File

@@ -120,16 +120,16 @@ class CSSSelectorTest(unittest.TestCase):
 
     def setUp(self):
         self.htmlresponse = HtmlResponse('http://example.com', body=HTMLBODY)
-        self.ss = self.sscls(self.htmlresponse)
+        self.sel = self.sscls(self.htmlresponse)
 
     def x(self, *a, **kw):
-        return [v.strip() for v in self.ss.css(*a, **kw).extract() if v.strip()]
+        return [v.strip() for v in self.sel.css(*a, **kw).extract() if v.strip()]
 
     def test_selector_simple(self):
-        for x in self.ss.css('input'):
-            self.assertTrue(isinstance(x, self.ss.__class__), x)
-        self.assertEqual(self.ss.css('input').extract(),
-                         [x.extract() for x in self.ss.css('input')])
+        for x in self.sel.css('input'):
+            self.assertTrue(isinstance(x, self.sel.__class__), x)
+        self.assertEqual(self.sel.css('input').extract(),
+                         [x.extract() for x in self.sel.css('input')])
 
     def test_text_pseudo_element(self):
         self.assertEqual(self.x('#p-b2'), [u'<b id="p-b2">guy</b>'])
@@ -147,7 +147,7 @@ class CSSSelectorTest(unittest.TestCase):
         self.assertEqual(self.x('map[name="dummymap"] ::attr(shape)'), [u'circle', u'default'])
 
     def test_nested_selector(self):
-        self.assertEqual(self.ss.css('p').css('b::text').extract(),
+        self.assertEqual(self.sel.css('p').css('b::text').extract(),
                          [u'hi', u'guy'])
-        self.assertEqual(self.ss.css('div').css('area:last-child').extract(),
+        self.assertEqual(self.sel.css('div').css('area:last-child').extract(),
                          [u'<area shape="default" id="area-nohref">'])