diff --git a/docs/intro/overview.rst b/docs/intro/overview.rst
index 2138e76f8..588f33755 100644
--- a/docs/intro/overview.rst
+++ b/docs/intro/overview.rst
@@ -143,12 +143,12 @@ Finally, here's the spider code::
         rules = [Rule(SgmlLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]
 
         def parse_torrent(self, response):
-            ss = Selector(response)
+            sel = Selector(response)
             torrent = TorrentItem()
             torrent['url'] = response.url
-            torrent['name'] = ss.xpath("//h1/text()").extract()
-            torrent['description'] = ss.xpath("//div[@id='description']").extract()
-            torrent['size'] = ss.xpath("//div[@id='info-left']/p[2]/text()[2]").extract()
+            torrent['name'] = sel.xpath("//h1/text()").extract()
+            torrent['description'] = sel.xpath("//div[@id='description']").extract()
+            torrent['size'] = sel.xpath("//div[@id='info-left']/p[2]/text()[2]").extract()
             return torrent
 
 For brevity's sake, we intentionally left out the import statements. The
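The renamed ``sel`` convention above is easy to exercise outside a full spider. A minimal sketch (editor's illustration, not part of the patch; the markup is invented)::

    from scrapy.selector import Selector

    # Hypothetical stand-in for a torrent page; only the structure matters.
    html = """
    <h1>Sample torrent</h1>
    <div id='description'>A made-up description.</div>
    """

    sel = Selector(text=html)
    print sel.xpath("//h1/text()").extract()              # [u'Sample torrent']
    print sel.xpath("//div[@id='description']").extract()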
diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst
index c147d0497..fa1909479 100644
--- a/docs/intro/tutorial.rst
+++ b/docs/intro/tutorial.rst
@@ -255,7 +255,7 @@ This is what the shell looks like::
 
     [s] Available Scrapy objects:
     [s] 2010-08-19 21:45:59-0300 [default] INFO: Spider closed (finished)
-    [s]   ss         <Selector (http://www.dmoz.org/Computers/Programming/Languages/Python/Books/) xpath=None>
+    [s]   sel        <Selector (http://www.dmoz.org/Computers/Programming/Languages/Python/Books/) xpath=None>
     [s]   item       Item()
     [s]   request    <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
     [s]   response   <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
@@ -271,25 +271,25 @@ After the shell loads, you will have the response fetched in a local
 ``response`` variable, so if you type ``response.body`` you will see the body
 of the response, or you can type ``response.headers`` to see its headers.
 
-The shell also pre-instantiate a selector for this response in variable ``ss``,
+The shell also pre-instantiates a selector for this response in the variable ``sel``;
 the selector automatically chooses the best parsing rules (XML vs HTML) based
 on response's type.
 
 So let's try it::
 
-    In [1]: ss.xpath('//title')
+    In [1]: sel.xpath('//title')
     Out[1]: [<Selector (title) xpath=//title>]
 
-    In [2]: ss.xpath('//title').extract()
+    In [2]: sel.xpath('//title').extract()
     Out[2]: [u'<title>Open Directory - Computers: Programming: Languages: Python: Books</title>']
 
-    In [3]: ss.xpath('//title/text()')
+    In [3]: sel.xpath('//title/text()')
     Out[3]: [<Selector (text) xpath=//title/text()>]
 
-    In [4]: ss.xpath('//title/text()').extract()
+    In [4]: sel.xpath('//title/text()').extract()
     Out[4]: [u'Open Directory - Computers: Programming: Languages: Python: Books']
 
-    In [5]: ss.xpath('//title/text()').re('(\w+):')
+    In [5]: sel.xpath('//title/text()').re('(\w+):')
     Out[5]: [u'Computers', u'Programming', u'Languages', u'Python']
 
 Extracting the data
@@ -309,25 +309,25 @@ is inside a ``<ul>`` element, in fact the *second* ``<ul>``
 element. So we can select each ``<li>`` element belonging to the sites list
 with this code::
 
-    ss.xpath('//ul/li')
+    sel.xpath('//ul/li')
 
 And from them, the sites descriptions::
 
-    ss.xpath('//ul/li/text()').extract()
+    sel.xpath('//ul/li/text()').extract()
 
 The sites titles::
 
-    ss.xpath('//ul/li/a/text()').extract()
+    sel.xpath('//ul/li/a/text()').extract()
 
 And the sites links::
 
-    ss.xpath('//ul/li/a/@href').extract()
+    sel.xpath('//ul/li/a/@href').extract()
 
 As we said before, each ``.xpath()`` call returns a list of selectors, so we
 can concatenate further ``.xpath()`` calls to dig deeper into a node. We are
 going to use that property here, so::
 
-    sites = ss.xpath('//ul/li')
+    sites = sel.xpath('//ul/li')
     for site in sites:
         title = site.xpath('a/text()').extract()
         link = site.xpath('a/@href').extract()
@@ -355,8 +355,8 @@ Let's add this code to our spider::
        ]
 
        def parse(self, response):
-           ss = Selector(response)
-           sites = ss.xpath('//ul/li')
+           sel = Selector(response)
+           sites = sel.xpath('//ul/li')
            for site in sites:
                title = site.xpath('a/text()').extract()
                link = site.xpath('a/@href').extract()
@@ -398,8 +398,8 @@ scraped so far, the final code for our Spider would be like this::
        ]
 
        def parse(self, response):
-           ss = Selector(response)
-           sites = ss.xpath('//ul/li')
+           sel = Selector(response)
+           sites = sel.xpath('//ul/li')
            items = []
            for site in sites:
                item = DmozItem()
diff --git a/docs/topics/firebug.rst b/docs/topics/firebug.rst
index 839701a32..5aa03a65e 100644
--- a/docs/topics/firebug.rst
+++ b/docs/topics/firebug.rst
@@ -146,10 +146,10 @@ that have that grey colour of the links,
 Finally, we can write our ``parse_category()`` method::
 
     def parse_category(self, response):
-        ss = Selector(response)
+        sel = Selector(response)
 
         # The path to website links in directory page
-        links = ss.xpath('//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font')
+        links = sel.xpath('//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font')
 
         for link in links:
             item = DirectoryItem()
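The tutorial's ``parse()`` loop above is also easy to try against static markup. A hedged sketch (editor's illustration; the HTML is invented)::

    from scrapy.selector import Selector

    body = """
    <ul><!-- stand-in for the dmoz sites list -->
      <li><a href="http://example.org/a">Site A</a> - first description</li>
      <li><a href="http://example.org/b">Site B</a> - second description</li>
    </ul>
    """

    sel = Selector(text=body)
    for site in sel.xpath('//ul/li'):
        # Relative expressions (no leading //) search within each <li> only.
        print site.xpath('a/text()').extract(), site.xpath('a/@href').extract()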
diff --git a/docs/topics/selectors.rst b/docs/topics/selectors.rst
index b9b185d4c..fd986d0cd 100644
--- a/docs/topics/selectors.rst
+++ b/docs/topics/selectors.rst
@@ -62,13 +62,13 @@ body is what they're going to be "selecting"::
     class MySpider(BaseSpider):
         # ...
         def parse(self, response):
-            ss = Selector(response)
+            sel = Selector(response)
             # Using XPath query
-            print ss.xpath('//p')
+            print sel.xpath('//p')
             # Using CSS query
-            print ss.css('p')
+            print sel.css('p')
             # Nesting queries
-            print ss.xpath('//div[@foo="bar"]').css('span#bold')
+            print sel.xpath('//div[@foo="bar"]').css('span#bold')
 
 
 Using selectors
@@ -94,7 +94,7 @@ First, let's open the shell::
 
     scrapy shell http://doc.scrapy.org/en/latest/_static/selectors-sample1.html
 
 Then, after the shell loads, you'll have a selector already instantiated and
-ready to use in ``ss`` shell variable.
+ready to use in the ``sel`` shell variable.
 
 Since we're dealing with HTML, the selector will automatically use an HTML
 parser.
 
@@ -104,7 +104,7 @@ So, by looking at the :ref:`HTML code <topics-selectors-htmlcode>` of that page,
 let's construct an XPath (using an HTML selector) for selecting the text inside
 the title tag::
 
-    >>> ss.xpath('//title/text()')
+    >>> sel.xpath('//title/text()')
     [<Selector (text) xpath=//title/text()>]
 
 As you can see, the ``.xpath()`` method returns an
@@ -114,45 +114,45 @@ selectors. This API can be used quickly for extracting nested data.
 
 To actually extract the textual data, you must call the selector ``.extract()``
 method, as follows::
 
-    >>> ss.xpath('//title/text()').extract()
+    >>> sel.xpath('//title/text()').extract()
     [u'Example website']
 
 Notice that CSS selectors can select text or attribute nodes using CSS3
 pseudo-elements::
 
-    >>> ss.css('title::text').extract()
+    >>> sel.css('title::text').extract()
     [u'Example website']
 
 Now we're going to get the base URL and some image links::
 
-    >>> ss.xpath('//base/@href').extract()
+    >>> sel.xpath('//base/@href').extract()
     [u'http://example.com/']
 
-    >>> ss.css('base::attr(href)').extract()
+    >>> sel.css('base::attr(href)').extract()
     [u'http://example.com/']
 
-    >>> ss.xpath('//a[contains(@href, "image")]/@href').extract()
+    >>> sel.xpath('//a[contains(@href, "image")]/@href').extract()
     [u'image1.html',
      u'image2.html',
     u'image3.html',
     u'image4.html',
    u'image5.html']
 
-    >>> ss.css('a[href*=image]::attr(href)').extract()
+    >>> sel.css('a[href*=image]::attr(href)').extract()
    [u'image1.html',
     u'image2.html',
     u'image3.html',
     u'image4.html',
    u'image5.html']
 
-    >>> ss.xpath('//a[contains(@href, "image")]/img/@src').extract()
+    >>> sel.xpath('//a[contains(@href, "image")]/img/@src').extract()
    [u'image1_thumb.jpg',
     u'image2_thumb.jpg',
     u'image3_thumb.jpg',
     u'image4_thumb.jpg',
    u'image5_thumb.jpg']
 
-    >>> ss.css('a[href*=image] img::attr(src)').extract()
+    >>> sel.css('a[href*=image] img::attr(src)').extract()
    [u'image1_thumb.jpg',
     u'image2_thumb.jpg',
     u'image3_thumb.jpg',
@@ -168,7 +168,7 @@ The selection methods (``.xpath()`` or ``.css()``) returns a list of selectors
 of the same type, so you can call the selection methods for those selectors
 too. Here's an example::
 
-    >>> links = ss.xpath('//a[contains(@href, "image")]')
+    >>> links = sel.xpath('//a[contains(@href, "image")]')
     >>> links.extract()
     [u'<a href="image1.html">Name: My image 1 <br><img src="image1_thumb.jpg"></a>',
      u'<a href="image2.html">Name: My image 2 <br><img src="image2_thumb.jpg"></a>',
@@ -197,7 +197,7 @@ can't construct nested ``.re()`` calls.
 
 Here's an example used to extract images names from the :ref:`HTML code
 <topics-selectors-htmlcode>` above::
 
-    >>> ss.xpath('//a[contains(@href, "image")]/text()').re(r'Name:\s*(.*)')
+    >>> sel.xpath('//a[contains(@href, "image")]/text()').re(r'Name:\s*(.*)')
     [u'My image 1',
      u'My image 2',
      u'My image 3',
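The ``.re()`` shortcut in the hunk above is worth a standalone try; a minimal sketch (editor's illustration, single invented link)::

    from scrapy.selector import Selector

    sel = Selector(text='<a href="image1.html">Name: My image 1</a>')
    # .re() applies the regex to each extracted string and returns the
    # captured groups, so the result is a list of strings, not selectors.
    print sel.xpath('//a/text()').re(r'Name:\s*(.*)')   # [u'My image 1']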
@@ -216,7 +216,7 @@ with ``/``, that XPath will be absolute to the document and not relative to the
 
 For example, suppose you want to extract all ``<p>`` elements inside ``<div>``
 elements. First, you would get all ``<div>`` elements::
 
-    >>> divs = ss.xpath('//div')
+    >>> divs = sel.xpath('//div')
 
 At first, you may be tempted to use the following approach, which is wrong, as
 it actually extracts all ``<p>`` elements from the document, not only those
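The hunk above truncates mid-warning; the pitfall it describes (an absolute XPath run from a nested selector) is easy to demonstrate. A hedged sketch, editor's illustration::

    from scrapy.selector import Selector

    sel = Selector(text='<div><p>inside</p></div><p>outside</p>')
    divs = sel.xpath('//div')
    print divs.xpath('//p').extract()   # wrong: absolute, matches every <p>
    print divs.xpath('.//p').extract()  # right: relative, [u'<p>inside</p>'] only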
diff --git a/docs/topics/shell.rst b/docs/topics/shell.rst
index 589142aa6..561e299f4 100644
--- a/docs/topics/shell.rst
+++ b/docs/topics/shell.rst
@@ -83,7 +83,7 @@ Those objects are:
    * ``response`` - a :class:`~scrapy.http.Response` object containing the last
      fetched page
 
-   * ``ss`` - a :class:`~scrapy.selector.Selector` object constructed
+   * ``sel`` - a :class:`~scrapy.selector.Selector` object constructed
      with the last response fetched
 
    * ``settings`` - the current :ref:`Scrapy settings <topics-settings>`
@@ -111,7 +111,7 @@ list of available objects and useful shortcuts (you'll notice that these lines
 all start with the ``[s]`` prefix)::
 
     [s] Available objects
-    [s]   ss         <Selector (http://scrapy.org) xpath=None>
+    [s]   sel        <Selector (http://scrapy.org) xpath=None>
     [s]   item       Item()
     [s]   request    <GET http://scrapy.org>
     [s]   response   <200 http://scrapy.org>
@@ -126,12 +126,12 @@ all start with the ``[s]`` prefix)::
 
 After that, we can star playing with the objects::
 
-    >>> ss.xpath("//h2/text()").extract()[0]
+    >>> sel.xpath("//h2/text()").extract()[0]
     u'Welcome to Scrapy'
 
     >>> fetch("http://slashdot.org")
     [s] Available Scrapy objects:
-    [s]   ss         <Selector (http://slashdot.org) xpath=None>
+    [s]   sel        <Selector (http://slashdot.org) xpath=None>
     [s]   item       JobItem()
     [s]   request    <GET http://slashdot.org>
     [s]   response   <200 http://slashdot.org>
@@ -142,7 +142,7 @@ After that, we can star playing with the objects::
     [s]   fetch(req_or_url) Fetch request (or URL) and update local objects
     [s]   view(response)    View response in a browser
 
-    >>> ss.xpath("//h2/text()").extract()
+    >>> sel.xpath("//h2/text()").extract()
     [u'News for nerds, stuff that matters']
 
     >>> request = request.replace(method="POST")
@@ -180,7 +180,7 @@ When you run the spider, you will get something similar to this::
     2009-08-27 19:15:25-0300 [example.com] DEBUG: Crawled <http://www.example.com> (referer: <None>)
     2009-08-27 19:15:26-0300 [example.com] DEBUG: Crawled <http://www.example.com/products.php> (referer: <http://www.example.com>)
 
     [s] Available objects
-    [s]   ss ...
+    [s]   sel ...
 
     >>> response.url
@@ -188,7 +188,7 @@ When you run the spider, you will get something similar to this::
 
 Then, you can check if the extraction code is working::
 
-    >>> ss.xpath('//h1')
+    >>> sel.xpath('//h1')
     []
 
 Nope, it doesn't. So you can open the response in your web browser and see if
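The renamed ``sel`` object is also what shell one-liners see; mirroring the command-shell test further down in this patch, something like this should work (URL illustrative)::

    scrapy shell http://scrapy.org -c 'sel.xpath("//h2/text()").extract()[0]'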
diff --git a/docs/topics/spiders.rst b/docs/topics/spiders.rst
index 28393dfe2..dfbdcea18 100644
--- a/docs/topics/spiders.rst
+++ b/docs/topics/spiders.rst
@@ -231,11 +231,11 @@ Another example returning multiples Requests and Items from a single callback::
        ]
 
        def parse(self, response):
-           ss = Selector(response)
-           for h3 in ss.xpath('//h3').extract():
+           sel = Selector(response)
+           for h3 in sel.xpath('//h3').extract():
                yield MyItem(title=h3)
 
-           for url in ss.xpath('//a/@href').extract():
+           for url in sel.xpath('//a/@href').extract():
                yield Request(url, callback=self.parse)
 
 .. module:: scrapy.contrib.spiders
@@ -334,11 +334,11 @@ Let's now take a look at an example CrawlSpider with rules::
 
        def parse_item(self, response):
            self.log('Hi, this is an item page! %s' % response.url)
 
-           ss = Selector(response)
+           sel = Selector(response)
            item = Item()
-           item['id'] = ss.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
-           item['name'] = ss.xpath('//td[@id="item_name"]/text()').extract()
-           item['description'] = ss.xpath('//td[@id="item_description"]/text()').extract()
+           item['id'] = sel.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
+           item['name'] = sel.xpath('//td[@id="item_name"]/text()').extract()
+           item['description'] = sel.xpath('//td[@id="item_description"]/text()').extract()
            return item
 
diff --git a/scrapy/contrib/linkextractors/sgml.py b/scrapy/contrib/linkextractors/sgml.py
index f0e141a43..d8f6ae4ec 100644
--- a/scrapy/contrib/linkextractors/sgml.py
+++ b/scrapy/contrib/linkextractors/sgml.py
@@ -116,11 +116,11 @@ class SgmlLinkExtractor(BaseSgmlLinkExtractor):
 
     def extract_links(self, response):
         base_url = None
         if self.restrict_xpaths:
-            ss = Selector(response)
+            sel = Selector(response)
             base_url = get_base_url(response)
             body = u''.join(f
                             for x in self.restrict_xpaths
-                            for f in ss.xpath(x).extract()
+                            for f in sel.xpath(x).extract()
                             ).encode(response.encoding)
         else:
             body = response.body
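For context on the sgml.py branch being touched: ``restrict_xpaths`` limits link extraction to the regions matched by the given XPaths. A hedged usage sketch (the XPath is invented)::

    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

    # Only follow links found inside a hypothetical main-content div;
    # internally this builds a Selector over the response, extracts the
    # matching regions, and runs link extraction on that subset only.
    lx = SgmlLinkExtractor(restrict_xpaths=['//div[@id="content"]'])
    # links = lx.extract_links(response)  # 'response' being an HtmlResponse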
diff --git a/scrapy/shell.py b/scrapy/shell.py
index 8b6c209cb..8f60b58a5 100644
--- a/scrapy/shell.py
+++ b/scrapy/shell.py
@@ -17,7 +17,7 @@ from scrapy.utils.misc import load_object
 from scrapy.utils.response import open_in_browser
 from scrapy.utils.console import start_python_console
 from scrapy.settings import Settings
-from scrapy.http import Request, Response, HtmlResponse, XmlResponse
+from scrapy.http import Request, Response
 from scrapy.exceptions import IgnoreRequest
 
 
@@ -95,7 +95,7 @@ class Shell(object):
         self.vars['spider'] = spider
         self.vars['request'] = request
         self.vars['response'] = response
-        self.vars['ss'] = Selector(response)
+        self.vars['sel'] = Selector(response)
         if self.inthread:
             self.vars['fetch'] = self.fetch
             self.vars['view'] = open_in_browser
diff --git a/scrapy/tests/test_command_shell.py b/scrapy/tests/test_command_shell.py
index 8dbaa632f..1fbb4b28a 100644
--- a/scrapy/tests/test_command_shell.py
+++ b/scrapy/tests/test_command_shell.py
@@ -31,7 +31,7 @@ class ShellTest(ProcessTest, SiteTest, unittest.TestCase):
 
     @defer.inlineCallbacks
     def test_response_selector_html(self):
-        xpath = 'ss.xpath("//p[@class=\'one\']/text()").extract()[0]'
+        xpath = 'sel.xpath("//p[@class=\'one\']/text()").extract()[0]'
         _, out, _ = yield self.execute([self.url('/html'), '-c', xpath])
         self.assertEqual(out.strip(), 'Works')
 
diff --git a/scrapy/tests/test_selector.py b/scrapy/tests/test_selector.py
index 4149b837c..d8ff6d346 100644
--- a/scrapy/tests/test_selector.py
+++ b/scrapy/tests/test_selector.py
@@ -16,31 +16,31 @@ class SelectorTestCase(unittest.TestCase):
         """Simple selector tests"""
         body = "<p><input name='a' value='1'/><input name='b' value='2'/></p>"
         response = TextResponse(url="http://example.com", body=body)
-        ss = self.sscls(response)
+        sel = self.sscls(response)
 
-        xl = ss.xpath('//input')
+        xl = sel.xpath('//input')
         self.assertEqual(2, len(xl))
         for x in xl:
             assert isinstance(x, self.sscls)
 
-        self.assertEqual(ss.xpath('//input').extract(),
-                         [x.extract() for x in ss.xpath('//input')])
+        self.assertEqual(sel.xpath('//input').extract(),
+                         [x.extract() for x in sel.xpath('//input')])
 
-        self.assertEqual([x.extract() for x in ss.xpath("//input[@name='a']/@name")],
+        self.assertEqual([x.extract() for x in sel.xpath("//input[@name='a']/@name")],
                          [u'a'])
-        self.assertEqual([x.extract() for x in ss.xpath("number(concat(//input[@name='a']/@value, //input[@name='b']/@value))")],
+        self.assertEqual([x.extract() for x in sel.xpath("number(concat(//input[@name='a']/@value, //input[@name='b']/@value))")],
                          [u'12.0'])
-        self.assertEqual(ss.xpath("concat('xpath', 'rules')").extract(),
+        self.assertEqual(sel.xpath("concat('xpath', 'rules')").extract(),
                          [u'xpathrules'])
-        self.assertEqual([x.extract() for x in ss.xpath("concat(//input[@name='a']/@value, //input[@name='b']/@value)")],
+        self.assertEqual([x.extract() for x in sel.xpath("concat(//input[@name='a']/@value, //input[@name='b']/@value)")],
                          [u'12'])
 
     def test_select_unicode_query(self):
         body = u"<p><input name='\xa9' value='1'/></p>"
         response = TextResponse(url="http://example.com", body=body, encoding='utf8')
-        ss = self.sscls(response)
-        self.assertEqual(ss.xpath(u'//input[@name="\xa9"]/@value').extract(), [u'1'])
+        sel = self.sscls(response)
+        self.assertEqual(sel.xpath(u'//input[@name="\xa9"]/@value').extract(), [u'1'])
 
     def test_list_elements_type(self):
         """Test Selector returning the same type in selection methods"""
@@ -69,14 +69,14 @@ class SelectorTestCase(unittest.TestCase):
 
     def test_flavor_detection(self):
         text = '<div><img src="a.jpg"><p>Hello</div>'
-        ss = self.sscls(XmlResponse('http://example.com', body=text))
-        self.assertEqual(ss.contenttype, 'xml')
-        self.assertEqual(ss.xpath("//div").extract(),
+        sel = self.sscls(XmlResponse('http://example.com', body=text))
+        self.assertEqual(sel.contenttype, 'xml')
+        self.assertEqual(sel.xpath("//div").extract(),
                          [u'<div><img src="a.jpg"><p>Hello</p></div>'])
 
-        ss = self.sscls(HtmlResponse('http://example.com', body=text))
-        self.assertEqual(ss.contenttype, 'html')
-        self.assertEqual(ss.xpath("//div").extract(),
+        sel = self.sscls(HtmlResponse('http://example.com', body=text))
+        self.assertEqual(sel.contenttype, 'html')
+        self.assertEqual(sel.xpath("//div").extract(),
                          [u'<div><img src="a.jpg"><p>Hello</p></div>'])
 
     def test_nested_selectors(self):
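The flavor-detection behavior asserted above is easy to reproduce interactively; a minimal sketch grounded in those assertions::

    from scrapy.http import HtmlResponse, XmlResponse
    from scrapy.selector import Selector

    text = '<div><img src="a.jpg"><p>Hello</div>'
    # The selector picks its parser from the response class it is given.
    print Selector(XmlResponse('http://example.com', body=text)).contenttype   # 'xml'
    print Selector(HtmlResponse('http://example.com', body=text)).contenttype  # 'html'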
@@ -110,13 +110,13 @@ class SelectorTestCase(unittest.TestCase):
                     <div id="1">not<span>me</span></div>
                     <div id="2">text</div>
                     <div id="3">foo</div>'''
-        ss = self.sscls(text=body)
-        self.assertEqual(ss.xpath('//div[@id="1"]').css('span::text').extract(), [u'me'])
-        self.assertEqual(ss.css('#1').xpath('./span/text()').extract(), [u'me'])
+        sel = self.sscls(text=body)
+        self.assertEqual(sel.xpath('//div[@id="1"]').css('span::text').extract(), [u'me'])
+        self.assertEqual(sel.css('#1').xpath('./span/text()').extract(), [u'me'])
 
     def test_dont_strip(self):
-        hxs = self.sscls(text='<div>fff: <a href="#">zzz</a></div>')
-        self.assertEqual(hxs.xpath("//text()").extract(), [u'fff: ', u'zzz'])
+        sel = self.sscls(text='<div>fff: <a href="#">zzz</a></div>')
+        self.assertEqual(sel.xpath("//text()").extract(), [u'fff: ', u'zzz'])
 
     def test_namespaces_simple(self):
         body = """<?xml version="1.0" encoding="UTF-8"?>
@@ -279,10 +279,10 @@ class SelectorTestCase(unittest.TestCase):
           </entry>
         </feed>
         """
-        xxs = self.sscls(XmlResponse("http://example.com/feed.atom", body=xml))
-        self.assertEqual(len(xxs.xpath("//link")), 0)
-        xxs.remove_namespaces()
-        self.assertEqual(len(xxs.xpath("//link")), 2)
+        sel = self.sscls(XmlResponse("http://example.com/feed.atom", body=xml))
+        self.assertEqual(len(sel.xpath("//link")), 0)
+        sel.remove_namespaces()
+        self.assertEqual(len(sel.xpath("//link")), 2)
 
     def test_remove_attributes_namespaces(self):
         xml = """<?xml version="1.0" encoding="UTF-8"?>
@@ -291,10 +291,10 @@ class SelectorTestCase(unittest.TestCase):
           </entry>
         </feed>
         """
-        xxs = self.sscls(XmlResponse("http://example.com/feed.atom", body=xml))
-        self.assertEqual(len(xxs.xpath("//link/@type")), 0)
-        xxs.remove_namespaces()
-        self.assertEqual(len(xxs.xpath("//link/@type")), 2)
+        sel = self.sscls(XmlResponse("http://example.com/feed.atom", body=xml))
+        self.assertEqual(len(sel.xpath("//link/@type")), 0)
+        sel.remove_namespaces()
+        self.assertEqual(len(sel.xpath("//link/@type")), 2)
 
 
 class DeprecatedXpathSelectorTest(unittest.TestCase):
diff --git a/scrapy/tests/test_selector_csstranslator.py b/scrapy/tests/test_selector_csstranslator.py
index 45ef91c19..7ef9003aa 100644
--- a/scrapy/tests/test_selector_csstranslator.py
+++ b/scrapy/tests/test_selector_csstranslator.py
@@ -120,16 +120,16 @@ class CSSSelectorTest(unittest.TestCase):
 
     def setUp(self):
         self.htmlresponse = HtmlResponse('http://example.com', body=HTMLBODY)
-        self.ss = self.sscls(self.htmlresponse)
+        self.sel = self.sscls(self.htmlresponse)
 
     def x(self, *a, **kw):
-        return [v.strip() for v in self.ss.css(*a, **kw).extract() if v.strip()]
+        return [v.strip() for v in self.sel.css(*a, **kw).extract() if v.strip()]
 
     def test_selector_simple(self):
-        for x in self.ss.css('input'):
-            self.assertTrue(isinstance(x, self.ss.__class__), x)
-        self.assertEqual(self.ss.css('input').extract(),
-                         [x.extract() for x in self.ss.css('input')])
+        for x in self.sel.css('input'):
+            self.assertTrue(isinstance(x, self.sel.__class__), x)
+        self.assertEqual(self.sel.css('input').extract(),
+                         [x.extract() for x in self.sel.css('input')])
 
     def test_text_pseudo_element(self):
         self.assertEqual(self.x('#p-b2'), [u'guy'])
@@ -147,7 +147,7 @@ class CSSSelectorTest(unittest.TestCase):
         self.assertEqual(self.x('map[name="dummymap"] ::attr(shape)'), [u'circle', u'default'])
 
     def test_nested_selector(self):
-        self.assertEqual(self.ss.css('p').css('b::text').extract(),
+        self.assertEqual(self.sel.css('p').css('b::text').extract(),
                          [u'hi', u'guy'])
-        self.assertEqual(self.ss.css('div').css('area:last-child').extract(),
+        self.assertEqual(self.sel.css('div').css('area:last-child').extract(),
                          [u'<area shape="default" id="area-nohref">'])
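The namespace tests above summarize the API nicely; a self-contained sketch grounded in their assertions (feed body abbreviated to the essentials)::

    from scrapy.http import XmlResponse
    from scrapy.selector import Selector

    xml = ('<?xml version="1.0" encoding="UTF-8"?>'
           '<feed xmlns="http://www.w3.org/2005/Atom">'
           '<link type="text/html"/>'
           '<entry><link type="text/html"/></entry>'
           '</feed>')
    sel = Selector(XmlResponse('http://example.com/feed.atom', body=xml))
    print len(sel.xpath('//link'))   # 0: the elements live in the Atom namespace
    sel.remove_namespaces()
    print len(sel.xpath('//link'))   # 2: plain names match once namespaces are gone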