response.selector, response.xpath(), response.css() and response.re()

2025-02-23 16:24:18 +00:00 · 2014-04-10 04:11:23 +06:00 · 2014-04-10 04:11:23 +06:00 · 134bd8a9e0
commit 134bd8a9e0
parent 21d073d03b
2 changed files with 83 additions and 1 deletions
--- a/scrapy/http/response/text.py
+++ b/scrapy/http/response/text.py
@ -19,6 +19,7 @@ class TextResponse(Response):
        self._encoding = kwargs.pop('encoding', None)
        self._cached_benc = None
        self._cached_ubody = None
+        self._cached_selector = None
        super(TextResponse, self).__init__(*args, **kwargs)

    def _set_url(self, url):
@ -88,3 +89,19 @@ class TextResponse(Response):
    @memoizemethod_noargs
    def _body_declared_encoding(self):
        return html_body_declared_encoding(self.body)
+
+    @property
+    def selector(self):
+        """Return a Selector built from this response, created on first access.
+
+        The instance is stored in ``self._cached_selector`` so that every
+        later access returns the same object instead of building a new one.
+        """
+        # Imported at call time rather than at module level -- presumably to
+        # avoid a circular import with scrapy.selector; TODO confirm.
+        from scrapy.selector import Selector
+        if self._cached_selector is None:
+            self._cached_selector = Selector(self)
+        return self._cached_selector
+
+    def xpath(self, query):
+        """Shortcut for ``self.selector.xpath(query)``."""
+        return self.selector.xpath(query)
+
+    def css(self, query):
+        """Shortcut for ``self.selector.css(query)``."""
+        return self.selector.css(query)
+
+    def re(self, regex):
+        """Shortcut for ``self.selector.re(regex)``."""
+        return self.selector.re(regex)
--- a/scrapy/tests/test_http_response.py
+++ b/scrapy/tests/test_http_response.py
@ -2,6 +2,7 @@ import unittest

 from w3lib.encoding import resolve_encoding
 from scrapy.http import Request, Response, TextResponse, HtmlResponse, XmlResponse, Headers
+from scrapy.selector import Selector


 class BaseResponseTest(unittest.TestCase):
@ -112,6 +113,7 @@ class BaseResponseTest(unittest.TestCase):
        self.assertRaises(AttributeError, setattr, r, 'url', 'http://example2.com')
        self.assertRaises(AttributeError, setattr, r, 'body', 'xxx')

+
 class ResponseText(BaseResponseTest):

    def test_no_unicode_url(self):
@ -258,13 +260,52 @@ class TextResponseTest(BaseResponseTest):
        #r = self.response_class("http://www.example.com", body='PREFIX\xe3\xabSUFFIX')
        #assert u'\ufffd' in r.body_as_unicode(), repr(r.body_as_unicode())

+    def test_selector(self):
+        # response.selector must be a Selector of type 'html' that is bound to
+        # this response, is created once and reused, and answers xpath, css
+        # and regex queries against the body.
+        body = "<html><head><title>Some page</title><body></body></html>"
+        response = self.response_class("http://www.example.com", body=body)
+
+        self.assertIsInstance(response.selector, Selector)
+        self.assertEqual(response.selector.type, 'html')
+        self.assertIs(response.selector, response.selector)  # property is cached
+        self.assertIs(response.selector.response, response)
+
+        self.assertEqual(
+            response.selector.xpath("//title/text()").extract(),
+            [u'Some page']
+        )
+        self.assertEqual(
+            response.selector.css("title::text").extract(),
+            [u'Some page']
+        )
+        self.assertEqual(
+            response.selector.re("Some (.*)</title>"),
+            [u'page']
+        )
+
+    def test_selector_shortcuts(self):
+        # The xpath/css/re shortcuts on the response must give the same
+        # results as the equivalent calls made through response.selector.
+        body = "<html><head><title>Some page</title><body></body></html>"
+        response = self.response_class("http://www.example.com", body=body)
+
+        self.assertEqual(
+            response.xpath("//title/text()").extract(),
+            response.selector.xpath("//title/text()").extract(),
+        )
+        self.assertEqual(
+            response.css("title::text").extract(),
+            response.selector.css("title::text").extract(),
+        )
+        self.assertEqual(
+            response.re("Some (.*)</title>"),
+            response.selector.re("Some (.*)</title>"),
+        )
+

 class HtmlResponseTest(TextResponseTest):

    response_class = HtmlResponse

    def test_html_encoding(self):
-        
+
        body = """<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
        </head><body>Price: \xa3100</body></html>'
        """
@ -328,6 +369,30 @@ class XmlResponseTest(TextResponseTest):
        self._assert_response_values(r6, 'iso-8859-1', body2)
        self._assert_response_values(r7, 'utf-8', body2)

+    def test_selector(self):
+        # XML variant: response.selector must be a Selector of type 'xml'
+        # that is bound to this response, is created once and reused, and
+        # answers xpath queries against the XML body.
+        body = '<?xml version="1.0" encoding="utf-8"?><xml><elem>value</elem></xml>'
+        response = self.response_class("http://www.example.com", body=body)
+
+        self.assertIsInstance(response.selector, Selector)
+        self.assertEqual(response.selector.type, 'xml')
+        self.assertIs(response.selector, response.selector)  # property is cached
+        self.assertIs(response.selector.response, response)
+
+        self.assertEqual(
+            response.selector.xpath("//elem/text()").extract(),
+            [u'value']
+        )
+
+    def test_selector_shortcuts(self):
+        # XML variant: the xpath shortcut on the response must give the same
+        # result as the equivalent call made through response.selector.
+        body = '<?xml version="1.0" encoding="utf-8"?><xml><elem>value</elem></xml>'
+        response = self.response_class("http://www.example.com", body=body)
+
+        self.assertEqual(
+            response.xpath("//elem/text()").extract(),
+            response.selector.xpath("//elem/text()").extract(),
+        )
+
+

 if __name__ == "__main__":
    unittest.main()