1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-23 08:03:53 +00:00

Response.follow

This commit is contained in:
Mikhail Korobov 2017-02-15 04:39:47 +05:00
parent 608c3f0c45
commit 2674f317df
3 changed files with 143 additions and 117 deletions

View File

@ -6,7 +6,9 @@ See documentation in docs/topics/request-response.rst
"""
from six.moves.urllib.parse import urljoin
from scrapy.http.request import Request
from scrapy.http.headers import Headers
from scrapy.link import Link
from scrapy.utils.trackref import object_ref
from scrapy.http.common import obsolete_setter
from scrapy.exceptions import NotSupported
@ -101,3 +103,30 @@ class Response(object_ref):
is text (subclasses of TextResponse).
"""
raise NotSupported("Response content isn't text")
def follow(self, url, callback=None, method='GET', headers=None, body=None,
           cookies=None, meta=None, encoding='utf-8', priority=0,
           dont_filter=False, errback=None):
    # type: (...) -> Request
    """
    Return a scrapy.Request instance to follow a link ``url``.

    ``url`` can be:

    * absolute URL;
    * relative URL;
    * scrapy.link.Link object.

    The URL is passed through ``self.urljoin`` before the request is built;
    all other arguments are handed to the ``Request`` constructor unchanged.

    Raises ``ValueError`` if ``url`` is ``None``.
    """
    if url is None:
        # NOTE(review): joining a missing URL with the base presumably yields
        # the base URL itself, which would silently turn a missing link into
        # a request for the current page. Fail loudly instead.
        raise ValueError("url can't be None")
    if isinstance(url, Link):
        # Link objects carry the target address in their ``url`` attribute.
        url = url.url
    url = self.urljoin(url)
    return Request(url, callback,
                   method=method,
                   headers=headers,
                   body=body,
                   cookies=cookies,
                   meta=meta,
                   encoding=encoding,
                   priority=priority,
                   dont_filter=dont_filter,
                   errback=errback)

View File

@ -140,25 +140,22 @@ class TextResponse(Response):
* a Selector for ``<a>`` element, e.g.
``response.css('a.my_link')[0]``.
"""
if isinstance(url, Link):
url = url.url
elif isinstance(url, parsel.Selector):
if isinstance(url, parsel.Selector):
url = _url_from_selector(url)
elif isinstance(url, parsel.SelectorList):
raise ValueError("SelectorList is not supported")
encoding = self.encoding if encoding is None else encoding
url = self.urljoin(url)
return Request(url, callback,
method=method,
headers=headers,
body=body,
cookies=cookies,
meta=meta,
encoding=encoding,
priority=priority,
dont_filter=dont_filter,
errback=errback)
return super(TextResponse, self).follow(url, callback,
method=method,
headers=headers,
body=body,
cookies=cookies,
meta=meta,
encoding=encoding,
priority=priority,
dont_filter=dont_filter,
errback=errback
)
def _url_from_selector(sel):

View File

@ -143,6 +143,38 @@ class BaseResponseTest(unittest.TestCase):
r.css('body')
r.xpath('//body')
def test_follow_url_absolute(self):
    """An absolute URL string is expected to be followed unchanged."""
    absolute_url = 'http://foo.example.com'
    self._assert_followed_url(absolute_url, absolute_url)
def test_follow_url_relative(self):
    """A relative URL string is expected to be resolved to an absolute one."""
    relative_url = 'foo'
    resolved_url = 'http://example.com/foo'
    self._assert_followed_url(relative_url, resolved_url)
def test_follow_link(self):
    """A ``Link`` object is expected to be followed using the URL it holds."""
    link = Link('http://example.com/foo')
    self._assert_followed_url(link, 'http://example.com/foo')
def test_follow_whitespace_url(self):
    """A trailing space in a URL string is expected to become ``%20``."""
    url_with_space = 'foo '
    escaped_url = 'http://example.com/foo%20'
    self._assert_followed_url(url_with_space, escaped_url)
def test_follow_whitespace_link(self):
    """A trailing space in a ``Link`` URL is expected to become ``%20``."""
    link = Link('http://example.com/foo ')
    self._assert_followed_url(link, 'http://example.com/foo%20')
def _assert_followed_url(self, follow_obj, target_url, response=None):
    """Follow ``follow_obj`` and check the URL of the resulting request.

    When ``response`` is not given, one is created with
    ``self._links_response()``. The request returned by ``follow`` is
    handed back so callers can make further assertions on it.
    """
    source = self._links_response() if response is None else response
    request = source.follow(follow_obj)
    self.assertEqual(request.url, target_url)
    return request
def _links_response(self):
    """Build a response from the ``sgml_linkextractor.html`` test data."""
    html = get_testdata('link_extractor', 'sgml_linkextractor.html')
    return self.response_class('http://example.com/index', body=html)
class TextResponseTest(BaseResponseTest):
@ -354,16 +386,81 @@ class TextResponseTest(BaseResponseTest):
absolute = 'http://www.example.com/elsewhere/test'
self.assertEqual(joined, absolute)
def test_follow_selector(self):
    """Selectors for ``<a>`` elements and ``href`` attributes can be followed.

    A selector for a non-``<a>`` element must be rejected with ``ValueError``.
    """
    resp = self._links_response()
    expected_urls = (
        'http://example.com/sample2.html',
        'http://example.com/sample3.html',
        'http://example.com/sample3.html',
        'http://www.google.com/something',
        'http://example.com/innertag.html',
    )

    # select <a> elements
    element_lists = (resp.css('a'), resp.xpath('//a'))
    for selectors in element_lists:
        for selector, expected in zip(selectors, expected_urls):
            self._assert_followed_url(selector, expected, response=resp)

    # href attributes should work
    attribute_lists = (resp.css('a::attr(href)'), resp.xpath('//a/@href'))
    for selectors in attribute_lists:
        for selector, expected in zip(selectors, expected_urls):
            self._assert_followed_url(selector, expected, response=resp)

    # non-a elements are not supported
    div = resp.css('div')[0]
    with self.assertRaises(ValueError):
        resp.follow(div)
def test_follow_selector_list(self):
    """Following a whole selector list must raise a ``ValueError``.

    The error message is expected to mention ``SelectorList``.
    """
    resp = self._links_response()
    anchors = resp.css('a')
    self.assertRaisesRegex(ValueError, 'SelectorList', resp.follow, anchors)
def test_follow_selector_attribute(self):
    """``src`` attribute selectors of ``<img>`` elements can be followed."""
    resp = self._links_response()
    for src in resp.css('img::attr(src)'):
        # Follow against the response the selector was taken from, like the
        # other selector tests, instead of letting the helper build a fresh
        # response on every iteration.
        self._assert_followed_url(src, 'http://example.com/sample2.jpg',
                                  response=resp)
def test_follow_whitespace_selector(self):
    """Whitespace around an ``href`` value is expected to be dropped.

    Checked both for the ``<a>`` element selector and for the selector of
    its ``href`` attribute.
    """
    resp = self.response_class(
        'http://example.com',
        body=b'''<html><body><a href=" foo\n">click me</a></body></html>'''
    )
    for query in ('a', 'a::attr(href)'):
        self._assert_followed_url(resp.css(query)[0],
                                  'http://example.com/foo',
                                  response=resp)
def test_follow_encoding(self):
    """Followed requests are expected to carry the response encoding.

    The same non-ASCII query string must be percent-encoded according to
    the encoding the response was created with.
    """
    expectations = (
        ('utf8', 'http://example.com/foo?%D0%BF%D1%80%D0%B8%D0%B2%D0%B5%D1%82'),
        ('cp1251', 'http://example.com/foo?%EF%F0%E8%E2%E5%F2'),
    )
    for codec, escaped_url in expectations:
        page = self.response_class(
            'http://example.com',
            encoding=codec,
            body='<html><body><a href="foo?привет">click me</a></body></html>'.encode(codec)
        )
        request = self._assert_followed_url(
            page.css('a')[0],
            escaped_url,
            response=page,
        )
        self.assertEqual(request.encoding, codec)
class HtmlResponseTest(TextResponseTest):
response_class = HtmlResponse
def _links_response(self):
    """Create a response whose body is the ``sgml_linkextractor.html`` data."""
    page_body = get_testdata('link_extractor', 'sgml_linkextractor.html')
    return self.response_class('http://example.com/index', body=page_body)
def test_html_encoding(self):
body = b"""<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
@ -396,103 +493,6 @@ class HtmlResponseTest(TextResponseTest):
r1 = self.response_class("http://www.example.com", body=body)
self._assert_response_values(r1, 'gb2312', body)
def assert_followed_url(self, follow_obj, target_url, response=None):
    """Follow ``follow_obj`` and check the URL of the resulting request.

    When ``response`` is not given, one is created with
    ``self._links_response()``. The request returned by ``follow`` is
    handed back to the caller.
    """
    source = self._links_response() if response is None else response
    request = source.follow(follow_obj)
    self.assertEqual(request.url, target_url)
    return request
def test_follow_url_absolute(self):
    """An absolute URL string is expected to be followed unchanged."""
    absolute_url = 'http://foo.example.com'
    self.assert_followed_url(absolute_url, absolute_url)
def test_follow_url_relative(self):
    """A relative URL string is expected to be resolved to an absolute one."""
    relative_url = 'foo'
    resolved_url = 'http://example.com/foo'
    self.assert_followed_url(relative_url, resolved_url)
def test_follow_link(self):
    """A ``Link`` object is expected to be followed using the URL it holds."""
    link = Link('http://example.com/foo')
    self.assert_followed_url(link, 'http://example.com/foo')
def test_follow_selector(self):
    """Selectors for ``<a>`` elements and ``href`` attributes can be followed.

    A selector for a non-``<a>`` element must be rejected with ``ValueError``.
    """
    resp = self._links_response()
    expected_urls = (
        'http://example.com/sample2.html',
        'http://example.com/sample3.html',
        'http://example.com/sample3.html',
        'http://www.google.com/something',
        'http://example.com/innertag.html',
    )

    # select <a> elements
    element_lists = (resp.css('a'), resp.xpath('//a'))
    for selectors in element_lists:
        for selector, expected in zip(selectors, expected_urls):
            self.assert_followed_url(selector, expected, response=resp)

    # href attributes should work
    attribute_lists = (resp.css('a::attr(href)'), resp.xpath('//a/@href'))
    for selectors in attribute_lists:
        for selector, expected in zip(selectors, expected_urls):
            self.assert_followed_url(selector, expected, response=resp)

    # non-a elements are not supported
    div = resp.css('div')[0]
    with self.assertRaises(ValueError):
        resp.follow(div)
def test_follow_selector_list(self):
    """Following a whole selector list must raise a ``ValueError``.

    The error message is expected to mention ``SelectorList``.
    """
    resp = self._links_response()
    anchors = resp.css('a')
    self.assertRaisesRegex(ValueError, 'SelectorList', resp.follow, anchors)
def test_follow_selector_attribute(self):
    """``src`` attribute selectors of ``<img>`` elements can be followed."""
    resp = self._links_response()
    for src in resp.css('img::attr(src)'):
        # Follow against the response the selector was taken from, like the
        # other selector tests, instead of letting the helper build a fresh
        # response on every iteration.
        self.assert_followed_url(src, 'http://example.com/sample2.jpg',
                                 response=resp)
def test_follow_whitespace_url(self):
    """A trailing space in a URL string is expected to become ``%20``."""
    url_with_space = 'foo '
    escaped_url = 'http://example.com/foo%20'
    self.assert_followed_url(url_with_space, escaped_url)
def test_follow_whitespace_link(self):
    """A trailing space in a ``Link`` URL is expected to become ``%20``."""
    link = Link('http://example.com/foo ')
    self.assert_followed_url(link, 'http://example.com/foo%20')
def test_follow_whitespace_selector(self):
    """Whitespace around an ``href`` value is expected to be dropped.

    Checked both for the ``<a>`` element selector and for the selector of
    its ``href`` attribute.
    """
    resp = self.response_class(
        'http://example.com',
        body=b'''<html><body><a href=" foo\n">click me</a></body></html>'''
    )
    for query in ('a', 'a::attr(href)'):
        self.assert_followed_url(resp.css(query)[0],
                                 'http://example.com/foo',
                                 response=resp)
def test_follow_encoding(self):
    """Followed requests are expected to carry the response encoding.

    The same non-ASCII query string must be percent-encoded according to
    the encoding the response was created with.
    """
    expectations = (
        ('utf8', 'http://example.com/foo?%D0%BF%D1%80%D0%B8%D0%B2%D0%B5%D1%82'),
        ('cp1251', 'http://example.com/foo?%EF%F0%E8%E2%E5%F2'),
    )
    for codec, escaped_url in expectations:
        page = self.response_class(
            'http://example.com',
            encoding=codec,
            body='<html><body><a href="foo?привет">click me</a></body></html>'.encode(codec)
        )
        request = self.assert_followed_url(
            page.css('a')[0],
            escaped_url,
            response=page,
        )
        self.assertEqual(request.encoding, codec)
class XmlResponseTest(TextResponseTest):