2009-02-10 06:20:43 +00:00
|
|
|
import unittest
|
|
|
|
|
2015-04-20 21:23:05 -03:00
|
|
|
from scrapy.downloadermiddlewares.redirect import RedirectMiddleware, MetaRefreshMiddleware
|
2015-05-09 04:20:09 -03:00
|
|
|
from scrapy.spiders import Spider
|
2010-08-10 17:36:48 -03:00
|
|
|
from scrapy.exceptions import IgnoreRequest
|
2013-01-08 09:55:16 -02:00
|
|
|
from scrapy.http import Request, Response, HtmlResponse
|
2012-08-21 17:27:45 -03:00
|
|
|
from scrapy.utils.test import get_crawler
|
2009-02-10 06:20:43 +00:00
|
|
|
|
2013-01-08 09:55:16 -02:00
|
|
|
|
2009-02-10 06:20:43 +00:00
|
|
|
class RedirectMiddlewareTest(unittest.TestCase):
    """Exercise ``RedirectMiddleware.process_response`` with 3xx responses."""

    def setUp(self):
        # One crawler is shared by the spider and the middleware; it is kept
        # on ``self`` because test_spider_handling creates a second spider.
        self.crawler = get_crawler(Spider)
        self.spider = self.crawler._create_spider('foo')
        self.mw = RedirectMiddleware.from_crawler(self.crawler)

    def test_priority_adjust(self):
        # The request produced by a redirect must outrank the original one.
        req = Request('http://a.com')
        rsp = Response('http://a.com',
                       headers={'Location': 'http://a.com/redirected'},
                       status=301)
        req2 = self.mw.process_response(req, rsp, self.spider)
        self.assertGreater(req2.priority, req.priority)

    def test_redirect_301(self):
        # A 301 keeps the original HTTP method for GET, POST and HEAD.
        def _test(method):
            url = 'http://www.example.com/301'
            url2 = 'http://www.example.com/redirected'
            req = Request(url, method=method)
            rsp = Response(url, headers={'Location': url2}, status=301)

            req2 = self.mw.process_response(req, rsp, self.spider)
            self.assertIsInstance(req2, Request)
            self.assertEqual(req2.url, url2)
            self.assertEqual(req2.method, method)

            # response without Location header but with status code is 3XX
            # should be ignored
            del rsp.headers['Location']
            self.assertIs(self.mw.process_response(req, rsp, self.spider), rsp)

        for method in ('GET', 'POST', 'HEAD'):
            _test(method)

    def test_dont_redirect(self):
        url = 'http://www.example.com/301'
        url2 = 'http://www.example.com/redirected'

        # dont_redirect=True: a 301 carrying a Location header is handed
        # back unchanged.
        req = Request(url, meta={'dont_redirect': True})
        rsp = Response(url, headers={'Location': url2}, status=301)
        r = self.mw.process_response(req, rsp, self.spider)
        self.assertIsInstance(r, Response)
        self.assertIs(r, rsp)

        # dont_redirect=False: a non-redirect (200) response still passes
        # through untouched.
        req = Request(url, meta={'dont_redirect': False})
        rsp = Response(url2, status=200)
        r = self.mw.process_response(req, rsp, self.spider)
        self.assertIsInstance(r, Response)
        self.assertIs(r, rsp)

        # dont_redirect=False: the same 301 that was ignored above is now
        # turned into a request for the Location target.
        req = Request(url, meta={'dont_redirect': False})
        rsp = Response(url, headers={'Location': url2}, status=301)
        r = self.mw.process_response(req, rsp, self.spider)
        self.assertIsInstance(r, Request)
        self.assertEqual(r.url, url2)

    def test_redirect_302(self):
        # A 302 answer to a POST becomes a GET without body or the
        # Content-Type / Content-Length headers.
        url = 'http://www.example.com/302'
        url2 = 'http://www.example.com/redirected2'
        req = Request(url, method='POST', body='test',
                      headers={'Content-Type': 'text/plain',
                               'Content-length': '4'})
        rsp = Response(url, headers={'Location': url2}, status=302)

        req2 = self.mw.process_response(req, rsp, self.spider)
        self.assertIsInstance(req2, Request)
        self.assertEqual(req2.url, url2)
        self.assertEqual(req2.method, 'GET')
        self.assertNotIn(
            'Content-Type', req2.headers,
            "Content-Type header must not be present in redirected request")
        self.assertNotIn(
            'Content-Length', req2.headers,
            "Content-Length header must not be present in redirected request")
        self.assertFalse(
            req2.body,
            "Redirected body must be empty, not '%s'" % req2.body)

        # response without Location header but with status code is 3XX
        # should be ignored
        del rsp.headers['Location']
        self.assertIs(self.mw.process_response(req, rsp, self.spider), rsp)

    def test_redirect_302_head(self):
        # Unlike POST, a HEAD request keeps its method across a 302.
        url = 'http://www.example.com/302'
        url2 = 'http://www.example.com/redirected2'
        req = Request(url, method='HEAD')
        rsp = Response(url, headers={'Location': url2}, status=302)

        req2 = self.mw.process_response(req, rsp, self.spider)
        self.assertIsInstance(req2, Request)
        self.assertEqual(req2.url, url2)
        self.assertEqual(req2.method, 'HEAD')

        # response without Location header but with status code is 3XX
        # should be ignored
        del rsp.headers['Location']
        self.assertIs(self.mw.process_response(req, rsp, self.spider), rsp)

    def test_max_redirect_times(self):
        # With a limit of one, the first redirect succeeds and the second
        # raises IgnoreRequest.
        self.mw.max_redirect_times = 1
        req = Request('http://scrapytest.org/302')
        rsp = Response('http://scrapytest.org/302',
                       headers={'Location': '/redirected'}, status=302)

        req = self.mw.process_response(req, rsp, self.spider)
        self.assertIsInstance(req, Request)
        self.assertIn('redirect_times', req.meta)
        self.assertEqual(req.meta['redirect_times'], 1)
        self.assertRaises(IgnoreRequest, self.mw.process_response,
                          req, rsp, self.spider)

    def test_ttl(self):
        # A per-request redirect_ttl of 1 allows a single redirect even
        # though max_redirect_times is far higher.
        self.mw.max_redirect_times = 100
        req = Request('http://scrapytest.org/302', meta={'redirect_ttl': 1})
        rsp = Response('http://www.scrapytest.org/302',
                       headers={'Location': '/redirected'}, status=302)

        req = self.mw.process_response(req, rsp, self.spider)
        self.assertIsInstance(req, Request)
        self.assertRaises(IgnoreRequest, self.mw.process_response,
                          req, rsp, self.spider)

    def test_redirect_urls(self):
        # meta['redirect_urls'] accumulates every URL visited before the
        # current request, in order.
        req1 = Request('http://scrapytest.org/first')
        rsp1 = Response('http://scrapytest.org/first',
                        headers={'Location': '/redirected'}, status=302)
        req2 = self.mw.process_response(req1, rsp1, self.spider)
        rsp2 = Response('http://scrapytest.org/redirected',
                        headers={'Location': '/redirected2'}, status=302)
        req3 = self.mw.process_response(req2, rsp2, self.spider)

        self.assertEqual(req2.url, 'http://scrapytest.org/redirected')
        self.assertEqual(req2.meta['redirect_urls'],
                         ['http://scrapytest.org/first'])
        self.assertEqual(req3.url, 'http://scrapytest.org/redirected2')
        self.assertEqual(req3.meta['redirect_urls'],
                         ['http://scrapytest.org/first',
                          'http://scrapytest.org/redirected'])

    def test_spider_handling(self):
        # A spider listing 301 in handle_httpstatus_list receives the
        # redirect response itself.
        smartspider = self.crawler._create_spider('smarty')
        smartspider.handle_httpstatus_list = [404, 301, 302]
        url = 'http://www.example.com/301'
        url2 = 'http://www.example.com/redirected'
        req = Request(url)
        rsp = Response(url, headers={'Location': url2}, status=301)
        r = self.mw.process_response(req, rsp, smartspider)
        self.assertIs(r, rsp)

    def test_request_meta_handling(self):
        # The same opt-out works per request through the
        # handle_httpstatus_list / handle_httpstatus_all meta keys.
        url = 'http://www.example.com/301'
        url2 = 'http://www.example.com/redirected'

        def _test_passthrough(req):
            rsp = Response(url, headers={'Location': url2}, status=301,
                           request=req)
            r = self.mw.process_response(req, rsp, self.spider)
            self.assertIs(r, rsp)

        _test_passthrough(Request(url, meta={'handle_httpstatus_list':
                                             [404, 301, 302]}))
        _test_passthrough(Request(url, meta={'handle_httpstatus_all': True}))
|
|
|
|
|
2015-07-16 12:50:26 +02:00
|
|
|
|
2013-01-08 09:55:16 -02:00
|
|
|
class MetaRefreshMiddlewareTest(unittest.TestCase):
    """Exercise ``MetaRefreshMiddleware.process_response`` with HTML bodies
    that carry a ``<meta http-equiv="refresh">`` tag."""

    def setUp(self):
        crawler = get_crawler(Spider)
        self.spider = crawler._create_spider('foo')
        self.mw = MetaRefreshMiddleware.from_crawler(crawler)

    def _body(self, interval=5, url='http://example.org/newpage'):
        # Build a minimal HTML document whose meta-refresh tag points at
        # ``url`` after ``interval`` seconds.
        template = """<html><head><meta http-equiv="refresh" content="{0};url={1}"/></head></html>"""
        return template.format(interval, url)

    def test_priority_adjust(self):
        # The request produced by a meta refresh must outrank the original.
        req = Request('http://a.com')
        rsp = HtmlResponse(req.url, body=self._body())
        req2 = self.mw.process_response(req, rsp, self.spider)
        self.assertGreater(req2.priority, req.priority)

    def test_meta_refresh(self):
        # A default (5 second) refresh is followed to the target URL.
        req = Request(url='http://example.org')
        rsp = HtmlResponse(req.url, body=self._body())
        req2 = self.mw.process_response(req, rsp, self.spider)
        self.assertIsInstance(req2, Request)
        self.assertEqual(req2.url, 'http://example.org/newpage')

    def test_meta_refresh_with_high_interval(self):
        # meta-refresh with high intervals don't trigger redirects
        req = Request(url='http://example.org')
        rsp = HtmlResponse(url='http://example.org',
                           body=self._body(interval=1000))
        rsp2 = self.mw.process_response(req, rsp, self.spider)
        self.assertIs(rsp2, rsp)

    def test_meta_refresh_trough_posted_request(self):
        # A refresh answering a POST becomes a GET without body or the
        # Content-Type / Content-Length headers.
        req = Request(url='http://example.org', method='POST', body='test',
                      headers={'Content-Type': 'text/plain',
                               'Content-length': '4'})
        rsp = HtmlResponse(req.url, body=self._body())
        req2 = self.mw.process_response(req, rsp, self.spider)

        self.assertIsInstance(req2, Request)
        self.assertEqual(req2.url, 'http://example.org/newpage')
        self.assertEqual(req2.method, 'GET')
        self.assertNotIn(
            'Content-Type', req2.headers,
            "Content-Type header must not be present in redirected request")
        self.assertNotIn(
            'Content-Length', req2.headers,
            "Content-Length header must not be present in redirected request")
        self.assertFalse(
            req2.body,
            "Redirected body must be empty, not '%s'" % req2.body)

    def test_max_redirect_times(self):
        # With a limit of one, the first refresh succeeds and the second
        # raises IgnoreRequest.
        self.mw.max_redirect_times = 1
        req = Request('http://scrapytest.org/max')
        rsp = HtmlResponse(req.url, body=self._body())

        req = self.mw.process_response(req, rsp, self.spider)
        self.assertIsInstance(req, Request)
        self.assertIn('redirect_times', req.meta)
        self.assertEqual(req.meta['redirect_times'], 1)
        self.assertRaises(IgnoreRequest, self.mw.process_response,
                          req, rsp, self.spider)

    def test_ttl(self):
        # A per-request redirect_ttl of 1 allows a single refresh even
        # though max_redirect_times is far higher.
        self.mw.max_redirect_times = 100
        req = Request('http://scrapytest.org/302', meta={'redirect_ttl': 1})
        rsp = HtmlResponse(req.url, body=self._body())

        req = self.mw.process_response(req, rsp, self.spider)
        self.assertIsInstance(req, Request)
        self.assertRaises(IgnoreRequest, self.mw.process_response,
                          req, rsp, self.spider)

    def test_redirect_urls(self):
        # meta['redirect_urls'] accumulates every URL visited before the
        # current request, in order; relative targets are resolved against
        # the responding URL.
        req1 = Request('http://scrapytest.org/first')
        rsp1 = HtmlResponse(req1.url, body=self._body(url='/redirected'))
        req2 = self.mw.process_response(req1, rsp1, self.spider)
        self.assertIsInstance(req2, Request)
        rsp2 = HtmlResponse(req2.url, body=self._body(url='/redirected2'))
        req3 = self.mw.process_response(req2, rsp2, self.spider)
        self.assertIsInstance(req3, Request)

        self.assertEqual(req2.url, 'http://scrapytest.org/redirected')
        self.assertEqual(req2.meta['redirect_urls'],
                         ['http://scrapytest.org/first'])
        self.assertEqual(req3.url, 'http://scrapytest.org/redirected2')
        self.assertEqual(req3.meta['redirect_urls'],
                         ['http://scrapytest.org/first',
                          'http://scrapytest.org/redirected'])
|
|
|
|
|
2009-02-10 06:20:43 +00:00
|
|
|
if __name__ == "__main__":
    # Let the module be executed directly to run both test cases.
    unittest.main()
|