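"""Tests for Scrapy's download handlers: file, HTTP 1.0/1.1, S3 and FTP.

Each handler exposes download_request(request, spider), which returns a
deferred that fires with a Response; the cases below exercise that interface
for every supported URL scheme.
"""
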
import os

import twisted
from twisted.trial import unittest
from twisted.protocols.policies import WrappingFactory
from twisted.python.filepath import FilePath
from twisted.internet import reactor, defer, error
from twisted.web import server, static, util, resource
from twisted.web.test.test_webclient import ForeverTakingResource, \
    NoLengthResource, HostHeaderResource, \
    PayloadResource, BrokenDownloadResource
from twisted.protocols.ftp import FTPRealm, FTPFactory
from twisted.cred import portal, checkers, credentials
from twisted.protocols.ftp import FTPClient, ConnectionLost
from w3lib.url import path_to_file_uri

from scrapy import twisted_version
from scrapy.core.downloader.handlers import DownloadHandlers
from scrapy.core.downloader.handlers.file import FileDownloadHandler
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler, HttpDownloadHandler
from scrapy.core.downloader.handlers.http10 import HTTP10DownloadHandler
from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler
from scrapy.core.downloader.handlers.s3 import S3DownloadHandler
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler

from scrapy.spider import Spider
from scrapy.http import Request
from scrapy.settings import Settings
from scrapy import optional_features
from scrapy.utils.test import get_crawler
from scrapy.exceptions import NotConfigured

from tests.mockserver import MockServer
from tests.spiders import SingleRequestSpider


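# Handler stubs for LoadTestCase below: DummyDH instantiates cleanly and so
# gets loaded, while OffDH opts out by raising NotConfigured.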
class DummyDH(object):

    def __init__(self, crawler):
        pass


class OffDH(object):

    def __init__(self, crawler):
        raise NotConfigured


class LoadTestCase(unittest.TestCase):
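    """Check that DownloadHandlers honours the DOWNLOAD_HANDLERS setting: a
    dotted path enables a scheme, a handler raising NotConfigured is recorded
    as such, and a None value disables the scheme entirely."""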

    def test_enabled_handler(self):
        handlers = {'scheme': 'tests.test_downloader_handlers.DummyDH'}
        crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handlers})
        dh = DownloadHandlers(crawler)
        self.assertIn('scheme', dh._handlers)
        self.assertNotIn('scheme', dh._notconfigured)

    def test_not_configured_handler(self):
        handlers = {'scheme': 'tests.test_downloader_handlers.OffDH'}
        crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handlers})
        dh = DownloadHandlers(crawler)
        self.assertNotIn('scheme', dh._handlers)
        self.assertIn('scheme', dh._notconfigured)

    def test_disabled_handler(self):
        handlers = {'scheme': None}
        crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handlers})
        dh = DownloadHandlers(crawler)
        self.assertNotIn('scheme', dh._handlers)
        self.assertNotIn('scheme', dh._notconfigured)


class FileTestCase(unittest.TestCase):
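    """Exercise FileDownloadHandler against a temporary local file; the '^'
    suffix checks that unsafe characters get percent-escaped in file:// URLs."""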

    def setUp(self):
        self.tmpname = self.mktemp()
        fd = open(self.tmpname + '^', 'w')
        fd.write('0123456789')
        fd.close()
        self.download_request = FileDownloadHandler(Settings()).download_request

    def test_download(self):
        def _test(response):
            self.assertEquals(response.url, request.url)
            self.assertEquals(response.status, 200)
            self.assertEquals(response.body, '0123456789')

        request = Request(path_to_file_uri(self.tmpname + '^'))
        assert request.url.upper().endswith('%5E')
        return self.download_request(request, Spider('foo')).addCallback(_test)

    def test_non_existent(self):
        request = Request('file://%s' % self.mktemp())
        d = self.download_request(request, Spider('foo'))
        return self.assertFailure(d, IOError)


class HttpTestCase(unittest.TestCase):
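    """Base HTTP download tests, run against a local twisted.web site.

    Subclasses override download_handler_cls to exercise the same suite with
    the HTTP 1.0 and HTTP 1.1 handlers.
    """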

    download_handler_cls = HTTPDownloadHandler

    def setUp(self):
        name = self.mktemp()
        os.mkdir(name)
        FilePath(name).child("file").setContent("0123456789")
        r = static.File(name)
        r.putChild("redirect", util.Redirect("/file"))
        r.putChild("wait", ForeverTakingResource())
        r.putChild("hang-after-headers", ForeverTakingResource(write=True))
        r.putChild("nolength", NoLengthResource())
        r.putChild("host", HostHeaderResource())
        r.putChild("payload", PayloadResource())
        r.putChild("broken", BrokenDownloadResource())
        self.site = server.Site(r, timeout=None)
        self.wrapper = WrappingFactory(self.site)
        self.port = reactor.listenTCP(0, self.wrapper, interface='127.0.0.1')
        self.portno = self.port.getHost().port
        self.download_handler = self.download_handler_cls(Settings())
        self.download_request = self.download_handler.download_request

    @defer.inlineCallbacks
    def tearDown(self):
        yield self.port.stopListening()
        if hasattr(self.download_handler, 'close'):
            yield self.download_handler.close()

    def getURL(self, path):
        return "http://127.0.0.1:%d/%s" % (self.portno, path)

    def test_download(self):
        request = Request(self.getURL('file'))
        d = self.download_request(request, Spider('foo'))
        d.addCallback(lambda r: r.body)
        d.addCallback(self.assertEquals, "0123456789")
        return d

    def test_download_head(self):
        request = Request(self.getURL('file'), method='HEAD')
        d = self.download_request(request, Spider('foo'))
        d.addCallback(lambda r: r.body)
        d.addCallback(self.assertEquals, '')
        return d

    def test_redirect_status(self):
        request = Request(self.getURL('redirect'))
        d = self.download_request(request, Spider('foo'))
        d.addCallback(lambda r: r.status)
        d.addCallback(self.assertEquals, 302)
        return d

    def test_redirect_status_head(self):
        request = Request(self.getURL('redirect'), method='HEAD')
        d = self.download_request(request, Spider('foo'))
        d.addCallback(lambda r: r.status)
        d.addCallback(self.assertEquals, 302)
        return d

    @defer.inlineCallbacks
    def test_timeout_download_from_spider(self):
        spider = Spider('foo')
        meta = {'download_timeout': 0.2}
        # client connects but no data is received
        request = Request(self.getURL('wait'), meta=meta)
        d = self.download_request(request, spider)
        yield self.assertFailure(d, defer.TimeoutError, error.TimeoutError)
        # client connects, server sends headers and some body bytes but hangs
        request = Request(self.getURL('hang-after-headers'), meta=meta)
        d = self.download_request(request, spider)
        yield self.assertFailure(d, defer.TimeoutError, error.TimeoutError)

    def test_host_header_not_in_request_headers(self):
        def _test(response):
            self.assertEquals(response.body, '127.0.0.1:%d' % self.portno)
            self.assertEquals(request.headers, {})

        request = Request(self.getURL('host'))
        return self.download_request(request, Spider('foo')).addCallback(_test)

    def test_host_header_set_in_request_headers(self):
        def _test(response):
            self.assertEquals(response.body, 'example.com')
            self.assertEquals(request.headers.get('Host'), 'example.com')

        request = Request(self.getURL('host'), headers={'Host': 'example.com'})
        return self.download_request(request, Spider('foo')).addCallback(_test)

    def test_payload(self):
        body = '1'*100  # PayloadResource requires body length to be 100
        request = Request(self.getURL('payload'), method='POST', body=body)
        d = self.download_request(request, Spider('foo'))
        d.addCallback(lambda r: r.body)
        d.addCallback(self.assertEquals, body)
        return d


class DeprecatedHttpTestCase(HttpTestCase):
    """Deprecated reference to the HTTP 1.0 download handler"""
    download_handler_cls = HttpDownloadHandler


class Http10TestCase(HttpTestCase):
    """HTTP 1.0 test case"""
    download_handler_cls = HTTP10DownloadHandler


class Http11TestCase(HttpTestCase):
    """HTTP 1.1 test case"""
    download_handler_cls = HTTP11DownloadHandler
    if 'http11' not in optional_features:
        skip = 'HTTP1.1 not supported in twisted < 11.1.0'

    def test_download_without_maxsize_limit(self):
        request = Request(self.getURL('file'))
        d = self.download_request(request, Spider('foo'))
        d.addCallback(lambda r: r.body)
        d.addCallback(self.assertEquals, "0123456789")
        return d

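    # download_maxsize can be set per spider (Spider attribute) or per
    # request (Request.meta); exceeding the limit aborts the download.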
    @defer.inlineCallbacks
    def test_download_with_maxsize(self):
        request = Request(self.getURL('file'))

        # 10 is the minimal size for this request; the limit is only applied
        # to the response body, regardless of headers
        d = self.download_request(request, Spider('foo', download_maxsize=10))
        d.addCallback(lambda r: r.body)
        d.addCallback(self.assertEquals, "0123456789")
        yield d

        d = self.download_request(request, Spider('foo', download_maxsize=9))
        yield self.assertFailure(d, defer.CancelledError, error.ConnectionAborted)

    @defer.inlineCallbacks
    def test_download_with_maxsize_per_req(self):
        meta = {'download_maxsize': 2}
        request = Request(self.getURL('file'), meta=meta)
        d = self.download_request(request, Spider('foo'))
        yield self.assertFailure(d, defer.CancelledError, error.ConnectionAborted)

    @defer.inlineCallbacks
    def test_download_with_small_maxsize_per_spider(self):
        request = Request(self.getURL('file'))
        d = self.download_request(request, Spider('foo', download_maxsize=2))
        yield self.assertFailure(d, defer.CancelledError, error.ConnectionAborted)

    def test_download_with_large_maxsize_per_spider(self):
        request = Request(self.getURL('file'))
        d = self.download_request(request, Spider('foo', download_maxsize=100))
        d.addCallback(lambda r: r.body)
        d.addCallback(self.assertEquals, "0123456789")
        return d


class Http11MockServerTestCase(unittest.TestCase):
    """HTTP 1.1 test case with MockServer"""
    if 'http11' not in optional_features:
        skip = 'HTTP1.1 not supported in twisted < 11.1.0'

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_download_with_content_length(self):
        crawler = get_crawler(SingleRequestSpider)
        # http://localhost:8998/partial sets Content-Length to 1024; with
        # download_maxsize=1000 the download must be aborted
        yield crawler.crawl(seed=Request(url='http://localhost:8998/partial', meta={'download_maxsize': 1000}))
        failure = crawler.spider.meta['failure']
        self.assertIsInstance(failure.value, defer.CancelledError)

    @defer.inlineCallbacks
    def test_download(self):
        crawler = get_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=Request(url='http://localhost:8998'))
        failure = crawler.spider.meta.get('failure')
        self.assertIsNone(failure)
        reason = crawler.spider.meta['close_reason']
        self.assertEqual(reason, 'finished')

    @defer.inlineCallbacks
    def test_download_gzip_response(self):
        crawler = get_crawler(SingleRequestSpider)
        body = '1'*100  # PayloadResource requires body length to be 100
        request = Request('http://localhost:8998/payload', method='POST',
                          body=body, meta={'download_maxsize': 50})
        yield crawler.crawl(seed=request)
        failure = crawler.spider.meta['failure']
        # download_maxsize < 100, hence the CancelledError
        self.assertIsInstance(failure.value, defer.CancelledError)

        request.headers.setdefault('Accept-Encoding', 'gzip,deflate')
        request = request.replace(url='http://localhost:8998/xpayload')
        yield crawler.crawl(seed=request)

        # download_maxsize = 50 is enough for the gzipped response
        failure = crawler.spider.meta.get('failure')
        self.assertIsNone(failure)
        reason = crawler.spider.meta['close_reason']
        self.assertEqual(reason, 'finished')


class UriResource(resource.Resource):
    """Return the full uri that was requested"""

    def getChild(self, path, request):
        return self

    def render(self, request):
        return request.uri


class HttpProxyTestCase(unittest.TestCase):
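    """Proxy tests against a local UriResource server: the resource echoes
    the requested URI, so the response body shows whether the request was
    sent proxy-style (absolute URL) or plain (path only)."""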

    download_handler_cls = HTTPDownloadHandler

    def setUp(self):
        site = server.Site(UriResource(), timeout=None)
        wrapper = WrappingFactory(site)
        self.port = reactor.listenTCP(0, wrapper, interface='127.0.0.1')
        self.portno = self.port.getHost().port
        self.download_handler = self.download_handler_cls(Settings())
        self.download_request = self.download_handler.download_request

    @defer.inlineCallbacks
    def tearDown(self):
        yield self.port.stopListening()
        if hasattr(self.download_handler, 'close'):
            yield self.download_handler.close()

    def getURL(self, path):
        return "http://127.0.0.1:%d/%s" % (self.portno, path)

    def test_download_with_proxy(self):
        def _test(response):
            self.assertEquals(response.status, 200)
            self.assertEquals(response.url, request.url)
            self.assertEquals(response.body, 'http://example.com')

        http_proxy = self.getURL('')
        request = Request('http://example.com', meta={'proxy': http_proxy})
        return self.download_request(request, Spider('foo')).addCallback(_test)

    def test_download_with_proxy_https_noconnect(self):
        def _test(response):
            self.assertEquals(response.status, 200)
            self.assertEquals(response.url, request.url)
            self.assertEquals(response.body, 'https://example.com')

        http_proxy = '%s?noconnect' % self.getURL('')
        request = Request('https://example.com', meta={'proxy': http_proxy})
        return self.download_request(request, Spider('foo')).addCallback(_test)

    def test_download_without_proxy(self):
        def _test(response):
            self.assertEquals(response.status, 200)
            self.assertEquals(response.url, request.url)
            self.assertEquals(response.body, '/path/to/resource')

        request = Request(self.getURL('path/to/resource'))
        return self.download_request(request, Spider('foo')).addCallback(_test)


class DeprecatedHttpProxyTestCase(unittest.TestCase):
    """Old deprecated reference to http10 downloader handler"""
    download_handler_cls = HttpDownloadHandler


class Http10ProxyTestCase(HttpProxyTestCase):
    download_handler_cls = HTTP10DownloadHandler


class Http11ProxyTestCase(HttpProxyTestCase):
    download_handler_cls = HTTP11DownloadHandler
    if 'http11' not in optional_features:
        skip = 'HTTP1.1 not supported in twisted < 11.1.0'


class HttpDownloadHandlerMock(object):
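    """Stand-in for an HTTP download handler: download_request() returns the
    request unchanged, letting S3TestCase inspect the signed headers."""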

    def __init__(self, settings):
        pass

    def download_request(self, request, spider):
        return request


class S3TestCase(unittest.TestCase):
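    """Check S3DownloadHandler request signing against the worked examples
    of the Amazon S3 developer guide (see the keys and comments below)."""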

    skip = 'boto' not in optional_features and 'missing boto library'

    # these tests use the same example keys as the Amazon developer guide
    # (http://s3.amazonaws.com/awsdocs/S3/20060301/s3-dg-20060301.pdf);
    # each test below reproduces one of the signing examples from that manual

    AWS_ACCESS_KEY_ID = '0PN5J17HBGZHT7JJ3X82'
    AWS_SECRET_ACCESS_KEY = 'uV3F3YluFJax1cknvbcGwgjvx4QpvB+leU8dUj2o'

    def setUp(self):
        s3reqh = S3DownloadHandler(Settings(), self.AWS_ACCESS_KEY_ID,
                                   self.AWS_SECRET_ACCESS_KEY,
                                   httpdownloadhandler=HttpDownloadHandlerMock)
        self.download_request = s3reqh.download_request
        self.spider = Spider('foo')

    def test_request_signing1(self):
        # gets an object from the johnsmith bucket.
        req = Request('s3://johnsmith/photos/puppy.jpg',
                      headers={'Date': 'Tue, 27 Mar 2007 19:36:42 +0000'})
        httpreq = self.download_request(req, self.spider)
        self.assertEqual(httpreq.headers['Authorization'],
                         'AWS 0PN5J17HBGZHT7JJ3X82:xXjDGYUmKxnwqr5KXNPGldn5LbA=')

    def test_request_signing2(self):
        # puts an object into the johnsmith bucket.
        req = Request('s3://johnsmith/photos/puppy.jpg', method='PUT', headers={
            'Content-Type': 'image/jpeg',
            'Date': 'Tue, 27 Mar 2007 21:15:45 +0000',
            'Content-Length': '94328',
        })
        httpreq = self.download_request(req, self.spider)
        self.assertEqual(httpreq.headers['Authorization'],
                         'AWS 0PN5J17HBGZHT7JJ3X82:hcicpDDvL9SsO6AkvxqmIWkmOuQ=')

    def test_request_signing3(self):
        # lists the content of the johnsmith bucket.
        req = Request('s3://johnsmith/?prefix=photos&max-keys=50&marker=puppy',
                      method='GET', headers={
                          'User-Agent': 'Mozilla/5.0',
                          'Date': 'Tue, 27 Mar 2007 19:42:41 +0000',
                      })
        httpreq = self.download_request(req, self.spider)
        self.assertEqual(httpreq.headers['Authorization'],
                         'AWS 0PN5J17HBGZHT7JJ3X82:jsRt/rhG+Vtp88HrYL706QhE4w4=')

    def test_request_signing4(self):
        # fetches the access control policy sub-resource for the 'johnsmith' bucket.
        req = Request('s3://johnsmith/?acl',
                      method='GET', headers={'Date': 'Tue, 27 Mar 2007 19:44:46 +0000'})
        httpreq = self.download_request(req, self.spider)
        self.assertEqual(httpreq.headers['Authorization'],
                         'AWS 0PN5J17HBGZHT7JJ3X82:thdUi9VAkzhkniLj96JIrOPGi0g=')

    def test_request_signing5(self):
        # deletes an object from the 'johnsmith' bucket using the
        # path-style and Date alternative.
        req = Request('s3://johnsmith/photos/puppy.jpg',
                      method='DELETE', headers={
                          'Date': 'Tue, 27 Mar 2007 21:20:27 +0000',
                          'x-amz-date': 'Tue, 27 Mar 2007 21:20:26 +0000',
                      })
        httpreq = self.download_request(req, self.spider)
        self.assertEqual(httpreq.headers['Authorization'],
                         'AWS 0PN5J17HBGZHT7JJ3X82:k3nL7gH3+PadhTEVn5Ip83xlYzk=')

    def test_request_signing6(self):
        # uploads an object to a CNAME style virtual hosted bucket with metadata.
        req = Request('s3://static.johnsmith.net:8080/db-backup.dat.gz',
                      method='PUT', headers={
                          'User-Agent': 'curl/7.15.5',
                          'Host': 'static.johnsmith.net:8080',
                          'Date': 'Tue, 27 Mar 2007 21:06:08 +0000',
                          'x-amz-acl': 'public-read',
                          'content-type': 'application/x-download',
                          'Content-MD5': '4gJE4saaMU4BqNR0kLY+lw==',
                          'X-Amz-Meta-ReviewedBy': 'joe@johnsmith.net,jane@johnsmith.net',
                          'X-Amz-Meta-FileChecksum': '0x02661779',
                          'X-Amz-Meta-ChecksumAlgorithm': 'crc32',
                          'Content-Disposition': 'attachment; filename=database.dat',
                          'Content-Encoding': 'gzip',
                          'Content-Length': '5913339',
                      })
        httpreq = self.download_request(req, self.spider)
        self.assertEqual(httpreq.headers['Authorization'],
                         'AWS 0PN5J17HBGZHT7JJ3X82:C0FlOtU8Ylb9KDTpZqYkZPX91iI=')


class FTPTestCase(unittest.TestCase):
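    """Exercise FTPDownloadHandler against an in-process Twisted FTP server."""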

    username = "scrapy"
    password = "passwd"

    if twisted_version < (10, 2, 0):
        skip = "Twisted pre 10.2.0 doesn't allow setting a home path other than /home"

    def setUp(self):
        # set up dirs and the test file
        self.directory = self.mktemp()
        os.mkdir(self.directory)
        userdir = os.path.join(self.directory, self.username)
        os.mkdir(userdir)
        FilePath(userdir).child('file.txt').setContent("I have the power!")

        # set up the server
        realm = FTPRealm(anonymousRoot=self.directory, userHome=self.directory)
        p = portal.Portal(realm)
        users_checker = checkers.InMemoryUsernamePasswordDatabaseDontUse()
        users_checker.addUser(self.username, self.password)
        p.registerChecker(users_checker, credentials.IUsernamePassword)
        self.factory = FTPFactory(portal=p)
        self.port = reactor.listenTCP(0, self.factory, interface="127.0.0.1")
        self.portNum = self.port.getHost().port
        self.download_handler = FTPDownloadHandler(Settings())
        self.addCleanup(self.port.stopListening)

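    # All tests funnel their deferreds through _add_test_callbacks, which
    # first closes the handler's FTP client connection so that assertions
    # run without a lingering connection left on the reactor.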
    def _add_test_callbacks(self, deferred, callback=None, errback=None):
        def _clean(data):
            self.download_handler.client.transport.loseConnection()
            return data
        deferred.addCallback(_clean)
        if callback:
            deferred.addCallback(callback)
        if errback:
            deferred.addErrback(errback)
        return deferred

    def test_ftp_download_success(self):
        request = Request(url="ftp://127.0.0.1:%s/file.txt" % self.portNum,
                          meta={"ftp_user": self.username, "ftp_password": self.password})
        d = self.download_handler.download_request(request, None)

        def _test(r):
            self.assertEqual(r.status, 200)
            self.assertEqual(r.body, 'I have the power!')
            self.assertEqual(r.headers, {'Local Filename': [''], 'Size': ['17']})
        return self._add_test_callbacks(d, _test)

    def test_ftp_download_notexist(self):
        request = Request(url="ftp://127.0.0.1:%s/notexist.txt" % self.portNum,
                          meta={"ftp_user": self.username, "ftp_password": self.password})
        d = self.download_handler.download_request(request, None)

        def _test(r):
            self.assertEqual(r.status, 404)
        return self._add_test_callbacks(d, _test)

    def test_ftp_local_filename(self):
        local_fname = "/tmp/file.txt"
        request = Request(url="ftp://127.0.0.1:%s/file.txt" % self.portNum,
                          meta={"ftp_user": self.username, "ftp_password": self.password,
                                "ftp_local_filename": local_fname})
        d = self.download_handler.download_request(request, None)

        def _test(r):
            self.assertEqual(r.body, local_fname)
            self.assertEqual(r.headers, {'Local Filename': ['/tmp/file.txt'], 'Size': ['17']})
            self.assertTrue(os.path.exists(local_fname))
            with open(local_fname) as f:
                self.assertEqual(f.read(), "I have the power!")
            os.remove(local_fname)
        return self._add_test_callbacks(d, _test)

    def test_invalid_credentials(self):
        request = Request(url="ftp://127.0.0.1:%s/file.txt" % self.portNum,
                          meta={"ftp_user": self.username, "ftp_password": 'invalid'})
        d = self.download_handler.download_request(request, None)

        def _test(r):
            self.assertEqual(r.type, ConnectionLost)
        return self._add_test_callbacks(d, errback=_test)