1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 22:04:16 +00:00
scrapy/tests/test_downloader_handlers.py

582 lines
23 KiB
Python
Raw Normal View History

import os
import twisted
2014-11-25 14:09:51 +01:00
import six
from twisted.trial import unittest
from twisted.protocols.policies import WrappingFactory
from twisted.python.filepath import FilePath
2012-04-25 12:45:28 -03:00
from twisted.internet import reactor, defer, error
from twisted.web import server, static, util, resource
from twisted.web.test.test_webclient import ForeverTakingResource, \
NoLengthResource, HostHeaderResource, \
PayloadResource, BrokenDownloadResource
2013-06-27 16:52:59 -02:00
from twisted.protocols.ftp import FTPRealm, FTPFactory
from twisted.cred import portal, checkers, credentials
from twisted.protocols.ftp import FTPClient, ConnectionLost
from w3lib.url import path_to_file_uri
from scrapy import twisted_version
2014-03-12 22:32:48 -03:00
from scrapy.core.downloader.handlers import DownloadHandlers
from scrapy.core.downloader.handlers.file import FileDownloadHandler
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler, HttpDownloadHandler
from scrapy.core.downloader.handlers.http10 import HTTP10DownloadHandler
from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler
from scrapy.core.downloader.handlers.s3 import S3DownloadHandler
2013-06-27 16:52:59 -02:00
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler
from scrapy.spider import Spider
from scrapy.http import Request
from scrapy.settings import Settings
from scrapy import optional_features
2014-03-12 22:32:48 -03:00
from scrapy.utils.test import get_crawler
from scrapy.exceptions import NotConfigured
from tests.mockserver import MockServer
from tests.spiders import SingleRequestSpider
2014-03-12 22:32:48 -03:00
class DummyDH(object):
    """Stand-in download handler whose constructor always succeeds."""

    def __init__(self, crawler):
        # Nothing to configure: the crawler argument is accepted and ignored.
        pass
class OffDH(object):
    """Stand-in download handler whose constructor always raises NotConfigured."""

    def __init__(self, crawler):
        # The crawler argument is never inspected; construction fails unconditionally.
        raise NotConfigured()
class LoadTestCase(unittest.TestCase):
    """Check where DownloadHandlers records the handlers named in DOWNLOAD_HANDLERS."""

    def _load_handlers(self, handlers):
        """Return a DownloadHandlers built from a crawler configured with *handlers*."""
        crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handlers})
        return DownloadHandlers(crawler)

    def test_enabled_handler(self):
        # DummyDH constructs without error.
        dh = self._load_handlers(
            {'scheme': 'tests.test_downloader_handlers.DummyDH'})
        self.assertIn('scheme', dh._handlers)
        self.assertNotIn('scheme', dh._notconfigured)

    def test_not_configured_handler(self):
        # OffDH raises NotConfigured from its constructor.
        dh = self._load_handlers(
            {'scheme': 'tests.test_downloader_handlers.OffDH'})
        self.assertNotIn('scheme', dh._handlers)
        self.assertIn('scheme', dh._notconfigured)

    def test_disabled_handler(self):
        # A scheme mapped to None must show up in neither collection.
        dh = self._load_handlers({'scheme': None})
        self.assertNotIn('scheme', dh._handlers)
        self.assertNotIn('scheme', dh._notconfigured)
class FileTestCase(unittest.TestCase):
    """Exercise FileDownloadHandler against paths on the local filesystem."""

    def setUp(self):
        self.tmpname = self.mktemp()
        # The trailing '^' must be percent-encoded in the file URI (see the
        # '%5E' check in test_download).
        with open(self.tmpname + '^', 'w') as fd:
            fd.write('0123456789')
        self.download_request = FileDownloadHandler(Settings()).download_request

    def test_download(self):
        request = Request(path_to_file_uri(self.tmpname + '^'))
        self.assertTrue(request.url.upper().endswith('%5E'))

        def _test(response):
            self.assertEqual(response.url, request.url)
            self.assertEqual(response.status, 200)
            self.assertEqual(response.body, '0123456789')

        return self.download_request(request, Spider('foo')).addCallback(_test)

    def test_non_existent(self):
        # mktemp() only reserves a name, so nothing exists at this path.
        request = Request('file://%s' % self.mktemp())
        d = self.download_request(request, Spider('foo'))
        return self.assertFailure(d, IOError)
class HttpTestCase(unittest.TestCase):
    """Download from a local twisted.web site through ``download_handler_cls``.

    ``setUp`` serves a temporary directory holding a ten-byte ``file`` plus a
    set of helper resources on an ephemeral port of 127.0.0.1, and builds the
    handler under test from ``download_handler_cls``.
    """

    download_handler_cls = HTTPDownloadHandler

    def setUp(self):
        name = self.mktemp()
        os.mkdir(name)
        FilePath(name).child("file").setContent("0123456789")
        r = static.File(name)
        r.putChild("redirect", util.Redirect("/file"))
        r.putChild("wait", ForeverTakingResource())
        r.putChild("hang-after-headers", ForeverTakingResource(write=True))
        r.putChild("nolength", NoLengthResource())
        r.putChild("host", HostHeaderResource())
        r.putChild("payload", PayloadResource())
        r.putChild("broken", BrokenDownloadResource())
        self.site = server.Site(r, timeout=None)
        self.wrapper = WrappingFactory(self.site)
        # Port 0 lets the OS pick a free port; the real number is read back below.
        self.port = reactor.listenTCP(0, self.wrapper, interface='127.0.0.1')
        self.portno = self.port.getHost().port
        self.download_handler = self.download_handler_cls(Settings())
        self.download_request = self.download_handler.download_request

    @defer.inlineCallbacks
    def tearDown(self):
        yield self.port.stopListening()
        # Not every handler exposes close(); call it only when present.
        if hasattr(self.download_handler, 'close'):
            yield self.download_handler.close()

    def getURL(self, path):
        """Return the absolute URL of *path* on the test server."""
        return "http://127.0.0.1:%d/%s" % (self.portno, path)

    def test_download(self):
        request = Request(self.getURL('file'))
        d = self.download_request(request, Spider('foo'))
        d.addCallback(lambda r: r.body)
        d.addCallback(self.assertEqual, "0123456789")
        return d

    def test_download_head(self):
        request = Request(self.getURL('file'), method='HEAD')
        d = self.download_request(request, Spider('foo'))
        d.addCallback(lambda r: r.body)
        d.addCallback(self.assertEqual, '')
        return d

    def test_redirect_status(self):
        request = Request(self.getURL('redirect'))
        d = self.download_request(request, Spider('foo'))
        d.addCallback(lambda r: r.status)
        d.addCallback(self.assertEqual, 302)
        return d

    def test_redirect_status_head(self):
        request = Request(self.getURL('redirect'), method='HEAD')
        d = self.download_request(request, Spider('foo'))
        d.addCallback(lambda r: r.status)
        d.addCallback(self.assertEqual, 302)
        return d

    @defer.inlineCallbacks
    def test_timeout_download_from_spider(self):
        spider = Spider('foo')
        meta = {'download_timeout': 0.2}
        # client connects but no data is received
        request = Request(self.getURL('wait'), meta=meta)
        d = self.download_request(request, spider)
        yield self.assertFailure(d, defer.TimeoutError, error.TimeoutError)
        # client connects, server send headers and some body bytes but hangs
        request = Request(self.getURL('hang-after-headers'), meta=meta)
        d = self.download_request(request, spider)
        yield self.assertFailure(d, defer.TimeoutError, error.TimeoutError)

    def test_host_header_not_in_request_headers(self):
        request = Request(self.getURL('host'))

        def _test(response):
            self.assertEqual(response.body, '127.0.0.1:%d' % self.portno)
            # Downloading must not add headers to the caller's request object.
            self.assertEqual(request.headers, {})

        return self.download_request(request, Spider('foo')).addCallback(_test)

    def test_host_header_seted_in_request_headers(self):
        request = Request(self.getURL('host'), headers={'Host': 'example.com'})

        def _test(response):
            self.assertEqual(response.body, 'example.com')
            self.assertEqual(request.headers.get('Host'), 'example.com')

        return self.download_request(request, Spider('foo')).addCallback(_test)

    def test_payload(self):
        body = '1'*100  # PayloadResource requires body length to be 100
        request = Request(self.getURL('payload'), method='POST', body=body)
        d = self.download_request(request, Spider('foo'))
        d.addCallback(lambda r: r.body)
        d.addCallback(self.assertEqual, body)
        return d
class DeprecatedHttpTestCase(HttpTestCase):
    """Run the HttpTestCase suite through the HttpDownloadHandler name."""

    download_handler_cls = HttpDownloadHandler
class Http10TestCase(HttpTestCase):
    """Run the HttpTestCase suite through HTTP10DownloadHandler."""

    download_handler_cls = HTTP10DownloadHandler
class Http11TestCase(HttpTestCase):
    """Run the HttpTestCase suite, plus download_maxsize checks, through
    HTTP11DownloadHandler.

    The served ``file`` resource has a ten-byte body, which is what the
    maxsize limits below are compared against.
    """

    download_handler_cls = HTTP11DownloadHandler

    if 'http11' not in optional_features:
        skip = 'HTTP1.1 not supported in twisted < 11.1.0'

    def test_download_without_maxsize_limit(self):
        dfd = self.download_request(Request(self.getURL('file')), Spider('foo'))
        dfd.addCallback(lambda response: response.body)
        dfd.addCallback(self.assertEquals, "0123456789")
        return dfd

    @defer.inlineCallbacks
    def test_download_with_maxsize(self):
        request = Request(self.getURL('file'))

        # 10 is minimal size for this request and the limit is only counted on
        # response body. (regardless of headers)
        exact_fit = Spider('foo', download_maxsize=10)
        dfd = self.download_request(request, exact_fit)
        dfd.addCallback(lambda response: response.body)
        dfd.addCallback(self.assertEquals, "0123456789")
        yield dfd

        # One byte less than the body: the download must be aborted.
        too_small = Spider('foo', download_maxsize=9)
        dfd = self.download_request(request, too_small)
        yield self.assertFailure(dfd, defer.CancelledError, error.ConnectionAborted)

    @defer.inlineCallbacks
    def test_download_with_maxsize_per_req(self):
        # The limit comes from the request meta rather than from the spider.
        request = Request(self.getURL('file'), meta={'download_maxsize': 2})
        dfd = self.download_request(request, Spider('foo'))
        yield self.assertFailure(dfd, defer.CancelledError, error.ConnectionAborted)

    @defer.inlineCallbacks
    def test_download_with_small_maxsize_per_spider(self):
        spider = Spider('foo', download_maxsize=2)
        dfd = self.download_request(Request(self.getURL('file')), spider)
        yield self.assertFailure(dfd, defer.CancelledError, error.ConnectionAborted)

    def test_download_with_large_maxsize_per_spider(self):
        spider = Spider('foo', download_maxsize=100)
        dfd = self.download_request(Request(self.getURL('file')), spider)
        dfd.addCallback(lambda response: response.body)
        dfd.addCallback(self.assertEquals, "0123456789")
        return dfd
class Http11MockServerTestCase(unittest.TestCase):
    """HTTP 1.1 test case with MockServer"""

    if 'http11' not in optional_features:
        skip = 'HTTP1.1 not supported in twisted < 11.1.0'

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    def _assert_finished_without_failure(self, crawler):
        """Assert the last crawl recorded no failure and closed as 'finished'."""
        self.assertIsNone(crawler.spider.meta.get('failure'))
        # assertEqual, not assertTrue: assertTrue(reason, 'finished') would
        # treat 'finished' as the failure message and accept any truthy reason.
        self.assertEqual(crawler.spider.meta['close_reason'], 'finished')

    @defer.inlineCallbacks
    def test_download_with_content_length(self):
        crawler = get_crawler(SingleRequestSpider)
        # http://localhost:8998/partial set Content-Length to 1024, use download_maxsize= 1000 to avoid
        # download it
        yield crawler.crawl(seed=Request(url='http://localhost:8998/partial', meta={'download_maxsize': 1000}))
        failure = crawler.spider.meta['failure']
        self.assertIsInstance(failure.value, defer.CancelledError)

    @defer.inlineCallbacks
    def test_download(self):
        crawler = get_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=Request(url='http://localhost:8998'))
        self._assert_finished_without_failure(crawler)

    @defer.inlineCallbacks
    def test_download_gzip_response(self):
        if not (six.PY2 and twisted_version > (12, 3, 0)):
            raise unittest.SkipTest("xpayload and payload endpoint only enabled for twisted > 12.3.0 and python 2.x")

        crawler = get_crawler(SingleRequestSpider)
        body = '1'*100  # PayloadResource requires body length to be 100
        request = Request('http://localhost:8998/payload', method='POST', body=body, meta={'download_maxsize': 50})
        yield crawler.crawl(seed=request)
        failure = crawler.spider.meta['failure']
        # download_maxsize < 100, hence the CancelledError
        self.assertIsInstance(failure.value, defer.CancelledError)

        request.headers.setdefault('Accept-Encoding', 'gzip,deflate')
        request = request.replace(url='http://localhost:8998/xpayload')
        yield crawler.crawl(seed=request)
        # download_maxsize = 50 is enough for the gzipped response
        self._assert_finished_without_failure(crawler)
class UriResource(resource.Resource):
    """Resource that answers every request with the URI that was requested."""

    def getChild(self, path, request):
        # Every child lookup resolves to this same resource.
        return self

    def render(self, request):
        requested_uri = request.uri
        return requested_uri
class HttpProxyTestCase(unittest.TestCase):
    """Download through ``download_handler_cls`` against a local server that
    echoes the request URI, so the response body shows what was requested.
    """

    download_handler_cls = HTTPDownloadHandler

    def setUp(self):
        echo_site = server.Site(UriResource(), timeout=None)
        self.port = reactor.listenTCP(
            0, WrappingFactory(echo_site), interface='127.0.0.1')
        self.portno = self.port.getHost().port
        self.download_handler = self.download_handler_cls(Settings())
        self.download_request = self.download_handler.download_request

    @defer.inlineCallbacks
    def tearDown(self):
        yield self.port.stopListening()
        # Not every handler exposes close(); call it only when present.
        if hasattr(self.download_handler, 'close'):
            yield self.download_handler.close()

    def getURL(self, path):
        """Return the absolute URL of *path* on the local echo server."""
        return "http://127.0.0.1:%d/%s" % (self.portno, path)

    def _download_and_check(self, request, expected_body):
        """Download *request* and check for a 200 whose body is *expected_body*."""
        def _verify(response):
            self.assertEquals(response.status, 200)
            self.assertEquals(response.url, request.url)
            self.assertEquals(response.body, expected_body)

        return self.download_request(request, Spider('foo')).addCallback(_verify)

    def test_download_with_proxy(self):
        proxy_url = self.getURL('')
        request = Request('http://example.com', meta={'proxy': proxy_url})
        return self._download_and_check(request, 'http://example.com')

    def test_download_with_proxy_https_noconnect(self):
        proxy_url = '%s?noconnect' % self.getURL('')
        request = Request('https://example.com', meta={'proxy': proxy_url})
        return self._download_and_check(request, 'https://example.com')

    def test_download_without_proxy(self):
        request = Request(self.getURL('path/to/resource'))
        return self._download_and_check(request, '/path/to/resource')
class DeprecatedHttpProxyTestCase(HttpProxyTestCase):
    """Old deprecated reference to http10 downloader handler"""

    # Inherit from HttpProxyTestCase (not bare unittest.TestCase) so the proxy
    # tests actually run against this handler; with no inherited tests the
    # class would be collected but exercise nothing.
    download_handler_cls = HttpDownloadHandler
class Http10ProxyTestCase(HttpProxyTestCase):
    """Run the HttpProxyTestCase suite through HTTP10DownloadHandler."""

    download_handler_cls = HTTP10DownloadHandler
class Http11ProxyTestCase(HttpProxyTestCase):
    """Run the HttpProxyTestCase suite through HTTP11DownloadHandler."""

    download_handler_cls = HTTP11DownloadHandler

    # Skipped as a whole when the 'http11' optional feature is unavailable.
    if 'http11' not in optional_features:
        skip = 'HTTP1.1 not supported in twisted < 11.1.0'
class HttpDownloadHandlerMock(object):
    """Fake HTTP handler that hands the request straight back to the caller."""

    def __init__(self, settings):
        # Settings are accepted for signature compatibility and ignored.
        pass

    def download_request(self, request, spider):
        """Return *request* unchanged; *spider* is not used."""
        return request
class S3TestCase(unittest.TestCase):
    """Check the Authorization header S3DownloadHandler puts on outgoing requests.

    The handler is built with HttpDownloadHandlerMock as its HTTP handler,
    so ``download_request`` returns the request it would have sent and its
    headers can be inspected directly.
    """

    if 'boto' not in optional_features:
        skip = 'missing boto library'

    # test use same example keys than amazon developer guide
    # http://s3.amazonaws.com/awsdocs/S3/20060301/s3-dg-20060301.pdf
    # and the tests described here are the examples from that manual
    AWS_ACCESS_KEY_ID = '0PN5J17HBGZHT7JJ3X82'
    AWS_SECRET_ACCESS_KEY = 'uV3F3YluFJax1cknvbcGwgjvx4QpvB+leU8dUj2o'

    def setUp(self):
        s3reqh = S3DownloadHandler(Settings(), self.AWS_ACCESS_KEY_ID,
                                   self.AWS_SECRET_ACCESS_KEY,
                                   httpdownloadhandler=HttpDownloadHandlerMock)
        self.download_request = s3reqh.download_request
        self.spider = Spider('foo')

    def _assert_authorization(self, req, expected):
        """Pass *req* through the handler and compare its Authorization header."""
        httpreq = self.download_request(req, self.spider)
        self.assertEqual(httpreq.headers['Authorization'], expected)

    def test_request_signing1(self):
        # gets an object from the johnsmith bucket.
        req = Request('s3://johnsmith/photos/puppy.jpg',
                      headers={'Date': 'Tue, 27 Mar 2007 19:36:42 +0000'})
        self._assert_authorization(
            req, 'AWS 0PN5J17HBGZHT7JJ3X82:xXjDGYUmKxnwqr5KXNPGldn5LbA=')

    def test_request_signing2(self):
        # puts an object into the johnsmith bucket.
        req = Request('s3://johnsmith/photos/puppy.jpg', method='PUT', headers={
            'Content-Type': 'image/jpeg',
            'Date': 'Tue, 27 Mar 2007 21:15:45 +0000',
            'Content-Length': '94328',
        })
        self._assert_authorization(
            req, 'AWS 0PN5J17HBGZHT7JJ3X82:hcicpDDvL9SsO6AkvxqmIWkmOuQ=')

    def test_request_signing3(self):
        # lists the content of the johnsmith bucket.
        req = Request('s3://johnsmith/?prefix=photos&max-keys=50&marker=puppy',
                      method='GET', headers={
                          'User-Agent': 'Mozilla/5.0',
                          'Date': 'Tue, 27 Mar 2007 19:42:41 +0000',
                      })
        self._assert_authorization(
            req, 'AWS 0PN5J17HBGZHT7JJ3X82:jsRt/rhG+Vtp88HrYL706QhE4w4=')

    def test_request_signing4(self):
        # fetches the access control policy sub-resource for the 'johnsmith' bucket.
        req = Request('s3://johnsmith/?acl',
                      method='GET', headers={'Date': 'Tue, 27 Mar 2007 19:44:46 +0000'})
        self._assert_authorization(
            req, 'AWS 0PN5J17HBGZHT7JJ3X82:thdUi9VAkzhkniLj96JIrOPGi0g=')

    def test_request_signing5(self):
        # deletes an object from the 'johnsmith' bucket using the
        # path-style and Date alternative.
        req = Request('s3://johnsmith/photos/puppy.jpg',
                      method='DELETE', headers={
                          'Date': 'Tue, 27 Mar 2007 21:20:27 +0000',
                          'x-amz-date': 'Tue, 27 Mar 2007 21:20:26 +0000',
                      })
        self._assert_authorization(
            req, 'AWS 0PN5J17HBGZHT7JJ3X82:k3nL7gH3+PadhTEVn5Ip83xlYzk=')

    def test_request_signing6(self):
        # uploads an object to a CNAME style virtual hosted bucket with metadata.
        req = Request('s3://static.johnsmith.net:8080/db-backup.dat.gz',
                      method='PUT', headers={
                          'User-Agent': 'curl/7.15.5',
                          'Host': 'static.johnsmith.net:8080',
                          'Date': 'Tue, 27 Mar 2007 21:06:08 +0000',
                          'x-amz-acl': 'public-read',
                          'content-type': 'application/x-download',
                          'Content-MD5': '4gJE4saaMU4BqNR0kLY+lw==',
                          'X-Amz-Meta-ReviewedBy': 'joe@johnsmith.net,jane@johnsmith.net',
                          'X-Amz-Meta-FileChecksum': '0x02661779',
                          'X-Amz-Meta-ChecksumAlgorithm': 'crc32',
                          'Content-Disposition': 'attachment; filename=database.dat',
                          'Content-Encoding': 'gzip',
                          'Content-Length': '5913339',
                      })
        self._assert_authorization(
            req, 'AWS 0PN5J17HBGZHT7JJ3X82:C0FlOtU8Ylb9KDTpZqYkZPX91iI=')

    def test_request_signing7(self):
        # ensure that spaces are quoted properly before signing
        req = Request(
            ("s3://johnsmith/photos/my puppy.jpg"
             "?response-content-disposition=my puppy.jpg"),
            method='GET',
            headers={
                'Date': 'Tue, 27 Mar 2007 19:42:41 +0000',
            })
        self._assert_authorization(
            req, 'AWS 0PN5J17HBGZHT7JJ3X82:+CfvG8EZ3YccOrRVMXNaK2eKZmM=')
2013-06-27 16:52:59 -02:00
class FTPTestCase(unittest.TestCase):
    """Exercise FTPDownloadHandler against a local twisted FTP server.

    ``setUp`` starts an FTPFactory on an ephemeral port of 127.0.0.1 with one
    in-memory user (``username`` / ``password``) and a ``file.txt`` placed in
    that user's directory.
    """

    username = "scrapy"
    password = "passwd"

    if twisted_version < (10, 2, 0):
        skip = "Twisted pre 10.2.0 doesn't allow to set home path other than /home"

    def setUp(self):
        # setup dirs and test file
        self.directory = self.mktemp()
        os.mkdir(self.directory)
        userdir = os.path.join(self.directory, self.username)
        os.mkdir(userdir)
        FilePath(userdir).child('file.txt').setContent("I have the power!")

        # setup server
        realm = FTPRealm(anonymousRoot=self.directory, userHome=self.directory)
        p = portal.Portal(realm)
        users_checker = checkers.InMemoryUsernamePasswordDatabaseDontUse()
        users_checker.addUser(self.username, self.password)
        p.registerChecker(users_checker, credentials.IUsernamePassword)
        self.factory = FTPFactory(portal=p)
        # Port 0 lets the OS pick a free port; the real number is read back below.
        self.port = reactor.listenTCP(0, self.factory, interface="127.0.0.1")
        self.portNum = self.port.getHost().port
        self.download_handler = FTPDownloadHandler(Settings())
        self.addCleanup(self.port.stopListening)

    def _add_test_callbacks(self, deferred, callback=None, errback=None):
        """Chain connection cleanup, then the optional *callback* / *errback*.

        Returns *deferred* so a test can return it directly.
        """
        def _clean(data):
            # Drop the client connection before the test's own checks run.
            self.download_handler.client.transport.loseConnection()
            return data

        deferred.addCallback(_clean)
        if callback:
            deferred.addCallback(callback)
        if errback:
            deferred.addErrback(errback)
        return deferred

    def test_ftp_download_success(self):
        request = Request(url="ftp://127.0.0.1:%s/file.txt" % self.portNum,
                          meta={"ftp_user": self.username, "ftp_password": self.password})
        d = self.download_handler.download_request(request, None)

        def _test(r):
            self.assertEqual(r.status, 200)
            self.assertEqual(r.body, 'I have the power!')
            self.assertEqual(r.headers, {'Local Filename': [''], 'Size': ['17']})

        return self._add_test_callbacks(d, _test)

    def test_ftp_download_notexist(self):
        request = Request(url="ftp://127.0.0.1:%s/notexist.txt" % self.portNum,
                          meta={"ftp_user": self.username, "ftp_password": self.password})
        d = self.download_handler.download_request(request, None)

        def _test(r):
            self.assertEqual(r.status, 404)

        return self._add_test_callbacks(d, _test)

    def test_ftp_local_filename(self):
        local_fname = "/tmp/file.txt"
        request = Request(url="ftp://127.0.0.1:%s/file.txt" % self.portNum,
                          meta={"ftp_user": self.username, "ftp_password": self.password,
                                "ftp_local_filename": local_fname})
        d = self.download_handler.download_request(request, None)

        def _test(r):
            try:
                self.assertEqual(r.body, local_fname)
                self.assertEqual(r.headers, {'Local Filename': [local_fname], 'Size': ['17']})
                self.assertTrue(os.path.exists(local_fname))
                with open(local_fname) as f:
                    self.assertEqual(f.read(), "I have the power!")
            finally:
                # Remove the downloaded file even when an assertion above
                # fails, so a failed run does not leave it behind.
                if os.path.exists(local_fname):
                    os.remove(local_fname)

        return self._add_test_callbacks(d, _test)

    def test_invalid_credentials(self):
        request = Request(url="ftp://127.0.0.1:%s/file.txt" % self.portNum,
                          meta={"ftp_user": self.username, "ftp_password": 'invalid'})
        d = self.download_handler.download_request(request, None)

        def _test(r):
            # r is the failure passed to the errback.
            self.assertEqual(r.type, ConnectionLost)

        return self._add_test_callbacks(d, errback=_test)