mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-25 09:43:46 +00:00
commit
26c7ec7d9f
104
scrapy/core/downloader/handlers/ftp.py
Normal file
104
scrapy/core/downloader/handlers/ftp.py
Normal file
@ -0,0 +1,104 @@
|
||||
"""
|
||||
An asynchronous FTP file download handler for scrapy which somehow emulates an http response.
|
||||
|
||||
FTP connection parameters are passed using the request meta field:
|
||||
- ftp_user (required)
|
||||
- ftp_password (required)
|
||||
- ftp_passive (by default, enabled) sets FTP connection passive mode
|
||||
- ftp_local_filename
|
||||
- If not given, file data will come in the response.body, as a normal scrapy Response,
|
||||
which implies that the entire file will be held in memory.
|
||||
- if given, file data will be saved in a local file with the given name
|
||||
This helps when downloading very big files to avoid memory issues. In addition, for
|
||||
convenience the local file name will also be given in the response body.
|
||||
|
||||
The status of the built HTTP-like response will be, by default
|
||||
- 200 in case of success
|
||||
- 404 in case specified file was not found in the server (ftp code 550)
|
||||
|
||||
or raise corresponding ftp exception otherwise
|
||||
|
||||
The mapping from server FTP command return codes to HTTP response codes is defined in the
|
||||
CODE_MAPPING attribute of the handler class. The key 'default' is used for any code
|
||||
that is not explicitly present among the map keys. You may need to overwrite this
|
||||
mapping if you want a behaviour different from the default.
|
||||
|
||||
For a response with status 200, response.headers will contain two keys:
|
||||
'Local Filename' - with the value of the local filename if given
|
||||
'Size' - with size of the downloaded data
|
||||
"""
|
||||
|
||||
import re
|
||||
from urlparse import urlparse
|
||||
from cStringIO import StringIO
|
||||
|
||||
from twisted.internet import reactor
|
||||
from twisted.protocols.ftp import FTPClient, CommandFailed
|
||||
from twisted.internet.protocol import Protocol, ClientCreator
|
||||
|
||||
from scrapy.http import Response
|
||||
from scrapy.responsetypes import responsetypes
|
||||
|
||||
class ReceivedDataProtocol(Protocol):
    """Protocol that stores the bytes received for a retrieved FTP file.

    Incoming data is written either to a local file (when ``filename`` is
    given) or to an in-memory ``StringIO`` buffer. The number of bytes
    received so far is kept in ``size``.
    """

    def __init__(self, filename=None):
        """Open the destination for incoming data.

        :param filename: path of a local file to write to; when falsy, the
            data is buffered in memory instead.
        """
        self.__filename = filename
        # The payload is raw bytes, so the file is opened in binary mode:
        # text mode may translate newlines and corrupt binary content on
        # platforms that distinguish text files from binary files.
        self.body = open(filename, "wb") if filename else StringIO()
        self.size = 0

    def dataReceived(self, data):
        """Append one chunk of received data and update the byte count."""
        self.body.write(data)
        self.size += len(data)

    @property
    def filename(self):
        """The local file name given at construction time (may be ``None``)."""
        return self.__filename

    def close(self):
        """Finish the transfer.

        A local file is closed; an in-memory buffer is rewound instead so
        that its contents can be read from the beginning.
        """
        if self.filename:
            self.body.close()
        else:
            # seek(0) rewinds the buffer exactly like cStringIO's reset(),
            # but is also available on every other file-like object.
            self.body.seek(0)
|
||||
|
||||
# First run of digits in an FTP error message, i.e. the server reply code.
# Raw string: "\d" is not a valid escape in a regular string literal.
_CODE_RE = re.compile(r"\d+")


class FTPDownloadHandler(object):
    """Download handler for FTP URLs that answers with HTTP-like responses.

    Connection parameters are read from ``request.meta``: ``ftp_user`` and
    ``ftp_password`` (both required), ``ftp_passive`` (defaults to 1) and
    ``ftp_local_filename`` (optional).
    """

    # Maps FTP reply codes (as strings) to the status of the built response.
    # "default" is used for any code without an explicit entry.
    CODE_MAPPING = {
        "550": 404,
        "default": 503,
    }

    def __init__(self, setting):
        """Accept the settings object; it is currently not used."""
        pass

    def download_request(self, request, spider):
        """Connect to the FTP server named in ``request.url`` and fetch the file.

        :param request: request whose URL gives host, port (21 when absent)
            and file path, and whose ``meta`` carries the FTP parameters.
        :param spider: not used by this handler.
        :returns: the Deferred of the connection attempt, chained with
            :meth:`gotClient`.
        :raises KeyError: if ``ftp_user`` or ``ftp_password`` is missing
            from ``request.meta``.
        """
        parsed_url = urlparse(request.url)
        creator = ClientCreator(reactor, FTPClient, request.meta["ftp_user"],
                                request.meta["ftp_password"],
                                passive=request.meta.get("ftp_passive", 1))
        connecting = creator.connectTCP(parsed_url.hostname,
                                        parsed_url.port or 21)
        return connecting.addCallback(self.gotClient, request, parsed_url.path)

    def gotClient(self, client, request, filepath):
        """Start retrieving ``filepath`` once the FTP client is connected.

        :returns: the retrieval Deferred, which ends in either
            :meth:`_build_response` (success) or :meth:`_failed` (error).
        """
        # Kept on the handler so that callers can reach the connection
        # afterwards (e.g. ``handler.client.transport``). Every call
        # overwrites the previous value.
        self.client = client
        protocol = ReceivedDataProtocol(request.meta.get("ftp_local_filename"))
        retrieving = client.retrieveFile(filepath, protocol)
        # Close the protocol first when the retrieval fails, so that a local
        # file opened for the download does not stay open. The failure is
        # passed on unchanged to the errback registered below.
        retrieving.addErrback(self._release_protocol, protocol)
        return retrieving.addCallbacks(callback=self._build_response,
                                       callbackArgs=(request, protocol),
                                       errback=self._failed,
                                       errbackArgs=(request,))

    def _release_protocol(self, failure, protocol):
        """Close ``protocol`` after a failed retrieval and pass ``failure`` on."""
        protocol.close()
        return failure

    def _build_response(self, result, request, protocol):
        """Build the status-200 response for a completed retrieval.

        The body is the local file name when one was given, otherwise the
        downloaded data itself. The headers carry the local file name (or an
        empty string) and the number of bytes received.
        """
        self.result = result
        respcls = responsetypes.from_args(url=request.url)
        # Closes the local file, or rewinds the in-memory buffer so that the
        # read() below starts at the beginning.
        protocol.close()
        body = protocol.filename or protocol.body.read()
        headers = {"local filename": protocol.filename or '', "size": protocol.size}
        return respcls(url=request.url, status=200, body=body, headers=headers)

    def _failed(self, result, request):
        """Turn a failed FTP command into a response, or propagate the failure.

        :param result: the failure describing the error.
        :returns: a ``Response`` whose status comes from ``CODE_MAPPING``
            when the failure is a ``CommandFailed`` whose message contains a
            reply code; otherwise the original failure, unchanged.
        """
        message = result.getErrorMessage()
        if result.type == CommandFailed:
            m = _CODE_RE.search(message)
            if m:
                ftpcode = m.group()
                httpcode = self.CODE_MAPPING.get(ftpcode, self.CODE_MAPPING["default"])
                return Response(url=request.url, status=httpcode, body=message)
        # Returning the failure hands the original exception, with its
        # traceback, to the next errback. Re-instantiating the exception
        # type around the old value would wrap it and lose that context.
        return result
|
||||
|
@ -56,6 +56,7 @@ DOWNLOAD_HANDLERS_BASE = {
|
||||
'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
|
||||
'https': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
|
||||
's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
|
||||
'ftp': 'scrapy.core.downloader.handlers.ftp.FTPDownloadHandler',
|
||||
}
|
||||
|
||||
DOWNLOAD_TIMEOUT = 180 # 3mins
|
||||
|
@ -9,6 +9,9 @@ from twisted.web import server, static, util, resource
|
||||
from twisted.web.test.test_webclient import ForeverTakingResource, \
|
||||
NoLengthResource, HostHeaderResource, \
|
||||
PayloadResource, BrokenDownloadResource
|
||||
from twisted.protocols.ftp import FTPRealm, FTPFactory
|
||||
from twisted.cred import portal, checkers, credentials
|
||||
from twisted.protocols.ftp import FTPClient, ConnectionLost
|
||||
from w3lib.url import path_to_file_uri
|
||||
|
||||
from scrapy.core.downloader.handlers.file import FileDownloadHandler
|
||||
@ -16,6 +19,8 @@ from scrapy.core.downloader.handlers.http import HTTPDownloadHandler, HttpDownlo
|
||||
from scrapy.core.downloader.handlers.http10 import HTTP10DownloadHandler
|
||||
from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler
|
||||
from scrapy.core.downloader.handlers.s3 import S3DownloadHandler
|
||||
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler
|
||||
|
||||
from scrapy.spider import BaseSpider
|
||||
from scrapy.http import Request
|
||||
from scrapy.settings import Settings
|
||||
@ -326,3 +331,83 @@ class S3TestCase(unittest.TestCase):
|
||||
httpreq = self.download_request(req, self.spider)
|
||||
self.assertEqual(httpreq.headers['Authorization'], \
|
||||
'AWS 0PN5J17HBGZHT7JJ3X82:C0FlOtU8Ylb9KDTpZqYkZPX91iI=')
|
||||
|
||||
class FTPTestCase(unittest.TestCase):
    """Exercise FTPDownloadHandler against an FTP server started in-process."""

    username = "scrapy"
    password = "passwd"

    def setUp(self):
        # setup dirs and test file
        self.directory = self.mktemp()
        os.mkdir(self.directory)
        userdir = os.path.join(self.directory, self.username)
        os.mkdir(userdir)
        FilePath(userdir).child('file.txt').setContent("I have the power!")

        # setup server
        realm = FTPRealm(anonymousRoot=self.directory, userHome=self.directory)
        p = portal.Portal(realm)
        users_checker = checkers.InMemoryUsernamePasswordDatabaseDontUse()
        users_checker.addUser(self.username, self.password)
        p.registerChecker(users_checker, credentials.IUsernamePassword)
        self.factory = FTPFactory(portal=p)
        # Port 0: listen on whichever free port is assigned, then read it back.
        self.port = reactor.listenTCP(0, self.factory, interface="127.0.0.1")
        self.portNum = self.port.getHost().port
        self.download_handler = FTPDownloadHandler(Settings())
        self.addCleanup(self.port.stopListening)

    def _add_test_callbacks(self, deferred, callback=None, errback=None):
        """Chain connection cleanup and the optional test callbacks onto ``deferred``.

        The cleanup step is added as a callback, so it runs only when the
        download succeeded. It drops the handler's FTP connection before the
        assertions are made.
        """
        def _clean(data):
            self.download_handler.client.transport.loseConnection()
            return data
        deferred.addCallback(_clean)
        if callback:
            deferred.addCallback(callback)
        if errback:
            deferred.addErrback(errback)
        return deferred

    def test_ftp_download_success(self):
        request = Request(url="ftp://127.0.0.1:%s/file.txt" % self.portNum,
                          meta={"ftp_user": self.username, "ftp_password": self.password})
        d = self.download_handler.download_request(request, None)

        def _test(r):
            self.assertEqual(r.status, 200)
            self.assertEqual(r.body, 'I have the power!')
            self.assertEqual(r.headers, {'Local Filename': [''], 'Size': [17]})
        return self._add_test_callbacks(d, _test)

    def test_ftp_download_notexist(self):
        request = Request(url="ftp://127.0.0.1:%s/notexist.txt" % self.portNum,
                          meta={"ftp_user": self.username, "ftp_password": self.password})
        d = self.download_handler.download_request(request, None)

        def _test(r):
            self.assertEqual(r.status, 404)
        return self._add_test_callbacks(d, _test)

    def test_ftp_local_filename(self):
        # A fresh per-test path rather than a fixed "/tmp/file.txt": the fixed
        # path does not exist on every platform and could collide with another
        # test run or overwrite an unrelated file.
        local_fname = self.mktemp()
        request = Request(url="ftp://127.0.0.1:%s/file.txt" % self.portNum,
                          meta={"ftp_user": self.username, "ftp_password": self.password,
                                "ftp_local_filename": local_fname})
        d = self.download_handler.download_request(request, None)

        def _test(r):
            self.assertEqual(r.body, local_fname)
            self.assertEqual(r.headers, {'Local Filename': [local_fname], 'Size': [17]})
            self.assertTrue(os.path.exists(local_fname))
            with open(local_fname) as f:
                self.assertEqual(f.read(), "I have the power!")
            os.remove(local_fname)
        return self._add_test_callbacks(d, _test)

    def test_invalid_credentials(self):
        request = Request(url="ftp://127.0.0.1:%s/file.txt" % self.portNum,
                          meta={"ftp_user": self.username, "ftp_password": 'invalid'})
        d = self.download_handler.download_request(request, None)

        def _test(r):
            self.assertEqual(r.type, ConnectionLost)
        return self._add_test_callbacks(d, errback=_test)
|
||||
|
Loading…
x
Reference in New Issue
Block a user