1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-25 09:43:46 +00:00

Merge pull request #329 from kalessin/ftphandler

added ftp handler
This commit is contained in:
Daniel Graña 2013-07-15 16:39:46 -07:00
commit 26c7ec7d9f
3 changed files with 190 additions and 0 deletions

View File

@ -0,0 +1,104 @@
"""
An asynchronous FTP file download handler for Scrapy that emulates an HTTP response.
FTP connection parameters are passed using the request meta field:
- ftp_user (required)
- ftp_password (required)
- ftp_passive (by default, enabled) sets FTP connection passive mode
- ftp_local_filename
- If not given, file data will come in the response.body, as in a normal scrapy Response,
which means that the entire file will be held in memory.
- if given, file data will be saved in a local file with the given name
This helps when downloading very big files to avoid memory issues. In addition, for
convenience the local file name will also be given in the response body.
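The status of the built HTTP response will be, by default
- 200 in case of success
- 404 in case the specified file was not found on the server (FTP code 550)
- 503 for any other failed FTP command whose reply code can be read (the 'default' mapping)
Any other failure raises the corresponding FTP exception.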
The mapping from server FTP command return codes to HTTP response codes is defined in the
CODE_MAPPING attribute of the handler class. The key 'default' is used for any code
that is not explicitly present among the map keys. You may need to override this
mapping if you want a behaviour different from the default.
When the response status is 200, response.headers will contain two keys:
'Local Filename' - with the value of the local filename if given
'Size' - with size of the downloaded data
"""
import re
from urlparse import urlparse
from cStringIO import StringIO
from twisted.internet import reactor
from twisted.protocols.ftp import FTPClient, CommandFailed
from twisted.internet.protocol import Protocol, ClientCreator
from scrapy.http import Response
from scrapy.responsetypes import responsetypes
class ReceivedDataProtocol(Protocol):
    """Protocol that stores every chunk of data it receives.

    Data goes to a local file when ``filename`` is given, otherwise to an
    in-memory buffer. ``size`` holds the number of bytes received so far and
    ``body`` is the underlying file-like object.
    """

    def __init__(self, filename=None):
        self.__filename = filename
        # The payload is raw bytes: binary mode prevents newline translation
        # from corrupting the data on platforms that distinguish text files.
        self.body = open(filename, "wb") if filename else StringIO()
        self.size = 0

    def dataReceived(self, data):
        """Append ``data`` to the body and account for its length."""
        self.body.write(data)
        self.size += len(data)

    @property
    def filename(self):
        """Name of the local file being written, or None for in-memory mode."""
        return self.__filename

    def close(self):
        """Finish writing: close the local file, or rewind the memory buffer.

        The in-memory buffer is rewound (not closed) so that a later
        ``body.read()`` returns everything that was received.
        """
        if self.filename:
            self.body.close()
        else:
            self.body.seek(0)
_CODE_RE = re.compile("\d+")
class FTPDownloadHandler(object):
    """Download handler for ``ftp://`` requests that builds HTTP-like responses.

    Connection parameters are read from ``request.meta``: ``ftp_user`` and
    ``ftp_password`` (both required), ``ftp_passive`` (defaults to 1) and the
    optional ``ftp_local_filename``.
    """

    # FTP reply code (as a string) -> status of the response that is built.
    # "default" is used for any reply code that has no entry of its own.
    CODE_MAPPING = {
        "550": 404,
        "default": 503,
    }

    def __init__(self, setting):
        pass

    def download_request(self, request, spider):
        """Connect to the FTP server named in ``request.url`` and fetch the file.

        Returns the deferred of the connection attempt, chained with
        ``gotClient``. Raises KeyError if ``ftp_user`` or ``ftp_password`` is
        missing from ``request.meta``.
        """
        # Local import keeps this block self-contained (Python 2 stdlib).
        from urllib import unquote

        parsed_url = urlparse(request.url)
        creator = ClientCreator(reactor, FTPClient, request.meta["ftp_user"],
                                request.meta["ftp_password"],
                                passive=request.meta.get("ftp_passive", 1))
        # The URL path is percent-encoded; the FTP server expects the real
        # file name, so decode it before issuing the retrieve command.
        filepath = unquote(parsed_url.path)
        connecting = creator.connectTCP(parsed_url.hostname, parsed_url.port or 21)
        return connecting.addCallback(self.gotClient, request, filepath)

    def gotClient(self, client, request, filepath):
        """Start retrieving ``filepath`` once the FTP client is connected."""
        # Kept on the instance so callers can reach the client, e.g. to drop
        # its transport once the download is finished.
        self.client = client
        protocol = ReceivedDataProtocol(request.meta.get("ftp_local_filename"))
        # The protocol is handed to the errback too, so that an open local
        # file is closed when the retrieval fails.
        return client.retrieveFile(filepath, protocol)\
            .addCallbacks(callback=self._build_response,
                          callbackArgs=(request, protocol),
                          errback=self._failed,
                          errbackArgs=(request, protocol))

    def _build_response(self, result, request, protocol):
        """Build the status-200 response from the data the protocol collected.

        The body is the local file name when one was given, otherwise the
        downloaded data itself.
        """
        self.result = result
        respcls = responsetypes.from_args(url=request.url)
        protocol.close()
        body = protocol.filename or protocol.body.read()
        headers = {"local filename": protocol.filename or '', "size": protocol.size}
        return respcls(url=request.url, status=200, body=body, headers=headers)

    def _failed(self, result, request, protocol=None):
        """Turn a failed FTP command into a response; propagate anything else.

        ``result`` is the failure passed to the errback. A ``CommandFailed``
        whose message contains a reply code becomes a ``Response`` with the
        status taken from ``CODE_MAPPING``. Every other failure is returned
        unchanged so it keeps travelling down the errback chain.
        """
        if protocol is not None:
            # Do not leak the local file handle when the download failed.
            protocol.close()
        message = result.getErrorMessage()
        if result.check(CommandFailed):
            m = _CODE_RE.search(message)
            if m:
                ftpcode = m.group()
                httpcode = self.CODE_MAPPING.get(ftpcode, self.CODE_MAPPING["default"])
                return Response(url=request.url, status=httpcode, body=message)
        # Returning the failure preserves the original exception and its
        # traceback, and works for exceptions of any constructor signature.
        return result

View File

@ -56,6 +56,7 @@ DOWNLOAD_HANDLERS_BASE = {
'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
'https': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
'ftp': 'scrapy.core.downloader.handlers.ftp.FTPDownloadHandler',
}
DOWNLOAD_TIMEOUT = 180  # seconds (3 minutes)

View File

@ -9,6 +9,9 @@ from twisted.web import server, static, util, resource
from twisted.web.test.test_webclient import ForeverTakingResource, \
NoLengthResource, HostHeaderResource, \
PayloadResource, BrokenDownloadResource
from twisted.protocols.ftp import FTPRealm, FTPFactory
from twisted.cred import portal, checkers, credentials
from twisted.protocols.ftp import FTPClient, ConnectionLost
from w3lib.url import path_to_file_uri
from scrapy.core.downloader.handlers.file import FileDownloadHandler
@ -16,6 +19,8 @@ from scrapy.core.downloader.handlers.http import HTTPDownloadHandler, HttpDownlo
from scrapy.core.downloader.handlers.http10 import HTTP10DownloadHandler
from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler
from scrapy.core.downloader.handlers.s3 import S3DownloadHandler
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.settings import Settings
@ -326,3 +331,83 @@ class S3TestCase(unittest.TestCase):
httpreq = self.download_request(req, self.spider)
self.assertEqual(httpreq.headers['Authorization'], \
'AWS 0PN5J17HBGZHT7JJ3X82:C0FlOtU8Ylb9KDTpZqYkZPX91iI=')
class FTPTestCase(unittest.TestCase):
    """Run FTPDownloadHandler against an FTP server started inside the test."""

    username = "scrapy"
    password = "passwd"

    def setUp(self):
        # setup dirs and test file
        self.directory = self.mktemp()
        os.mkdir(self.directory)
        userdir = os.path.join(self.directory, self.username)
        os.mkdir(userdir)
        FilePath(userdir).child('file.txt').setContent("I have the power!")

        # setup server
        realm = FTPRealm(anonymousRoot=self.directory, userHome=self.directory)
        p = portal.Portal(realm)
        users_checker = checkers.InMemoryUsernamePasswordDatabaseDontUse()
        users_checker.addUser(self.username, self.password)
        p.registerChecker(users_checker, credentials.IUsernamePassword)
        self.factory = FTPFactory(portal=p)
        # Port 0 lets the OS pick a free port; it is read back just below.
        self.port = reactor.listenTCP(0, self.factory, interface="127.0.0.1")
        self.portNum = self.port.getHost().port
        self.download_handler = FTPDownloadHandler(Settings())
        self.addCleanup(self.port.stopListening)

    def _add_test_callbacks(self, deferred, callback=None, errback=None):
        """Chain connection teardown, then the given checks, onto ``deferred``."""
        def _clean(data):
            # Drop the client connection before the checks run.
            self.download_handler.client.transport.loseConnection()
            return data
        deferred.addCallback(_clean)
        if callback:
            deferred.addCallback(callback)
        if errback:
            deferred.addErrback(errback)
        return deferred

    def test_ftp_download_success(self):
        request = Request(url="ftp://127.0.0.1:%s/file.txt" % self.portNum,
                          meta={"ftp_user": self.username, "ftp_password": self.password})
        d = self.download_handler.download_request(request, None)

        def _test(r):
            self.assertEqual(r.status, 200)
            self.assertEqual(r.body, 'I have the power!')
            self.assertEqual(r.headers, {'Local Filename': [''], 'Size': [17]})
        return self._add_test_callbacks(d, _test)

    def test_ftp_download_notexist(self):
        request = Request(url="ftp://127.0.0.1:%s/notexist.txt" % self.portNum,
                          meta={"ftp_user": self.username, "ftp_password": self.password})
        d = self.download_handler.download_request(request, None)

        def _test(r):
            self.assertEqual(r.status, 404)
        return self._add_test_callbacks(d, _test)

    def test_ftp_local_filename(self):
        # A per-test path instead of a fixed name under /tmp: portable, and
        # it cannot collide with other files or with concurrent test runs.
        local_fname = self.mktemp()

        def _remove_local_file():
            if os.path.exists(local_fname):
                os.remove(local_fname)
        # Registered up front so the file is removed even if a check fails.
        self.addCleanup(_remove_local_file)

        request = Request(url="ftp://127.0.0.1:%s/file.txt" % self.portNum,
                          meta={"ftp_user": self.username, "ftp_password": self.password,
                                "ftp_local_filename": local_fname})
        d = self.download_handler.download_request(request, None)

        def _test(r):
            self.assertEqual(r.body, local_fname)
            self.assertEqual(r.headers, {'Local Filename': [local_fname], 'Size': [17]})
            self.assertTrue(os.path.exists(local_fname))
            with open(local_fname) as f:
                self.assertEqual(f.read(), "I have the power!")
        return self._add_test_callbacks(d, _test)

    def test_invalid_credentials(self):
        request = Request(url="ftp://127.0.0.1:%s/file.txt" % self.portNum,
                          meta={"ftp_user": self.username, "ftp_password": 'invalid'})
        d = self.download_handler.download_request(request, None)

        def _test(r):
            self.assertEqual(r.type, ConnectionLost)
        return self._add_test_callbacks(d, errback=_test)