Mirror of https://github.com/scrapy/scrapy.git, synced 2025-02-21 07:52:49 +00:00
Download handlers: from_crawler factory method, take crawler instead of settings in __init__
commit 3d77f74e40
parent d1cdfb4701
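In short, this commit gives each download handler a from_crawler classmethod and changes __init__ to take the crawler rather than a bare Settings object, so handlers can reach crawler-level facilities (signals, stats, extensions) in addition to settings. A minimal sketch of the new contract follows; the handler name and the setting it reads are illustrative, not part of the commit:

from twisted.internet import defer


class MyDownloadHandler(object):
    # Hypothetical handler; only the shape of the contract mirrors the commit.
    lazy = False

    def __init__(self, crawler):
        # Settings are still reachable, but now through the crawler.
        self.timeout = crawler.settings.getfloat('DOWNLOAD_TIMEOUT')
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def download_request(self, request, spider):
        # Must return a Deferred that fires with a Response (stubbed here).
        return defer.succeed(None)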
@@ -5,7 +5,7 @@ from twisted.internet import defer
 import six
 from scrapy.exceptions import NotSupported, NotConfigured
 from scrapy.utils.httpobj import urlparse_cached
-from scrapy.utils.misc import load_object
+from scrapy.utils.misc import create_instance, load_object
 from scrapy.utils.python import without_none_values
 from scrapy import signals

@@ -48,7 +48,11 @@ class DownloadHandlers(object):
             dhcls = load_object(path)
             if skip_lazy and getattr(dhcls, 'lazy', True):
                 return None
-            dh = dhcls(self._crawler.settings)
+            dh = create_instance(
+                dhcls,
+                self._crawler.settings,
+                self._crawler,
+            )
         except NotConfigured as ex:
             self._notconfigured[scheme] = str(ex)
             return None
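The hunk above swaps the direct dhcls(self._crawler.settings) call for create_instance from scrapy.utils.misc, which decides how to build the handler based on what the class provides. Roughly, and simplified from the real helper (its error handling is omitted), the resolution order is from_crawler, then from_settings, then the plain constructor:

def create_instance(objcls, settings, crawler, *args, **kwargs):
    # Simplified sketch of scrapy.utils.misc.create_instance.
    if settings is None and crawler is not None:
        settings = crawler.settings
    if crawler is not None and hasattr(objcls, 'from_crawler'):
        return objcls.from_crawler(crawler, *args, **kwargs)
    if hasattr(objcls, 'from_settings'):
        return objcls.from_settings(settings, *args, **kwargs)
    return objcls(*args, **kwargs)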
@@ -8,9 +8,6 @@ from scrapy.utils.decorators import defers
 class DataURIDownloadHandler(object):
     lazy = False

-    def __init__(self, settings):
-        super(DataURIDownloadHandler, self).__init__()
-
     @defers
     def download_request(self, request, spider):
         uri = parse_data_uri(request.url)
@@ -1,4 +1,5 @@
 from w3lib.url import file_uri_to_path
+
 from scrapy.responsetypes import responsetypes
 from scrapy.utils.decorators import defers

@@ -6,9 +7,6 @@ from scrapy.utils.decorators import defers
 class FileDownloadHandler(object):
     lazy = False

-    def __init__(self, settings):
-        pass
-
     @defers
     def download_request(self, request, spider):
         filepath = file_uri_to_path(request.url)
@@ -59,6 +59,7 @@ class ReceivedDataProtocol(Protocol):
     def close(self):
         self.body.close() if self.filename else self.body.seek(0)


 _CODE_RE = re.compile(r"\d+")

+
@@ -70,10 +71,14 @@ class FTPDownloadHandler(object):
         "default": 503,
     }

-    def __init__(self, settings):
-        self.default_user = settings['FTP_USER']
-        self.default_password = settings['FTP_PASSWORD']
-        self.passive_mode = settings['FTP_PASSIVE_MODE']
+    def __init__(self, crawler):
+        self.default_user = crawler.settings['FTP_USER']
+        self.default_password = crawler.settings['FTP_PASSWORD']
+        self.passive_mode = crawler.settings['FTP_PASSIVE_MODE']
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)

     def download_request(self, request, spider):
         parsed_url = urlparse_cached(request)
@@ -1,6 +1,7 @@
 """Download handlers for http and https schemes
 """
 from twisted.internet import reactor
+
 from scrapy.utils.misc import load_object, create_instance
 from scrapy.utils.python import to_unicode

@@ -8,10 +9,15 @@ from scrapy.utils.python import to_unicode
 class HTTP10DownloadHandler(object):
     lazy = False

-    def __init__(self, settings):
-        self.HTTPClientFactory = load_object(settings['DOWNLOADER_HTTPCLIENTFACTORY'])
-        self.ClientContextFactory = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
-        self._settings = settings
+    def __init__(self, crawler):
+        self.HTTPClientFactory = load_object(crawler.settings['DOWNLOADER_HTTPCLIENTFACTORY'])
+        self.ClientContextFactory = load_object(crawler.settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
+        self._crawler = crawler
+        self._settings = crawler.settings
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)

     def download_request(self, request, spider):
         """Return a deferred for the HTTP download"""
@@ -22,7 +28,11 @@ class HTTP10DownloadHandler(object):
     def _connect(self, factory):
         host, port = to_unicode(factory.host), factory.port
         if factory.scheme == b'https':
-            client_context_factory = create_instance(self.ClientContextFactory, settings=self._settings, crawler=None)
+            client_context_factory = create_instance(
+                self.ClientContextFactory,
+                settings=self._settings,
+                crawler=self._crawler,
+            )
             return reactor.connectSSL(host, port, factory, client_context_factory)
         else:
             return reactor.connectTCP(host, port, factory)
@@ -30,7 +30,9 @@ logger = logging.getLogger(__name__)
 class HTTP11DownloadHandler(object):
     lazy = False

-    def __init__(self, settings):
+    def __init__(self, crawler):
+        settings = crawler.settings
+
         self._pool = HTTPConnectionPool(reactor, persistent=True)
         self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
         self._pool._factory.noisy = False
@@ -42,7 +44,7 @@ class HTTP11DownloadHandler(object):
             self._contextFactory = create_instance(
                 self._contextFactoryClass,
                 settings=settings,
-                crawler=None,
+                crawler=crawler,
                 method=self._sslMethod,
             )
         except TypeError:
@@ -50,7 +52,7 @@ class HTTP11DownloadHandler(object):
             self._contextFactory = create_instance(
                 self._contextFactoryClass,
                 settings=settings,
-                crawler=None,
+                crawler=crawler,
             )
             msg = """
 '%s' does not accept `method` argument (type OpenSSL.SSL method,\
@@ -63,6 +65,10 @@ class HTTP11DownloadHandler(object):
         self._fail_on_dataloss = settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS')
         self._disconnect_timeout = 1

+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
+
     def download_request(self, request, spider):
         """Return a deferred for the HTTP download"""
         agent = ScrapyAgent(
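The same three-line from_crawler factory is added to every handler. The practical gain of receiving the crawler is access to its services, not just its settings. As an illustrative example that is not part of this commit, a custom handler registered via DOWNLOAD_HANDLERS could connect a cleanup callback and record stats:

from scrapy import signals


class SignalAwareHandler(object):
    # Hypothetical handler, shown only to illustrate why handlers now receive
    # the crawler instead of a Settings object.
    lazy = False

    def __init__(self, crawler):
        self.stats = crawler.stats
        crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def spider_closed(self, spider):
        # Release connection pools, close sockets, flush counters, etc.
        self.stats.inc_value('signalawarehandler/closed')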
@@ -32,13 +32,12 @@ def _get_boto_connection():

 class S3DownloadHandler(object):

-    def __init__(self, settings, aws_access_key_id=None, aws_secret_access_key=None, \
-            httpdownloadhandler=HTTPDownloadHandler, **kw):
+    def __init__(self, crawler, aws_access_key_id=None, aws_secret_access_key=None,
+                 httpdownloadhandler=HTTPDownloadHandler, **kw):
         if not aws_access_key_id:
-            aws_access_key_id = settings['AWS_ACCESS_KEY_ID']
+            aws_access_key_id = crawler.settings['AWS_ACCESS_KEY_ID']
         if not aws_secret_access_key:
-            aws_secret_access_key = settings['AWS_SECRET_ACCESS_KEY']
+            aws_secret_access_key = crawler.settings['AWS_SECRET_ACCESS_KEY']

         # If no credentials could be found anywhere,
         # consider this an anonymous connection request by default;
@@ -67,7 +66,11 @@ class S3DownloadHandler(object):
         except Exception as ex:
             raise NotConfigured(str(ex))

-        self._download_http = httpdownloadhandler(settings).download_request
+        self._download_http = httpdownloadhandler(crawler).download_request
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)

     def download_request(self, request, spider):
         p = urlparse_cached(request)
@@ -30,7 +30,6 @@ from scrapy.spiders import Spider
 from scrapy.http import Headers, Request
 from scrapy.http.response.text import TextResponse
 from scrapy.responsetypes import responsetypes
-from scrapy.settings import Settings
 from scrapy.utils.test import get_crawler, skip_if_no_boto
 from scrapy.utils.python import to_bytes
 from scrapy.exceptions import NotConfigured
@@ -45,6 +44,10 @@ class DummyDH(object):
     def __init__(self, crawler):
         pass

+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
+

 class DummyLazyDH(object):
     # Default is lazy for backward compatibility
@@ -52,6 +55,10 @@ class DummyLazyDH(object):
     def __init__(self, crawler):
         pass

+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
+

 class OffDH(object):
     lazy = False
@@ -59,6 +66,10 @@ class OffDH(object):
     def __init__(self, crawler):
         raise NotConfigured

+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
+

 class LoadTestCase(unittest.TestCase):

@@ -106,7 +117,7 @@ class FileTestCase(unittest.TestCase):
         self.tmpname = self.mktemp()
         with open(self.tmpname + '^', 'w') as f:
             f.write('0123456789')
-        self.download_request = FileDownloadHandler(Settings()).download_request
+        self.download_request = FileDownloadHandler().download_request

     def tearDown(self):
         os.unlink(self.tmpname + '^')
@@ -239,7 +250,7 @@ class HttpTestCase(unittest.TestCase):
         else:
             self.port = reactor.listenTCP(0, self.wrapper, interface=self.host)
         self.portno = self.port.getHost().port
-        self.download_handler = self.download_handler_cls(Settings())
+        self.download_handler = self.download_handler_cls(get_crawler())
         self.download_request = self.download_handler.download_request

     @defer.inlineCallbacks
@@ -479,9 +490,9 @@ class Http11TestCase(HttpTestCase):
         return self.test_download_broken_content_allow_data_loss('broken-chunked')

     def test_download_broken_content_allow_data_loss_via_setting(self, url='broken'):
-        download_handler = self.download_handler_cls(Settings({
-            'DOWNLOAD_FAIL_ON_DATALOSS': False,
-        }))
+        download_handler = self.download_handler_cls(
+            get_crawler(settings_dict={'DOWNLOAD_FAIL_ON_DATALOSS': False})
+        )
         request = Request(self.getURL(url))
         d = download_handler.download_request(request, Spider('foo'))
         d.addCallback(lambda r: r.flags)
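On the test side, every Settings(...) argument becomes a crawler built with get_crawler from scrapy.utils.test, which wraps an optional settings_dict in a real Crawler. A small usage sketch, assuming only that helper's documented behaviour:

from scrapy.utils.test import get_crawler

# get_crawler() returns a Crawler whose .settings include the given overrides,
# which is what the updated handlers now read their configuration from.
crawler = get_crawler(settings_dict={'DOWNLOAD_FAIL_ON_DATALOSS': False})
assert crawler.settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS') is False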
@@ -499,9 +510,9 @@ class Https11TestCase(Http11TestCase):

     @defer.inlineCallbacks
     def test_tls_logging(self):
-        download_handler = self.download_handler_cls(Settings({
-            'DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING': True,
-        }))
+        download_handler = self.download_handler_cls(
+            get_crawler(settings_dict={'DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING': True})
+        )
         try:
             with LogCapture() as log_capture:
                 request = Request(self.getURL('file'))
@@ -569,7 +580,8 @@ class Https11CustomCiphers(unittest.TestCase):
             interface=self.host)
         self.portno = self.port.getHost().port
         self.download_handler = self.download_handler_cls(
-            Settings({'DOWNLOADER_CLIENT_TLS_CIPHERS': 'CAMELLIA256-SHA'}))
+            get_crawler(settings_dict={'DOWNLOADER_CLIENT_TLS_CIPHERS': 'CAMELLIA256-SHA'})
+        )
         self.download_request = self.download_handler.download_request

     @defer.inlineCallbacks
@@ -665,7 +677,7 @@ class HttpProxyTestCase(unittest.TestCase):
         wrapper = WrappingFactory(site)
         self.port = reactor.listenTCP(0, wrapper, interface='127.0.0.1')
         self.portno = self.port.getHost().port
-        self.download_handler = self.download_handler_cls(Settings())
+        self.download_handler = self.download_handler_cls(get_crawler())
         self.download_request = self.download_handler.download_request

     @defer.inlineCallbacks
@@ -738,9 +750,10 @@ class S3AnonTestCase(unittest.TestCase):

     def setUp(self):
         skip_if_no_boto()
-        self.s3reqh = S3DownloadHandler(Settings(),
-                httpdownloadhandler=HttpDownloadHandlerMock,
-                #anon=True, # is implicit
+        self.s3reqh = S3DownloadHandler(
+            crawler=get_crawler(),
+            httpdownloadhandler=HttpDownloadHandlerMock,
+            #anon=True, # is implicit
         )
         self.download_request = self.s3reqh.download_request
         self.spider = Spider('foo')
@@ -766,9 +779,12 @@ class S3TestCase(unittest.TestCase):

     def setUp(self):
         skip_if_no_boto()
-        s3reqh = S3DownloadHandler(Settings(), self.AWS_ACCESS_KEY_ID,
-                self.AWS_SECRET_ACCESS_KEY,
-                httpdownloadhandler=HttpDownloadHandlerMock)
+        s3reqh = S3DownloadHandler(
+            get_crawler(),
+            self.AWS_ACCESS_KEY_ID,
+            self.AWS_SECRET_ACCESS_KEY,
+            httpdownloadhandler=HttpDownloadHandlerMock,
+        )
         self.download_request = s3reqh.download_request
         self.spider = Spider('foo')

@@ -788,7 +804,7 @@ class S3TestCase(unittest.TestCase):

     def test_extra_kw(self):
         try:
-            S3DownloadHandler(Settings(), extra_kw=True)
+            S3DownloadHandler(get_crawler(), extra_kw=True)
         except Exception as e:
             self.assertIsInstance(e, (TypeError, NotConfigured))
         else:
@@ -928,7 +944,7 @@ class BaseFTPTestCase(unittest.TestCase):
         self.factory = FTPFactory(portal=p)
         self.port = reactor.listenTCP(0, self.factory, interface="127.0.0.1")
         self.portNum = self.port.getHost().port
-        self.download_handler = FTPDownloadHandler(Settings())
+        self.download_handler = FTPDownloadHandler(get_crawler())
         self.addCleanup(self.port.stopListening)

     def tearDown(self):
@@ -1042,7 +1058,7 @@ class AnonymousFTPTestCase(BaseFTPTestCase):
             userAnonymous=self.username)
         self.port = reactor.listenTCP(0, self.factory, interface="127.0.0.1")
         self.portNum = self.port.getHost().port
-        self.download_handler = FTPDownloadHandler(Settings())
+        self.download_handler = FTPDownloadHandler(get_crawler())
         self.addCleanup(self.port.stopListening)

     def tearDown(self):
@@ -1052,7 +1068,7 @@ class AnonymousFTPTestCase(BaseFTPTestCase):
 class DataURITestCase(unittest.TestCase):

     def setUp(self):
-        self.download_handler = DataURIDownloadHandler(Settings())
+        self.download_handler = DataURIDownloadHandler()
         self.download_request = self.download_handler.download_request
         self.spider = Spider('foo')
