mirror of https://github.com/scrapy/scrapy.git synced 2025-02-21 07:52:49 +00:00

Download handlers: from_crawler factory method, take crawler instead of settings in __init__

Eugenio Lacuesta 2019-11-05 00:54:46 -03:00
parent d1cdfb4701
commit 3d77f74e40
No known key found for this signature in database
GPG Key ID: DA3EF2D0913E9810
8 changed files with 86 additions and 47 deletions
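
For context, the handler classes touched below are the ones Scrapy registers per URI scheme. The mapping, reproduced from memory of scrapy/settings/default_settings.py around this release (treat the exact dotted paths as approximate), looks like this:

# Approximate default scheme-to-handler mapping at the time of this commit
# (DOWNLOAD_HANDLERS_BASE in scrapy/settings/default_settings.py).
DOWNLOAD_HANDLERS_BASE = {
    'data': 'scrapy.core.downloader.handlers.datauri.DataURIDownloadHandler',
    'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
    'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    'https': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
    'ftp': 'scrapy.core.downloader.handlers.ftp.FTPDownloadHandler',
}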

View File: scrapy/core/downloader/handlers/__init__.py

@@ -5,7 +5,7 @@ from twisted.internet import defer
 import six
 from scrapy.exceptions import NotSupported, NotConfigured
 from scrapy.utils.httpobj import urlparse_cached
-from scrapy.utils.misc import load_object
+from scrapy.utils.misc import create_instance, load_object
 from scrapy.utils.python import without_none_values
 from scrapy import signals
@@ -48,7 +48,11 @@ class DownloadHandlers(object):
             dhcls = load_object(path)
             if skip_lazy and getattr(dhcls, 'lazy', True):
                 return None
-            dh = dhcls(self._crawler.settings)
+            dh = create_instance(
+                dhcls,
+                self._crawler.settings,
+                self._crawler,
+            )
         except NotConfigured as ex:
             self._notconfigured[scheme] = str(ex)
             return None
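
The replacement of dhcls(self._crawler.settings) with create_instance is what makes the per-handler from_crawler classmethods added below take effect. A minimal sketch of the dispatch performed by scrapy.utils.misc.create_instance, paraphrased rather than copied from the Scrapy source:

def create_instance(objcls, settings, crawler, *args, **kwargs):
    # Paraphrased sketch of scrapy.utils.misc.create_instance; the real
    # implementation lives in scrapy/utils/misc.py.
    if settings is None:
        if crawler is None:
            raise ValueError("Specify at least one of settings and crawler.")
        settings = crawler.settings
    if crawler is not None and hasattr(objcls, 'from_crawler'):
        # Handlers that grew a from_crawler() in this commit are built here.
        return objcls.from_crawler(crawler, *args, **kwargs)
    if hasattr(objcls, 'from_settings'):
        return objcls.from_settings(settings, *args, **kwargs)
    return objcls(*args, **kwargs)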

View File: scrapy/core/downloader/handlers/datauri.py

@@ -8,9 +8,6 @@ from scrapy.utils.decorators import defers
 class DataURIDownloadHandler(object):
     lazy = False
-    def __init__(self, settings):
-        super(DataURIDownloadHandler, self).__init__()
     @defers
     def download_request(self, request, spider):
         uri = parse_data_uri(request.url)

View File: scrapy/core/downloader/handlers/file.py

@@ -1,4 +1,5 @@
 from w3lib.url import file_uri_to_path
 from scrapy.responsetypes import responsetypes
 from scrapy.utils.decorators import defers
@@ -6,9 +7,6 @@ from scrapy.utils.decorators import defers
 class FileDownloadHandler(object):
     lazy = False
-    def __init__(self, settings):
-        pass
     @defers
     def download_request(self, request, spider):
         filepath = file_uri_to_path(request.url)

View File: scrapy/core/downloader/handlers/ftp.py

@@ -59,6 +59,7 @@ class ReceivedDataProtocol(Protocol):
     def close(self):
         self.body.close() if self.filename else self.body.seek(0)
 _CODE_RE = re.compile(r"\d+")
@@ -70,10 +71,14 @@ class FTPDownloadHandler(object):
         "default": 503,
     }
-    def __init__(self, settings):
-        self.default_user = settings['FTP_USER']
-        self.default_password = settings['FTP_PASSWORD']
-        self.passive_mode = settings['FTP_PASSIVE_MODE']
+    def __init__(self, crawler):
+        self.default_user = crawler.settings['FTP_USER']
+        self.default_password = crawler.settings['FTP_PASSWORD']
+        self.passive_mode = crawler.settings['FTP_PASSIVE_MODE']
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
     def download_request(self, request, spider):
         parsed_url = urlparse_cached(request)
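
The FTP handler shows the shape every converted handler follows: a from_crawler classmethod plus an __init__ that reads crawler.settings. A third-party handler can adopt the same pattern; the class, scheme and setting names below are hypothetical, only the hooks mirror what this commit enables:

from scrapy.exceptions import NotConfigured


class MyCustomDownloadHandler(object):
    # Hypothetical handler for illustration; not part of this commit.
    lazy = False

    def __init__(self, crawler):
        if not crawler.settings.getbool('MYHANDLER_ENABLED', True):
            raise NotConfigured('MYHANDLER_ENABLED is False')
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def download_request(self, request, spider):
        # Must return a Deferred that fires with a Response, like the
        # built-in handlers do.
        raise NotImplementedError


# Registration in a project's settings.py (scheme and dotted path hypothetical):
DOWNLOAD_HANDLERS = {
    'myscheme': 'myproject.handlers.MyCustomDownloadHandler',
}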

View File: scrapy/core/downloader/handlers/http10.py

@@ -1,6 +1,7 @@
 """Download handlers for http and https schemes
 """
 from twisted.internet import reactor
 from scrapy.utils.misc import load_object, create_instance
 from scrapy.utils.python import to_unicode
@@ -8,10 +9,15 @@ from scrapy.utils.python import to_unicode
 class HTTP10DownloadHandler(object):
     lazy = False
-    def __init__(self, settings):
-        self.HTTPClientFactory = load_object(settings['DOWNLOADER_HTTPCLIENTFACTORY'])
-        self.ClientContextFactory = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
-        self._settings = settings
+    def __init__(self, crawler):
+        self.HTTPClientFactory = load_object(crawler.settings['DOWNLOADER_HTTPCLIENTFACTORY'])
+        self.ClientContextFactory = load_object(crawler.settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
+        self._crawler = crawler
+        self._settings = crawler.settings
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
     def download_request(self, request, spider):
         """Return a deferred for the HTTP download"""
@@ -22,7 +28,11 @@ class HTTP10DownloadHandler(object):
     def _connect(self, factory):
         host, port = to_unicode(factory.host), factory.port
         if factory.scheme == b'https':
-            client_context_factory = create_instance(self.ClientContextFactory, settings=self._settings, crawler=None)
+            client_context_factory = create_instance(
+                self.ClientContextFactory,
+                settings=self._settings,
+                crawler=self._crawler,
+            )
             return reactor.connectSSL(host, port, factory, client_context_factory)
         else:
             return reactor.connectTCP(host, port, factory)
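
Passing the real crawler instead of crawler=None to create_instance means a custom DOWNLOADER_CLIENTCONTEXTFACTORY can itself define from_crawler and reach crawler-level objects such as the stats collector. A hypothetical sketch (the subclass and stats key are illustrative, not part of the commit):

from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory


class StatsAwareContextFactory(ScrapyClientContextFactory):
    # Hypothetical subclass counting how often a TLS context is requested.
    stats = None

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Reachable only because the handlers now pass the crawler along.
        obj = cls(*args, **kwargs)
        obj.stats = crawler.stats
        return obj

    def getContext(self, hostname=None, port=None):
        if self.stats is not None:
            self.stats.inc_value('tls/contexts_created')
        return super(StatsAwareContextFactory, self).getContext(hostname, port)


# Enabled via settings.py (dotted path hypothetical):
# DOWNLOADER_CLIENTCONTEXTFACTORY = 'myproject.contextfactory.StatsAwareContextFactory'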

View File: scrapy/core/downloader/handlers/http11.py

@@ -30,7 +30,9 @@ logger = logging.getLogger(__name__)
 class HTTP11DownloadHandler(object):
     lazy = False
-    def __init__(self, settings):
+    def __init__(self, crawler):
+        settings = crawler.settings
         self._pool = HTTPConnectionPool(reactor, persistent=True)
         self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
         self._pool._factory.noisy = False
@@ -42,7 +44,7 @@ class HTTP11DownloadHandler(object):
             self._contextFactory = create_instance(
                 self._contextFactoryClass,
                 settings=settings,
-                crawler=None,
+                crawler=crawler,
                 method=self._sslMethod,
             )
         except TypeError:
@@ -50,7 +52,7 @@ class HTTP11DownloadHandler(object):
             self._contextFactory = create_instance(
                 self._contextFactoryClass,
                 settings=settings,
-                crawler=None,
+                crawler=crawler,
             )
             msg = """
 '%s' does not accept `method` argument (type OpenSSL.SSL method,\
@@ -63,6 +65,10 @@ class HTTP11DownloadHandler(object):
         self._fail_on_dataloss = settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS')
         self._disconnect_timeout = 1
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
     def download_request(self, request, spider):
         """Return a deferred for the HTTP download"""
         agent = ScrapyAgent(

View File: scrapy/core/downloader/handlers/s3.py

@@ -32,13 +32,12 @@ def _get_boto_connection():
 class S3DownloadHandler(object):
-    def __init__(self, settings, aws_access_key_id=None, aws_secret_access_key=None, \
-            httpdownloadhandler=HTTPDownloadHandler, **kw):
+    def __init__(self, crawler, aws_access_key_id=None, aws_secret_access_key=None,
+                 httpdownloadhandler=HTTPDownloadHandler, **kw):
         if not aws_access_key_id:
-            aws_access_key_id = settings['AWS_ACCESS_KEY_ID']
+            aws_access_key_id = crawler.settings['AWS_ACCESS_KEY_ID']
         if not aws_secret_access_key:
-            aws_secret_access_key = settings['AWS_SECRET_ACCESS_KEY']
+            aws_secret_access_key = crawler.settings['AWS_SECRET_ACCESS_KEY']
         # If no credentials could be found anywhere,
         # consider this an anonymous connection request by default;
@@ -67,7 +66,11 @@ class S3DownloadHandler(object):
         except Exception as ex:
             raise NotConfigured(str(ex))
-        self._download_http = httpdownloadhandler(settings).download_request
+        self._download_http = httpdownloadhandler(crawler).download_request
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
     def download_request(self, request, spider):
         p = urlparse_cached(request)

View File: tests/test_downloader_handlers.py

@@ -30,7 +30,6 @@ from scrapy.spiders import Spider
 from scrapy.http import Headers, Request
 from scrapy.http.response.text import TextResponse
 from scrapy.responsetypes import responsetypes
-from scrapy.settings import Settings
 from scrapy.utils.test import get_crawler, skip_if_no_boto
 from scrapy.utils.python import to_bytes
 from scrapy.exceptions import NotConfigured
@@ -45,6 +44,10 @@ class DummyDH(object):
     def __init__(self, crawler):
         pass
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
 class DummyLazyDH(object):
     # Default is lazy for backward compatibility
@@ -52,6 +55,10 @@ class DummyLazyDH(object):
     def __init__(self, crawler):
         pass
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
 class OffDH(object):
     lazy = False
@@ -59,6 +66,10 @@ class OffDH(object):
     def __init__(self, crawler):
         raise NotConfigured
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
 class LoadTestCase(unittest.TestCase):
@@ -106,7 +117,7 @@ class FileTestCase(unittest.TestCase):
         self.tmpname = self.mktemp()
         with open(self.tmpname + '^', 'w') as f:
             f.write('0123456789')
-        self.download_request = FileDownloadHandler(Settings()).download_request
+        self.download_request = FileDownloadHandler().download_request
     def tearDown(self):
         os.unlink(self.tmpname + '^')
@@ -239,7 +250,7 @@ class HttpTestCase(unittest.TestCase):
         else:
             self.port = reactor.listenTCP(0, self.wrapper, interface=self.host)
         self.portno = self.port.getHost().port
-        self.download_handler = self.download_handler_cls(Settings())
+        self.download_handler = self.download_handler_cls(get_crawler())
         self.download_request = self.download_handler.download_request
     @defer.inlineCallbacks
@@ -479,9 +490,9 @@ class Http11TestCase(HttpTestCase):
         return self.test_download_broken_content_allow_data_loss('broken-chunked')
     def test_download_broken_content_allow_data_loss_via_setting(self, url='broken'):
-        download_handler = self.download_handler_cls(Settings({
-            'DOWNLOAD_FAIL_ON_DATALOSS': False,
-        }))
+        download_handler = self.download_handler_cls(
+            get_crawler(settings_dict={'DOWNLOAD_FAIL_ON_DATALOSS': False})
+        )
         request = Request(self.getURL(url))
         d = download_handler.download_request(request, Spider('foo'))
         d.addCallback(lambda r: r.flags)
@@ -499,9 +510,9 @@ class Https11TestCase(Http11TestCase):
     @defer.inlineCallbacks
     def test_tls_logging(self):
-        download_handler = self.download_handler_cls(Settings({
-            'DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING': True,
-        }))
+        download_handler = self.download_handler_cls(
+            get_crawler(settings_dict={'DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING': True})
+        )
         try:
             with LogCapture() as log_capture:
                 request = Request(self.getURL('file'))
@@ -569,7 +580,8 @@ class Https11CustomCiphers(unittest.TestCase):
                                       interface=self.host)
         self.portno = self.port.getHost().port
         self.download_handler = self.download_handler_cls(
-            Settings({'DOWNLOADER_CLIENT_TLS_CIPHERS': 'CAMELLIA256-SHA'}))
+            get_crawler(settings_dict={'DOWNLOADER_CLIENT_TLS_CIPHERS': 'CAMELLIA256-SHA'})
+        )
         self.download_request = self.download_handler.download_request
     @defer.inlineCallbacks
@@ -665,7 +677,7 @@ class HttpProxyTestCase(unittest.TestCase):
         wrapper = WrappingFactory(site)
         self.port = reactor.listenTCP(0, wrapper, interface='127.0.0.1')
         self.portno = self.port.getHost().port
-        self.download_handler = self.download_handler_cls(Settings())
+        self.download_handler = self.download_handler_cls(get_crawler())
         self.download_request = self.download_handler.download_request
     @defer.inlineCallbacks
@@ -738,9 +750,10 @@ class S3AnonTestCase(unittest.TestCase):
     def setUp(self):
         skip_if_no_boto()
-        self.s3reqh = S3DownloadHandler(Settings(),
-            httpdownloadhandler=HttpDownloadHandlerMock,
-            #anon=True, # is implicit
+        self.s3reqh = S3DownloadHandler(
+            crawler=get_crawler(),
+            httpdownloadhandler=HttpDownloadHandlerMock,
+            #anon=True, # is implicit
         )
         self.download_request = self.s3reqh.download_request
         self.spider = Spider('foo')
@@ -766,9 +779,12 @@ class S3TestCase(unittest.TestCase):
     def setUp(self):
         skip_if_no_boto()
-        s3reqh = S3DownloadHandler(Settings(), self.AWS_ACCESS_KEY_ID,
-                                   self.AWS_SECRET_ACCESS_KEY,
-                                   httpdownloadhandler=HttpDownloadHandlerMock)
+        s3reqh = S3DownloadHandler(
+            get_crawler(),
+            self.AWS_ACCESS_KEY_ID,
+            self.AWS_SECRET_ACCESS_KEY,
+            httpdownloadhandler=HttpDownloadHandlerMock,
+        )
         self.download_request = s3reqh.download_request
         self.spider = Spider('foo')
@@ -788,7 +804,7 @@ class S3TestCase(unittest.TestCase):
     def test_extra_kw(self):
         try:
-            S3DownloadHandler(Settings(), extra_kw=True)
+            S3DownloadHandler(get_crawler(), extra_kw=True)
         except Exception as e:
             self.assertIsInstance(e, (TypeError, NotConfigured))
         else:
@@ -928,7 +944,7 @@ class BaseFTPTestCase(unittest.TestCase):
         self.factory = FTPFactory(portal=p)
         self.port = reactor.listenTCP(0, self.factory, interface="127.0.0.1")
         self.portNum = self.port.getHost().port
-        self.download_handler = FTPDownloadHandler(Settings())
+        self.download_handler = FTPDownloadHandler(get_crawler())
         self.addCleanup(self.port.stopListening)
     def tearDown(self):
@@ -1042,7 +1058,7 @@ class AnonymousFTPTestCase(BaseFTPTestCase):
                                   userAnonymous=self.username)
         self.port = reactor.listenTCP(0, self.factory, interface="127.0.0.1")
         self.portNum = self.port.getHost().port
-        self.download_handler = FTPDownloadHandler(Settings())
+        self.download_handler = FTPDownloadHandler(get_crawler())
         self.addCleanup(self.port.stopListening)
     def tearDown(self):
@@ -1052,7 +1068,7 @@ class AnonymousFTPTestCase(BaseFTPTestCase):
 class DataURITestCase(unittest.TestCase):
     def setUp(self):
-        self.download_handler = DataURIDownloadHandler(Settings())
+        self.download_handler = DataURIDownloadHandler()
         self.download_request = self.download_handler.download_request
         self.spider = Spider('foo')
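
In the tests, bare Settings objects give way to crawlers built with scrapy.utils.test.get_crawler, so the handlers are exercised through the same from_crawler path as in production. A small usage sketch of that helper, using the FTP handler as converted by this commit:

from scrapy.utils.test import get_crawler
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler

# get_crawler() wraps a (default) spider class and an optional settings_dict
# in a Crawler, which is all a handler needs if it only reads crawler.settings.
crawler = get_crawler(settings_dict={'FTP_PASSIVE_MODE': False})
handler = FTPDownloadHandler.from_crawler(crawler)
assert handler.passive_mode is False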