mirror of https://github.com/scrapy/scrapy.git
synced 2025-02-27 06:23:41 +00:00

add http connection pool and custom ssl context factory

This commit is contained in:
parent a7a354f982
commit e4fe7c63b0
@@ -6,7 +6,8 @@ from urlparse import urldefrag
 from zope.interface import implements
 from twisted.internet import defer, reactor, protocol
-from twisted.web.client import Agent, ProxyAgent, ResponseDone, ResponseFailed
+from twisted.web.client import Agent, ProxyAgent, ResponseDone, \
+    ResponseFailed, HTTPConnectionPool
 from twisted.web.http_headers import Headers
 from twisted.web.http import PotentialDataLoss
 from twisted.web.iweb import IBodyProducer
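For background, the new import brings in Twisted's HTTPConnectionPool, which lets
several Agent requests share (and optionally keep alive) TCP connections. A minimal
standalone sketch of pooled Agent usage; the URL and the persistent=True flag are
illustrative only (the handler added below constructs its pool with persistent=False):

    from twisted.internet import reactor
    from twisted.web.client import Agent, HTTPConnectionPool

    # One pool shared by all requests; persistent=True enables keep-alive reuse.
    pool = HTTPConnectionPool(reactor, persistent=True)
    agent = Agent(reactor, pool=pool)
    d = agent.request('GET', 'http://example.com/')  # Deferred firing with the response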
@@ -15,39 +16,88 @@ from twisted.internet.endpoints import TCP4ClientEndpoint
 from scrapy.http import Headers as ScrapyHeaders
 from scrapy.responsetypes import responsetypes
 from scrapy.core.downloader.webclient import _parse
+from scrapy.utils.misc import load_object
+from scrapy.conf import settings
 from scrapy import log
 
 
+ClientContextFactory = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
+
+
+class Http11DownloadHandler(object):
+
+    def __init__(self):
+        self._pool = HTTPConnectionPool(reactor, persistent=False)
+        self._contextFactory = ClientContextFactory()
+
+    def download_request(self, request, spider):
+        """Return a deferred for the HTTP download"""
+        agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool)
+        return agent.download_request(request)
+
+
 class ScrapyAgent(object):
 
-    def __init__(self, contextFactory=None, connectTimeout=180, bindAddress=None):
+    _Agent = Agent
+    _ProxyAgent = ProxyAgent
+
+    def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None):
         self._contextFactory = contextFactory
         self._connectTimeout = connectTimeout
         self._bindAddress = bindAddress
+        self._pool = pool
 
-    def launchRequest(self, request):
-        request_timeout = request.meta.get('download_timeout') or self._connectTimeout
+    def download_request(self, request):
+        url = urldefrag(request.url)[0]
+        method = request.method
+        headers = Headers(request.headers)
+        bodyproducer = _RequestBodyProducer(request.body or '')
+        agent = self._get_agent(request)
+        start_time = time()
+        d = agent.request(method, url, headers, bodyproducer)
+        d.addBoth(self._download_latency, request, start_time)
+        d.addCallback(self._agentrequest_downloaded, request)
+        d.addErrback(self._agentrequest_failed, request)
+        return d
+
+    def _get_agent(self, request):
+        timeout = request.meta.get('download_timeout') or self._connectTimeout
+        bindaddress = request.meta.get('bindaddress') or self._bindAddress
         proxy = request.meta.get('proxy')
-        if proxy is not None and proxy != '':
+        if proxy:
             scheme, _, host, port, _ = _parse(proxy)
-            endpoint = TCP4ClientEndpoint(reactor,
-                host, port,
-                timeout=request_timeout,
-                bindAddress=self._bindAddress)
-            agent = ProxyAgent(endpoint)
-        else:
-            agent = Agent(reactor,
-                contextFactory=self._contextFactory,
-                connectTimeout=request_timeout,
-                bindAddress=self._bindAddress)
+            endpoint = TCP4ClientEndpoint(reactor, host, port, timeout=timeout,
+                bindAddress=bindaddress)
+            return self._ProxyAgent(endpoint)
 
-        request._tw_start_time = time()
-        return agent.request(
-            request.method,
-            urldefrag(request.url)[0],
-            Headers(request.headers),
-            _RequestBodyProducer(request.body or ''),
-        )
+        return self._Agent(reactor, contextFactory=self._contextFactory,
+            connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)
+
+    def _download_latency(self, any_, request, start_time):
+        request.meta['download_latency'] = time() - start_time
+        return any_
+
+    def _agentrequest_downloaded(self, txresponse, request):
+        if txresponse.length == 0:
+            return self._build_response(('', None), txresponse, request)
+        finished = defer.Deferred()
+        finished.addCallback(self._build_response, txresponse, request)
+        txresponse.deliverBody(_ResponseReader(finished))
+        return finished
+
+    def _build_response(self, (body, flag), txresponse, request):
+        if flag is not None:
+            request.meta[flag] = True
+        url = urldefrag(request.url)[0]
+        status = int(txresponse.code)
+        headers = ScrapyHeaders(txresponse.headers.getAllRawHeaders())
+        respcls = responsetypes.from_args(headers=headers, url=url)
+        return respcls(url=url, status=status, headers=headers, body=body)
+
+    def _agentrequest_failed(self, failure, request):
+        # be clear it is an HTTP failure with new downloader
+        log.err(failure, 'HTTP11 failure: %s' % request)
+        return failure
 
 
 class _RequestBodyProducer(object):
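Two notes on the new handler. First, the module-level ClientContextFactory is no
longer a fixed class: it is looked up from the DOWNLOADER_CLIENTCONTEXTFACTORY
setting via load_object, which resolves a dotted path to an object. A minimal
sketch of that resolution, assuming behavior equivalent to
scrapy.utils.misc.load_object (the example path is hypothetical):

    from importlib import import_module

    def load_object(path):
        # Split 'some.module.Name' into a module path and an attribute name,
        # import the module, and return the named attribute.
        module_path, _, name = path.rpartition('.')
        return getattr(import_module(module_path), name)

    # Hypothetical usage:
    # Factory = load_object('myproject.contextfactory.CustomContextFactory')

Second, the deferred chain in download_request is ordered deliberately: addBoth
records download_latency on success and failure alike, and only afterwards do
_agentrequest_downloaded and _agentrequest_failed see the result.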
@@ -85,43 +135,3 @@ class _ResponseReader(protocol.Protocol):
             self._finished.callback((body, 'partial_download'))
         else:
             self._finished.errback(reason)
-
-
-class Http11DownloadHandler(object):
-
-    def __init__(self):
-        self.debug = False
-
-    def download_request(self, request, spider):
-        """Return a deferred for the HTTP download"""
-        agent = ScrapyAgent(reactor)
-        d = agent.launchRequest(request)
-        d.addBoth(self._download_latency, request, time())
-        d.addCallback(self._agent_callback, request)
-        d.addErrback(self._agent_errback, request)
-        return d
-
-    def _download_latency(self, any_, request, start_time):
-        request.meta['download_latency'] = time() - start_time
-        return any_
-
-    def _agent_callback(self, txresponse, request):
-        if txresponse.length == 0:
-            return self._build_response(('', None), txresponse, request)
-        finished = defer.Deferred()
-        finished.addCallback(self._build_response, txresponse, request)
-        txresponse.deliverBody(_ResponseReader(finished))
-        return finished
-
-    def _build_response(self, (body, flag), txresponse, request):
-        if flag is not None:
-            request.meta[flag] = True
-        url = urldefrag(request.url)[0]
-        status = int(txresponse.code)
-        headers = ScrapyHeaders(txresponse.headers.getAllRawHeaders())
-        respcls = responsetypes.from_args(headers=headers, url=url)
-        return respcls(url=url, status=status, headers=headers, body=body)
-
-    def _agent_errback(self, failure, request):
-        #log.err(failure, 'HTTP11 failure: %s' % request)
-        return failure
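One detail in both the removed class above and its replacement earlier in the diff:
def _build_response(self, (body, flag), txresponse, request) relies on Python 2
tuple parameters, a syntax removed in Python 3 by PEP 3113. A sketch of the
forward-compatible spelling, reusing the module's existing names:

    def _build_response(self, result, txresponse, request):
        body, flag = result  # unpack explicitly instead of in the signature
        if flag is not None:
            request.meta[flag] = True
        url = urldefrag(request.url)[0]
        status = int(txresponse.code)
        headers = ScrapyHeaders(txresponse.headers.getAllRawHeaders())
        respcls = responsetypes.from_args(headers=headers, url=url)
        return respcls(url=url, status=status, headers=headers, body=body)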
@@ -144,12 +144,16 @@ class ScrapyClientContextFactory(ClientContextFactory):
     # see https://github.com/scrapy/scrapy/issues/82
     # and https://github.com/scrapy/scrapy/issues/26
 
+<<<<<<< HEAD
     def __init__(self):
         # see this issue on why we use TLSv1_METHOD by default
         # https://github.com/scrapy/scrapy/issues/194
         self.method = SSL.TLSv1_METHOD
 
     def getContext(self):
+=======
+    def getContext(self, hostname, port):
+>>>>>>> add http connection pool and custom ssl context factory
         ctx = ClientContextFactory.getContext(self)
         # Enable all workarounds to SSL bugs as documented by
         # http://www.openssl.org/docs/ssl/SSL_CTX_set_options.html
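Note that this hunk commits literal merge-conflict markers: the HEAD side keeps the
TLSv1 __init__ and a zero-argument getContext, while the incoming side renames the
hook to getContext(self, hostname, port). A sketch of one plausible resolution that
keeps both; this resolution is an assumption, not part of the commit, and the
ctx.set_options(SSL.OP_ALL) line is inferred from the truncated comment above:

    from OpenSSL import SSL
    from twisted.internet.ssl import ClientContextFactory

    class ScrapyClientContextFactory(ClientContextFactory):

        def __init__(self):
            # TLSv1 by default; see https://github.com/scrapy/scrapy/issues/194
            self.method = SSL.TLSv1_METHOD

        def getContext(self, hostname=None, port=None):
            ctx = ClientContextFactory.getContext(self)
            # Enable all workarounds to SSL bugs, per
            # http://www.openssl.org/docs/ssl/SSL_CTX_set_options.html
            ctx.set_options(SSL.OP_ALL)
            return ctx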