
add http connection pool and custom ssl context factory

Daniel Graña 2012-05-14 17:04:08 -03:00
parent a7a354f982
commit e4fe7c63b0
2 changed files with 76 additions and 62 deletions
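
For context, the handler added here is opt-in through project settings; a minimal sketch of the wiring follows, assuming the new module can be imported as scrapy.core.downloader.handlers.http11 (file names are not shown in this view) and using the DOWNLOADER_CLIENTCONTEXTFACTORY setting that the new code reads:

# settings.py -- illustrative sketch only, not part of this commit.
# The handler import path below is an assumption; this view omits file names.
DOWNLOAD_HANDLERS = {
    'http': 'scrapy.core.downloader.handlers.http11.Http11DownloadHandler',
    'https': 'scrapy.core.downloader.handlers.http11.Http11DownloadHandler',
}

# The new code loads its SSL context factory from this setting, so a project
# could point it at a custom ClientContextFactory subclass.
# 'myproject.contextfactory.CustomContextFactory' is an invented example name.
DOWNLOADER_CLIENTCONTEXTFACTORY = 'myproject.contextfactory.CustomContextFactory'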


@@ -6,7 +6,8 @@ from urlparse import urldefrag
 from zope.interface import implements
 from twisted.internet import defer, reactor, protocol
-from twisted.web.client import Agent, ProxyAgent, ResponseDone, ResponseFailed
+from twisted.web.client import Agent, ProxyAgent, ResponseDone, \
+    ResponseFailed, HTTPConnectionPool
 from twisted.web.http_headers import Headers
 from twisted.web.http import PotentialDataLoss
 from twisted.web.iweb import IBodyProducer
@@ -15,39 +16,88 @@ from twisted.internet.endpoints import TCP4ClientEndpoint
 from scrapy.http import Headers as ScrapyHeaders
 from scrapy.responsetypes import responsetypes
 from scrapy.core.downloader.webclient import _parse
+from scrapy.utils.misc import load_object
+from scrapy.conf import settings
+from scrapy import log
+
+
+ClientContextFactory = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
+
+
+class Http11DownloadHandler(object):
+
+    def __init__(self):
+        self._pool = HTTPConnectionPool(reactor, persistent=False)
+        self._contextFactory = ClientContextFactory()
+
+    def download_request(self, request, spider):
+        """Return a deferred for the HTTP download"""
+        agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool)
+        return agent.download_request(request)
 
 
 class ScrapyAgent(object):
 
-    def __init__(self, contextFactory=None, connectTimeout=180, bindAddress=None):
+    _Agent = Agent
+    _ProxyAgent = ProxyAgent
+
+    def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None):
         self._contextFactory = contextFactory
         self._connectTimeout = connectTimeout
         self._bindAddress = bindAddress
+        self._pool = pool
 
-    def launchRequest(self, request):
-        request_timeout = request.meta.get('download_timeout') or self._connectTimeout
+    def download_request(self, request):
+        url = urldefrag(request.url)[0]
+        method = request.method
+        headers = Headers(request.headers)
+        bodyproducer = _RequestBodyProducer(request.body or '')
+        agent = self._get_agent(request)
+        start_time = time()
+        d = agent.request(method, url, headers, bodyproducer)
+        d.addBoth(self._download_latency, request, start_time)
+        d.addCallback(self._agentrequest_downloaded, request)
+        d.addErrback(self._agentrequest_failed, request)
+        return d
+
+    def _get_agent(self, request):
+        timeout = request.meta.get('download_timeout') or self._connectTimeout
+        bindaddress = request.meta.get('bindaddress') or self._bindAddress
         proxy = request.meta.get('proxy')
-        if proxy is not None and proxy != '':
+        if proxy:
             scheme, _, host, port, _ = _parse(proxy)
-            endpoint = TCP4ClientEndpoint(reactor,
-                                          host, port,
-                                          timeout=request_timeout,
-                                          bindAddress=self._bindAddress)
-            agent = ProxyAgent(endpoint)
-        else:
-            agent = Agent(reactor,
-                          contextFactory=self._contextFactory,
-                          connectTimeout=request_timeout,
-                          bindAddress=self._bindAddress)
+            endpoint = TCP4ClientEndpoint(reactor, host, port, timeout=timeout,
+                bindAddress=bindaddress)
+            return self._ProxyAgent(endpoint)
 
-        request._tw_start_time = time()
-        return agent.request(
-            request.method,
-            urldefrag(request.url)[0],
-            Headers(request.headers),
-            _RequestBodyProducer(request.body or ''),
-            )
+        return self._Agent(reactor, contextFactory=self._contextFactory,
+            connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)
+
+    def _download_latency(self, any_, request, start_time):
+        request.meta['download_latency'] = time() - start_time
+        return any_
+
+    def _agentrequest_downloaded(self, txresponse, request):
+        if txresponse.length == 0:
+            return self._build_response(('', None), txresponse, request)
+        finished = defer.Deferred()
+        finished.addCallback(self._build_response, txresponse, request)
+        txresponse.deliverBody(_ResponseReader(finished))
+        return finished
+
+    def _build_response(self, (body, flag), txresponse, request):
+        if flag is not None:
+            request.meta[flag] = True
+        url = urldefrag(request.url)[0]
+        status = int(txresponse.code)
+        headers = ScrapyHeaders(txresponse.headers.getAllRawHeaders())
+        respcls = responsetypes.from_args(headers=headers, url=url)
+        return respcls(url=url, status=status, headers=headers, body=body)
+
+    def _agentrequest_failed(self, failure, request):
+        # be clear it is an HTTP failure with new downloader
+        log.err(failure, 'HTTP11 failure: %s' % request)
+        return failure
 
 
 class _RequestBodyProducer(object):
@@ -85,43 +135,3 @@ class _ResponseReader(protocol.Protocol):
             self._finished.callback((body, 'partial_download'))
         else:
             self._finished.errback(reason)
-
-
-class Http11DownloadHandler(object):
-
-    def __init__(self):
-        self.debug = False
-
-    def download_request(self, request, spider):
-        """Return a deferred for the HTTP download"""
-        agent = ScrapyAgent(reactor)
-        d = agent.launchRequest(request)
-        d.addBoth(self._download_latency, request, time())
-        d.addCallback(self._agent_callback, request)
-        d.addErrback(self._agent_errback, request)
-        return d
-
-    def _download_latency(self, any_, request, start_time):
-        request.meta['download_latency'] = time() - start_time
-        return any_
-
-    def _agent_callback(self, txresponse, request):
-        if txresponse.length == 0:
-            return self._build_response(('', None), txresponse, request)
-        finished = defer.Deferred()
-        finished.addCallback(self._build_response, txresponse, request)
-        txresponse.deliverBody(_ResponseReader(finished))
-        return finished
-
-    def _build_response(self, (body, flag), txresponse, request):
-        if flag is not None:
-            request.meta[flag] = True
-        url = urldefrag(request.url)[0]
-        status = int(txresponse.code)
-        headers = ScrapyHeaders(txresponse.headers.getAllRawHeaders())
-        respcls = responsetypes.from_args(headers=headers, url=url)
-        return respcls(url=url, status=status, headers=headers, body=body)
-
-    def _agent_errback(self, failure, request):
-        #log.err(failure, 'HTTP11 failure: %s' % request)
-        return failure
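
A rough usage sketch of the new call flow (not part of the commit): the handler can be driven from a plain Twisted script. The import path and the availability of a default DOWNLOADER_CLIENTCONTEXTFACTORY setting are assumptions here.

from twisted.internet import reactor
from scrapy.http import Request
from scrapy.core.downloader.handlers.http11 import Http11DownloadHandler  # assumed path


def download():
    # Each handler instance owns one non-persistent HTTPConnectionPool.
    handler = Http11DownloadHandler()
    d = handler.download_request(Request('http://example.com/'), spider=None)
    d.addCallback(lambda response: '%s, %d body bytes' % (response.status, len(response.body)))
    d.addBoth(report)


def report(result):
    print result  # Python 2 syntax, matching the code above
    reactor.stop()


reactor.callWhenRunning(download)
reactor.run()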


@@ -144,12 +144,16 @@ class ScrapyClientContextFactory(ClientContextFactory):
     # see https://github.com/scrapy/scrapy/issues/82
     # and https://github.com/scrapy/scrapy/issues/26
 
+<<<<<<< HEAD
     def __init__(self):
         # see this issue on why we use TLSv1_METHOD by default
         # https://github.com/scrapy/scrapy/issues/194
         self.method = SSL.TLSv1_METHOD
 
     def getContext(self):
+=======
+    def getContext(self, hostname, port):
+>>>>>>> add http connection pool and custom ssl context factory
         ctx = ClientContextFactory.getContext(self)
         # Enable all workarounds to SSL bugs as documented by
         # http://www.openssl.org/docs/ssl/SSL_CTX_set_options.html
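
The "custom ssl context factory" half of the commit message is this two-argument getContext(hostname, port) signature combined with the DOWNLOADER_CLIENTCONTEXTFACTORY setting read in the first file. A hedged sketch of the kind of project-level factory this enables follows; the class and module names are invented for illustration.

# myproject/contextfactory.py -- illustrative sketch, not part of this commit.
from OpenSSL import SSL
from twisted.internet.ssl import ClientContextFactory


class TLSv1ContextFactory(ClientContextFactory):
    """Always negotiate TLSv1 instead of Twisted's default SSLv23 method."""

    def getContext(self, hostname, port):
        # hostname and port are accepted only to match the two-argument
        # signature shown in the diff above; they are not used in this sketch.
        self.method = SSL.TLSv1_METHOD
        return ClientContextFactory.getContext(self)

Such a class would be selected with DOWNLOADER_CLIENTCONTEXTFACTORY = 'myproject.contextfactory.TLSv1ContextFactory' in a project's settings.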