From e4fe7c63b0f84b893205269c2988c4e223d3c110 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Gra=C3=B1a?=
Date: Mon, 14 May 2012 17:04:08 -0300
Subject: [PATCH] add http connection pool and custom ssl context factory

---
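The new handler builds one HTTPConnectionPool and one client context factory
when it is instantiated, and hands both to every ScrapyAgent it creates. The
context factory class is resolved with load_object() from the
DOWNLOADER_CLIENTCONTEXTFACTORY setting, so projects can swap in their own
implementation. A rough sketch of that wiring from a project settings file
follows; the DOWNLOAD_HANDLERS block is illustrative only and is not touched
by this patch:

    # hypothetical settings.py fragment
    DOWNLOAD_HANDLERS = {
        'http': 'scrapy.core.downloader.handlers.http11.Http11DownloadHandler',
        'https': 'scrapy.core.downloader.handlers.http11.Http11DownloadHandler',
    }
    # dotted path resolved by the new handler module at import time
    DOWNLOADER_CLIENTCONTEXTFACTORY = \
        'scrapy.core.downloader.webclient.ScrapyClientContextFactory'
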
 scrapy/core/downloader/handlers/http11.py | 134 ++++++++++++----
 scrapy/core/downloader/webclient.py       |   2 +-
 2 files changed, 73 insertions(+), 63 deletions(-)

diff --git a/scrapy/core/downloader/handlers/http11.py b/scrapy/core/downloader/handlers/http11.py
index 5ce482fbf..3793663a1 100644
--- a/scrapy/core/downloader/handlers/http11.py
+++ b/scrapy/core/downloader/handlers/http11.py
@@ -6,7 +6,8 @@
 from urlparse import urldefrag
 from zope.interface import implements
 from twisted.internet import defer, reactor, protocol
-from twisted.web.client import Agent, ProxyAgent, ResponseDone, ResponseFailed
+from twisted.web.client import Agent, ProxyAgent, ResponseDone, \
+    ResponseFailed, HTTPConnectionPool
 from twisted.web.http_headers import Headers
 from twisted.web.http import PotentialDataLoss
 from twisted.web.iweb import IBodyProducer
@@ -15,39 +16,88 @@
 from twisted.internet.endpoints import TCP4ClientEndpoint
 from scrapy.http import Headers as ScrapyHeaders
 from scrapy.responsetypes import responsetypes
 from scrapy.core.downloader.webclient import _parse
+from scrapy.utils.misc import load_object
+from scrapy.conf import settings
+from scrapy import log
+
+
+ClientContextFactory = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
+
+
+class Http11DownloadHandler(object):
+
+    def __init__(self):
+        self._pool = HTTPConnectionPool(reactor, persistent=False)
+        self._contextFactory = ClientContextFactory()
+
+    def download_request(self, request, spider):
+        """Return a deferred for the HTTP download"""
+        agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool)
+        return agent.download_request(request)
 
 
 class ScrapyAgent(object):
 
-    def __init__(self, contextFactory=None, connectTimeout=180, bindAddress=None):
+    _Agent = Agent
+    _ProxyAgent = ProxyAgent
+
+    def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None):
         self._contextFactory = contextFactory
         self._connectTimeout = connectTimeout
         self._bindAddress = bindAddress
+        self._pool = pool
 
-    def launchRequest(self, request):
-        request_timeout = request.meta.get('download_timeout') or self._connectTimeout
+    def download_request(self, request):
+        url = urldefrag(request.url)[0]
+        method = request.method
+        headers = Headers(request.headers)
+        bodyproducer = _RequestBodyProducer(request.body or '')
+        agent = self._get_agent(request)
+        start_time = time()
+        d = agent.request(method, url, headers, bodyproducer)
+        d.addBoth(self._download_latency, request, start_time)
+        d.addCallback(self._agentrequest_downloaded, request)
+        d.addErrback(self._agentrequest_failed, request)
+        return d
+    def _get_agent(self, request):
+        timeout = request.meta.get('download_timeout') or self._connectTimeout
+        bindaddress = request.meta.get('bindaddress') or self._bindAddress
         proxy = request.meta.get('proxy')
-        if proxy is not None and proxy != '':
+        if proxy:
             scheme, _, host, port, _ = _parse(proxy)
-            endpoint = TCP4ClientEndpoint(reactor,
-                host, port,
-                timeout=request_timeout,
-                bindAddress=self._bindAddress)
-            agent = ProxyAgent(endpoint)
-        else:
-            agent = Agent(reactor,
-                contextFactory=self._contextFactory,
-                connectTimeout=request_timeout,
-                bindAddress=self._bindAddress)
+            endpoint = TCP4ClientEndpoint(reactor, host, port, timeout=timeout,
+                bindAddress=bindaddress)
+            return self._ProxyAgent(endpoint)
 
-        request._tw_start_time = time()
-        return agent.request(
-            request.method,
-            urldefrag(request.url)[0],
-            Headers(request.headers),
-            _RequestBodyProducer(request.body or ''),
-        )
+        return self._Agent(reactor, contextFactory=self._contextFactory,
+            connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)
+
+    def _download_latency(self, any_, request, start_time):
+        request.meta['download_latency'] = time() - start_time
+        return any_
+
+    def _agentrequest_downloaded(self, txresponse, request):
+        if txresponse.length == 0:
+            return self._build_response(('', None), txresponse, request)
+        finished = defer.Deferred()
+        finished.addCallback(self._build_response, txresponse, request)
+        txresponse.deliverBody(_ResponseReader(finished))
+        return finished
+
+    def _build_response(self, (body, flag), txresponse, request):
+        if flag is not None:
+            request.meta[flag] = True
+        url = urldefrag(request.url)[0]
+        status = int(txresponse.code)
+        headers = ScrapyHeaders(txresponse.headers.getAllRawHeaders())
+        respcls = responsetypes.from_args(headers=headers, url=url)
+        return respcls(url=url, status=status, headers=headers, body=body)
+
+    def _agentrequest_failed(self, failure, request):
+        # be clear it is an HTTP failure with new downloader
+        log.err(failure, 'HTTP11 failure: %s' % request)
+        return failure
 
 
 class _RequestBodyProducer(object):
@@ -85,43 +135,3 @@ class _ResponseReader(protocol.Protocol):
             self._finished.callback((body, 'partial_download'))
         else:
             self._finished.errback(reason)
-
-
-class Http11DownloadHandler(object):
-
-    def __init__(self):
-        self.debug = False
-
-    def download_request(self, request, spider):
-        """Return a deferred for the HTTP download"""
-        agent = ScrapyAgent(reactor)
-        d = agent.launchRequest(request)
-        d.addBoth(self._download_latency, request, time())
-        d.addCallback(self._agent_callback, request)
-        d.addErrback(self._agent_errback, request)
-        return d
-
-    def _download_latency(self, any_, request, start_time):
-        request.meta['download_latency'] = time() - start_time
-        return any_
-
-    def _agent_callback(self, txresponse, request):
-        if txresponse.length == 0:
-            return self._build_response(('', None), txresponse, request)
-        finished = defer.Deferred()
-        finished.addCallback(self._build_response, txresponse, request)
-        txresponse.deliverBody(_ResponseReader(finished))
-        return finished
-
-    def _build_response(self, (body, flag), txresponse, request):
-        if flag is not None:
-            request.meta[flag] = True
-        url = urldefrag(request.url)[0]
-        status = int(txresponse.code)
-        headers = ScrapyHeaders(txresponse.headers.getAllRawHeaders())
-        respcls = responsetypes.from_args(headers=headers, url=url)
-        return respcls(url=url, status=status, headers=headers, body=body)
-
-    def _agent_errback(self, failure, request):
-        #log.err(failure, 'HTTP11 failure: %s' % request)
-        return failure
diff --git a/scrapy/core/downloader/webclient.py b/scrapy/core/downloader/webclient.py
index a7a6140f9..ef17871de 100644
--- a/scrapy/core/downloader/webclient.py
+++ b/scrapy/core/downloader/webclient.py
@@ -144,12 +144,12 @@ class ScrapyClientContextFactory(ClientContextFactory):
     # see https://github.com/scrapy/scrapy/issues/82
     # and https://github.com/scrapy/scrapy/issues/26
 
     def __init__(self):
         # see this issue on why we use TLSv1_METHOD by default
         # https://github.com/scrapy/scrapy/issues/194
         self.method = SSL.TLSv1_METHOD
 
-    def getContext(self):
+    def getContext(self, hostname, port):
         ctx = ClientContextFactory.getContext(self)
         # Enable all workarounds to SSL bugs as documented by
         # http://www.openssl.org/docs/ssl/SSL_CTX_set_options.html
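Since twisted.web's Agent calls getContext(hostname, port) when negotiating
HTTPS, the signature change above lets a context factory make per-host
decisions. A minimal sketch of a custom factory that could be plugged in
through DOWNLOADER_CLIENTCONTEXTFACTORY (the class name is hypothetical, and
pyOpenSSL is assumed):

    from OpenSSL import SSL
    from twisted.internet.ssl import ClientContextFactory

    class TLSv1ContextFactory(ClientContextFactory):
        """Hypothetical project-specific factory forcing TLSv1."""

        def getContext(self, hostname=None, port=None):
            self.method = SSL.TLSv1_METHOD
            ctx = ClientContextFactory.getContext(self)
            # same OpenSSL bug workarounds as ScrapyClientContextFactory
            ctx.set_options(SSL.OP_ALL)
            return ctx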
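For reference, HTTPConnectionPool (new in Twisted 12.1) keeps finished
connections alive and re-uses them for later requests to the same host and
port when persistent=True; the handler above opts out for now with
persistent=False. A small standalone sketch of the underlying Twisted API,
independent of Scrapy:

    from twisted.internet import reactor
    from twisted.web.client import Agent, HTTPConnectionPool

    # one pool can back many agents; with persistent=True, idle
    # connections are cached and re-used instead of re-established
    pool = HTTPConnectionPool(reactor, persistent=True)
    agent = Agent(reactor, pool=pool)
    d = agent.request('GET', 'http://scrapy.org/')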