mirror of https://github.com/scrapy/scrapy.git synced 2025-02-25 03:43:58 +00:00

Merge pull request #1678 from lopuhin/py3-http-downloaders

[MRG+1] Py3: port http downloaders
Elias Dorneles 2016-01-19 14:10:00 -02:00
commit b4fb9d3534
6 changed files with 125 additions and 78 deletions
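The core of the port is normalizing text and bytes at the Twisted boundary: Twisted's HTTP machinery deals in bytes, while Scrapy keeps URLs, methods and header names mostly as text. A minimal sketch of the two helpers this patch leans on (values are illustrative, not taken from the diff):

from scrapy.utils.python import to_bytes, to_unicode

# Twisted's client APIs want bytes for methods, URLs and header values.
method = to_bytes('GET')                                  # b'GET'
url = to_bytes('http://example.com/', encoding='ascii')   # b'http://example.com/'

# Hostnames handed to the reactor are normalized the other way, to text.
host = to_unicode(b'example.com')                         # u'example.com'

assert method == b'GET' and host == u'example.com'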

View File

@ -2,6 +2,7 @@
"""
from twisted.internet import reactor
from scrapy.utils.misc import load_object
from scrapy.utils.python import to_unicode
class HTTP10DownloadHandler(object):
@ -17,8 +18,8 @@ class HTTP10DownloadHandler(object):
return factory.deferred
def _connect(self, factory):
host, port = factory.host, factory.port
if factory.scheme == 'https':
host, port = to_unicode(factory.host), factory.port
if factory.scheme == b'https':
return reactor.connectSSL(host, port, factory,
self.ClientContextFactory())
else:

View File

@ -6,7 +6,7 @@ from io import BytesIO
from time import time
from six.moves.urllib.parse import urldefrag
from zope.interface import implements
from zope.interface import implementer
from twisted.internet import defer, reactor, protocol
from twisted.web.http_headers import Headers as TxHeaders
from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH
@ -19,6 +19,7 @@ from scrapy.http import Headers
from scrapy.responsetypes import responsetypes
from scrapy.core.downloader.webclient import _parse
from scrapy.utils.misc import load_object
from scrapy.utils.python import to_bytes, to_unicode
from scrapy import twisted_version
logger = logging.getLogger(__name__)
@ -77,7 +78,7 @@ class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
for it.
"""
_responseMatcher = re.compile('HTTP/1\.. 200')
_responseMatcher = re.compile(b'HTTP/1\.. 200')
def __init__(self, reactor, host, port, proxyConf, contextFactory,
timeout=30, bindAddress=None):
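The matcher is now compiled from a bytes pattern because the proxy's reply to CONNECT arrives as bytes on Python 3, where a text pattern cannot match it. A tiny standalone check (status lines are made up):

import re

_responseMatcher = re.compile(br'HTTP/1\.. 200')

# Only a 200 reply to CONNECT means the tunnel is open.
assert _responseMatcher.match(b'HTTP/1.1 200 Connection established\r\n\r\n')
assert not _responseMatcher.match(b'HTTP/1.1 407 Proxy Authentication Required\r\n')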
@ -91,11 +92,15 @@ class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
def requestTunnel(self, protocol):
"""Asks the proxy to open a tunnel."""
tunnelReq = 'CONNECT %s:%s HTTP/1.1\r\n' % (self._tunneledHost,
self._tunneledPort)
tunnelReq = (
b'CONNECT ' +
to_bytes(self._tunneledHost, encoding='ascii') + b':' +
to_bytes(str(self._tunneledPort)) +
b' HTTP/1.1\r\n')
if self._proxyAuthHeader:
tunnelReq += 'Proxy-Authorization: %s\r\n' % self._proxyAuthHeader
tunnelReq += '\r\n'
tunnelReq += \
b'Proxy-Authorization: ' + self._proxyAuthHeader + b'\r\n'
tunnelReq += b'\r\n'
protocol.transport.write(tunnelReq)
self._protocolDataReceived = protocol.dataReceived
protocol.dataReceived = self.processProxyResponse
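A minimal reconstruction (host, port and credentials are made up) of the bytes requestTunnel now writes to the proxy transport:

from scrapy.utils.python import to_bytes

host, port = 'example.com', 443
proxy_auth = b'Basic dXNlcjpwYXNz'   # hypothetical Proxy-Authorization value

tunnel_req = (b'CONNECT ' + to_bytes(host, encoding='ascii') + b':' +
              to_bytes(str(port)) + b' HTTP/1.1\r\n')
tunnel_req += b'Proxy-Authorization: ' + proxy_auth + b'\r\n'
tunnel_req += b'\r\n'

assert tunnel_req == (b'CONNECT example.com:443 HTTP/1.1\r\n'
                      b'Proxy-Authorization: Basic dXNlcjpwYXNz\r\n'
                      b'\r\n')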
@ -180,10 +185,11 @@ class ScrapyAgent(object):
if proxy:
_, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
scheme = _parse(request.url)[0]
omitConnectTunnel = proxyParams.find('noconnect') >= 0
if scheme == 'https' and not omitConnectTunnel:
proxyHost = to_unicode(proxyHost)
omitConnectTunnel = b'noconnect' in proxyParams
if scheme == b'https' and not omitConnectTunnel:
proxyConf = (proxyHost, proxyPort,
request.headers.get('Proxy-Authorization', None))
request.headers.get(b'Proxy-Authorization', None))
return self._TunnelingAgent(reactor, proxyConf,
contextFactory=self._contextFactory, connectTimeout=timeout,
bindAddress=bindaddress, pool=self._pool)
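For context, the noconnect flag is something a spider sets itself by appending it to the proxy URL in request meta; a hedged usage sketch (the proxy address is made up):

from scrapy import Request

# Default behaviour: an https request through an HTTP proxy opens a
# CONNECT tunnel first (handled by the TunnelingAgent above).
tunneled = Request('https://example.com',
                   meta={'proxy': 'http://127.0.0.1:8080'})

# Appending ?noconnect to the proxy URL skips the tunnel and sends the
# request to the proxy as-is.
plain = Request('https://example.com',
                meta={'proxy': 'http://127.0.0.1:8080?noconnect'})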
@ -201,14 +207,15 @@ class ScrapyAgent(object):
# request details
url = urldefrag(request.url)[0]
method = request.method
method = to_bytes(request.method)
headers = TxHeaders(request.headers)
if isinstance(agent, self._TunnelingAgent):
headers.removeHeader('Proxy-Authorization')
headers.removeHeader(b'Proxy-Authorization')
bodyproducer = _RequestBodyProducer(request.body) if request.body else None
start_time = time()
d = agent.request(method, url, headers, bodyproducer)
d = agent.request(
method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
# set download latency
d.addCallback(self._cb_latency, request, start_time)
# response body is ready to be consumed
@ -232,7 +239,7 @@ class ScrapyAgent(object):
def _cb_bodyready(self, txresponse, request):
# deliverBody hangs for responses without body
if txresponse.length == 0:
return txresponse, '', None
return txresponse, b'', None
maxsize = request.meta.get('download_maxsize', self._maxsize)
warnsize = request.meta.get('download_warnsize', self._warnsize)
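download_maxsize and download_warnsize fall back to the handler-level settings but can be overridden per request through meta; a usage sketch (URL and limits are illustrative):

from scrapy import Request

req = Request(
    'http://example.com/large-file',
    meta={
        'download_maxsize': 10 * 1024 * 1024,  # cancel the download above 10 MiB
        'download_warnsize': 1024 * 1024,      # only log a warning above 1 MiB
    },
)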
@ -268,8 +275,8 @@ class ScrapyAgent(object):
return respcls(url=url, status=status, headers=headers, body=body, flags=flags)
@implementer(IBodyProducer)
class _RequestBodyProducer(object):
implements(IBodyProducer)
def __init__(self, body):
self.body = body
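The switch from the class-advice implements() call to the @implementer class decorator is one of the recurring Python 3 fixes in this patch: zope.interface's implements() relies on Python 2-only frame magic and raises TypeError under Python 3. A minimal sketch with a toy interface (not from the patch):

from zope.interface import Interface, implementer

class IGreeter(Interface):
    def greet():
        """Return a greeting."""

@implementer(IGreeter)          # works on Python 2 and 3
class Greeter(object):
    def greet(self):
        return 'hello'

# Python 2-only spelling, rejected at class-creation time on Python 3:
#     class Greeter(object):
#         implements(IGreeter)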

View File

@ -134,12 +134,12 @@ class Echo(LeafResource):
class Partial(LeafResource):
def render_GET(self, request):
request.setHeader("Content-Length", "1024")
request.setHeader(b"Content-Length", b"1024")
self.deferRequest(request, 0, self._delayedRender, request)
return NOT_DONE_YET
def _delayedRender(self, request):
request.write("partial content\n")
request.write(b"partial content\n")
request.finish()
@ -147,7 +147,7 @@ class Drop(Partial):
def _delayedRender(self, request):
abort = getarg(request, "abort", 0, type=int)
request.write("this connection will be dropped\n")
request.write(b"this connection will be dropped\n")
tr = request.channel.transport
try:
if abort and hasattr(tr, 'abortConnection'):
@ -162,26 +162,26 @@ class Root(Resource):
def __init__(self):
Resource.__init__(self)
self.putChild("status", Status())
self.putChild("follow", Follow())
self.putChild("delay", Delay())
self.putChild("partial", Partial())
self.putChild("drop", Drop())
self.putChild("raw", Raw())
self.putChild("echo", Echo())
self.putChild(b"status", Status())
self.putChild(b"follow", Follow())
self.putChild(b"delay", Delay())
self.putChild(b"partial", Partial())
self.putChild(b"drop", Drop())
self.putChild(b"raw", Raw())
self.putChild(b"echo", Echo())
if six.PY2 and twisted_version > (12, 3, 0):
if twisted_version > (12, 3, 0):
from twisted.web.test.test_webclient import PayloadResource
from twisted.web.server import GzipEncoderFactory
from twisted.web.resource import EncodingResourceWrapper
self.putChild('payload', PayloadResource())
self.putChild("xpayload", EncodingResourceWrapper(PayloadResource(), [GzipEncoderFactory()]))
self.putChild(b"payload", PayloadResource())
self.putChild(b"xpayload", EncodingResourceWrapper(PayloadResource(), [GzipEncoderFactory()]))
def getChild(self, name, request):
return self
def render(self, request):
return 'Scrapy mock HTTP server\n'
return b'Scrapy mock HTTP server\n'
class MockServer():
@ -199,14 +199,18 @@ class MockServer():
time.sleep(0.2)
def ssl_context_factory():
return ssl.DefaultOpenSSLContextFactory(
os.path.join(os.path.dirname(__file__), 'keys/cert.pem'),
os.path.join(os.path.dirname(__file__), 'keys/cert.pem'),
)
if __name__ == "__main__":
root = Root()
factory = Site(root)
httpPort = reactor.listenTCP(8998, factory)
contextFactory = ssl.DefaultOpenSSLContextFactory(
os.path.join(os.path.dirname(__file__), 'keys/cert.pem'),
os.path.join(os.path.dirname(__file__), 'keys/cert.pem'),
)
contextFactory = ssl_context_factory()
httpsPort = reactor.listenSSL(8999, factory, contextFactory)
def print_listening():

View File

@ -4,7 +4,6 @@ tests/test_command_shell.py
tests/test_exporters.py
tests/test_linkextractors_deprecated.py
tests/test_crawl.py
tests/test_downloader_handlers.py
tests/test_downloadermiddleware_httpcache.py
tests/test_downloadermiddleware_httpcompression.py
tests/test_downloadermiddleware_httpproxy.py
@ -25,8 +24,6 @@ scrapy/xlib/tx/client.py
scrapy/xlib/tx/_newclient.py
scrapy/xlib/tx/__init__.py
scrapy/core/downloader/handlers/s3.py
scrapy/core/downloader/handlers/http11.py
scrapy/core/downloader/handlers/http.py
scrapy/core/downloader/handlers/ftp.py
scrapy/pipelines/images.py
scrapy/pipelines/files.py

View File

@ -1,5 +1,4 @@
import os
import twisted
import six
from twisted.trial import unittest
@ -10,9 +9,7 @@ from twisted.web import server, static, util, resource
from twisted.web.test.test_webclient import ForeverTakingResource, \
NoLengthResource, HostHeaderResource, \
PayloadResource, BrokenDownloadResource
from twisted.protocols.ftp import FTPRealm, FTPFactory
from twisted.cred import portal, checkers, credentials
from twisted.protocols.ftp import FTPClient, ConnectionLost
from w3lib.url import path_to_file_uri
from scrapy import twisted_version
@ -22,15 +19,15 @@ from scrapy.core.downloader.handlers.http import HTTPDownloadHandler, HttpDownlo
from scrapy.core.downloader.handlers.http10 import HTTP10DownloadHandler
from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler
from scrapy.core.downloader.handlers.s3 import S3DownloadHandler
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler
from scrapy.spiders import Spider
from scrapy.http import Request
from scrapy.settings import Settings
from scrapy.utils.test import get_crawler
from scrapy.utils.python import to_bytes
from scrapy.exceptions import NotConfigured
from tests.mockserver import MockServer
from tests.mockserver import MockServer, ssl_context_factory
from tests.spiders import SingleRequestSpider
class DummyDH(object):
@ -91,7 +88,7 @@ class FileTestCase(unittest.TestCase):
def _test(response):
self.assertEquals(response.url, request.url)
self.assertEquals(response.status, 200)
self.assertEquals(response.body, '0123456789')
self.assertEquals(response.body, b'0123456789')
request = Request(path_to_file_uri(self.tmpname + '^'))
assert request.url.upper().endswith('%5E')
@ -105,23 +102,29 @@ class FileTestCase(unittest.TestCase):
class HttpTestCase(unittest.TestCase):
scheme = 'http'
download_handler_cls = HTTPDownloadHandler
def setUp(self):
name = self.mktemp()
os.mkdir(name)
FilePath(name).child("file").setContent("0123456789")
FilePath(name).child("file").setContent(b"0123456789")
r = static.File(name)
r.putChild("redirect", util.Redirect("/file"))
r.putChild("wait", ForeverTakingResource())
r.putChild("hang-after-headers", ForeverTakingResource(write=True))
r.putChild("nolength", NoLengthResource())
r.putChild("host", HostHeaderResource())
r.putChild("payload", PayloadResource())
r.putChild("broken", BrokenDownloadResource())
r.putChild(b"redirect", util.Redirect(b"/file"))
r.putChild(b"wait", ForeverTakingResource())
r.putChild(b"hang-after-headers", ForeverTakingResource(write=True))
r.putChild(b"nolength", NoLengthResource())
r.putChild(b"host", HostHeaderResource())
r.putChild(b"payload", PayloadResource())
r.putChild(b"broken", BrokenDownloadResource())
self.site = server.Site(r, timeout=None)
self.wrapper = WrappingFactory(self.site)
self.port = reactor.listenTCP(0, self.wrapper, interface='127.0.0.1')
self.host = 'localhost'
if self.scheme == 'https':
self.port = reactor.listenSSL(
0, self.wrapper, ssl_context_factory(), interface=self.host)
else:
self.port = reactor.listenTCP(0, self.wrapper, interface=self.host)
self.portno = self.port.getHost().port
self.download_handler = self.download_handler_cls(Settings())
self.download_request = self.download_handler.download_request
@ -133,20 +136,20 @@ class HttpTestCase(unittest.TestCase):
yield self.download_handler.close()
def getURL(self, path):
return "http://127.0.0.1:%d/%s" % (self.portno, path)
return "%s://%s:%d/%s" % (self.scheme, self.host, self.portno, path)
def test_download(self):
request = Request(self.getURL('file'))
d = self.download_request(request, Spider('foo'))
d.addCallback(lambda r: r.body)
d.addCallback(self.assertEquals, "0123456789")
d.addCallback(self.assertEquals, b"0123456789")
return d
def test_download_head(self):
request = Request(self.getURL('file'), method='HEAD')
d = self.download_request(request, Spider('foo'))
d.addCallback(lambda r: r.body)
d.addCallback(self.assertEquals, '')
d.addCallback(self.assertEquals, b'')
return d
def test_redirect_status(self):
@ -165,6 +168,9 @@ class HttpTestCase(unittest.TestCase):
@defer.inlineCallbacks
def test_timeout_download_from_spider(self):
if self.scheme == 'https':
raise unittest.SkipTest(
'test_timeout_download_from_spider skipped under https')
spider = Spider('foo')
meta = {'download_timeout': 0.2}
# client connects but no data is received
@ -178,7 +184,8 @@ class HttpTestCase(unittest.TestCase):
def test_host_header_not_in_request_headers(self):
def _test(response):
self.assertEquals(response.body, '127.0.0.1:%d' % self.portno)
self.assertEquals(
response.body, to_bytes('%s:%d' % (self.host, self.portno)))
self.assertEquals(request.headers, {})
request = Request(self.getURL('host'))
@ -186,19 +193,19 @@ class HttpTestCase(unittest.TestCase):
def test_host_header_seted_in_request_headers(self):
def _test(response):
self.assertEquals(response.body, 'example.com')
self.assertEquals(request.headers.get('Host'), 'example.com')
self.assertEquals(response.body, b'example.com')
self.assertEquals(request.headers.get('Host'), b'example.com')
request = Request(self.getURL('host'), headers={'Host': 'example.com'})
return self.download_request(request, Spider('foo')).addCallback(_test)
d = self.download_request(request, Spider('foo'))
d.addCallback(lambda r: r.body)
d.addCallback(self.assertEquals, 'example.com')
d.addCallback(self.assertEquals, b'example.com')
return d
def test_payload(self):
body = '1'*100 # PayloadResource requires body length to be 100
body = b'1'*100 # PayloadResource requires body length to be 100
request = Request(self.getURL('payload'), method='POST', body=body)
d = self.download_request(request, Spider('foo'))
d.addCallback(lambda r: r.body)
@ -216,6 +223,10 @@ class Http10TestCase(HttpTestCase):
download_handler_cls = HTTP10DownloadHandler
class Https10TestCase(Http10TestCase):
scheme = 'https'
class Http11TestCase(HttpTestCase):
"""HTTP 1.1 test case"""
download_handler_cls = HTTP11DownloadHandler
@ -226,7 +237,7 @@ class Http11TestCase(HttpTestCase):
request = Request(self.getURL('file'))
d = self.download_request(request, Spider('foo'))
d.addCallback(lambda r: r.body)
d.addCallback(self.assertEquals, "0123456789")
d.addCallback(self.assertEquals, b"0123456789")
return d
@defer.inlineCallbacks
@ -237,7 +248,7 @@ class Http11TestCase(HttpTestCase):
# response body. (regardless of headers)
d = self.download_request(request, Spider('foo', download_maxsize=10))
d.addCallback(lambda r: r.body)
d.addCallback(self.assertEquals, "0123456789")
d.addCallback(self.assertEquals, b"0123456789")
yield d
d = self.download_request(request, Spider('foo', download_maxsize=9))
@ -260,10 +271,14 @@ class Http11TestCase(HttpTestCase):
request = Request(self.getURL('file'))
d = self.download_request(request, Spider('foo', download_maxsize=100))
d.addCallback(lambda r: r.body)
d.addCallback(self.assertEquals, "0123456789")
d.addCallback(self.assertEquals, b"0123456789")
return d
class Https11TestCase(Http11TestCase):
scheme = 'https'
class Http11MockServerTestCase(unittest.TestCase):
"""HTTP 1.1 test case with MockServer"""
if twisted_version < (11, 1, 0):
@ -297,27 +312,30 @@ class Http11MockServerTestCase(unittest.TestCase):
@defer.inlineCallbacks
def test_download_gzip_response(self):
if six.PY2 and twisted_version > (12, 3, 0):
if twisted_version > (12, 3, 0):
crawler = get_crawler(SingleRequestSpider)
body = '1'*100 # PayloadResource requires body length to be 100
body = b'1'*100 # PayloadResource requires body length to be 100
request = Request('http://localhost:8998/payload', method='POST', body=body, meta={'download_maxsize': 50})
yield crawler.crawl(seed=request)
failure = crawler.spider.meta['failure']
# download_maxsize < 100, hence the CancelledError
self.assertIsInstance(failure.value, defer.CancelledError)
request.headers.setdefault('Accept-Encoding', 'gzip,deflate')
request = request.replace(url='http://localhost:8998/xpayload')
yield crawler.crawl(seed=request)
# download_maxsize = 50 is enough for the gzipped response
failure = crawler.spider.meta.get('failure')
self.assertTrue(failure == None)
reason = crawler.spider.meta['close_reason']
self.assertTrue(reason, 'finished')
if six.PY2:
request.headers.setdefault(b'Accept-Encoding', b'gzip,deflate')
request = request.replace(url='http://localhost:8998/xpayload')
yield crawler.crawl(seed=request)
# download_maxsize = 50 is enough for the gzipped response
failure = crawler.spider.meta.get('failure')
self.assertTrue(failure == None)
reason = crawler.spider.meta['close_reason']
self.assertTrue(reason, 'finished')
else:
# See issue https://twistedmatrix.com/trac/ticket/8175
raise unittest.SkipTest("xpayload only enabled for PY2")
else:
raise unittest.SkipTest("xpayload and payload endpoint only enabled for twisted > 12.3.0 and python 2.x")
raise unittest.SkipTest("xpayload and payload endpoint only enabled for twisted > 12.3.0")
class UriResource(resource.Resource):
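The gzip assertion above works because the 100-byte payload compresses far below the 50-byte download_maxsize; a quick standalone check (Python 3, not part of the test suite):

import gzip

body = b'1' * 100                 # same payload the test posts
compressed = gzip.compress(body)

# A run of identical bytes deflates to a handful of bytes; even with the
# ~18-byte gzip header/trailer the whole body stays well under 50 bytes.
assert len(compressed) < 50 < len(body)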
@ -354,7 +372,7 @@ class HttpProxyTestCase(unittest.TestCase):
def _test(response):
self.assertEquals(response.status, 200)
self.assertEquals(response.url, request.url)
self.assertEquals(response.body, 'http://example.com')
self.assertEquals(response.body, b'http://example.com')
http_proxy = self.getURL('')
request = Request('http://example.com', meta={'proxy': http_proxy})
@ -364,7 +382,7 @@ class HttpProxyTestCase(unittest.TestCase):
def _test(response):
self.assertEquals(response.status, 200)
self.assertEquals(response.url, request.url)
self.assertEquals(response.body, 'https://example.com')
self.assertEquals(response.body, b'https://example.com')
http_proxy = '%s?noconnect' % self.getURL('')
request = Request('https://example.com', meta={'proxy': http_proxy})
@ -374,7 +392,7 @@ class HttpProxyTestCase(unittest.TestCase):
def _test(response):
self.assertEquals(response.status, 200)
self.assertEquals(response.url, request.url)
self.assertEquals(response.body, '/path/to/resource')
self.assertEquals(response.body, b'/path/to/resource')
request = Request(self.getURL('path/to/resource'))
return self.download_request(request, Spider('foo')).addCallback(_test)
@ -394,6 +412,17 @@ class Http11ProxyTestCase(HttpProxyTestCase):
if twisted_version < (11, 1, 0):
skip = 'HTTP1.1 not supported in twisted < 11.1.0'
@defer.inlineCallbacks
def test_download_with_proxy_https_timeout(self):
""" Test TunnelingTCP4ClientEndpoint """
http_proxy = self.getURL('')
domain = 'https://no-such-domain.nosuch'
request = Request(
domain, meta={'proxy': http_proxy, 'download_timeout': 0.2})
d = self.download_request(request, Spider('foo'))
timeout = yield self.assertFailure(d, error.TimeoutError)
self.assertIn(domain, timeout.osError)
class HttpDownloadHandlerMock(object):
def __init__(self, settings):
@ -518,8 +547,13 @@ class FTPTestCase(unittest.TestCase):
if twisted_version < (10, 2, 0):
skip = "Twisted pre 10.2.0 doesn't allow to set home path other than /home"
if six.PY3:
skip = "Twisted missing ftp support for PY3"
def setUp(self):
from twisted.protocols.ftp import FTPRealm, FTPFactory
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler
# setup dirs and test file
self.directory = self.mktemp()
os.mkdir(self.directory)
@ -601,6 +635,8 @@ class FTPTestCase(unittest.TestCase):
return self._add_test_callbacks(d, _test)
def test_invalid_credentials(self):
from twisted.protocols.ftp import ConnectionLost
request = Request(url="ftp://127.0.0.1:%s/file.txt" % self.portNum,
meta={"ftp_user": self.username, "ftp_password": 'invalid'})
d = self.download_handler.download_request(request, None)

View File

@ -17,6 +17,8 @@ class BaseResponseTest(unittest.TestCase):
# Response requires url in the constructor
self.assertRaises(Exception, self.response_class)
self.assertTrue(isinstance(self.response_class('http://example.com/'), self.response_class))
if not six.PY2:
self.assertRaises(TypeError, self.response_class, b"http://example.com")
# body can be str or None
self.assertTrue(isinstance(self.response_class('http://example.com/', body=b''), self.response_class))
self.assertTrue(isinstance(self.response_class('http://example.com/', body=b'body'), self.response_class))