Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-06 11:00:46 +00:00)
Merge pull request #6634 from wRAR/deprecate-http10

Deprecate HTTP/1.0 code.

Commit: a8d9746f56
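For context, a minimal sketch (not part of the diff) of how the new deprecation surfaces, assuming Scrapy with this change applied; the constructor arguments follow the signature shown in the http10.py hunk below, and get_crawler comes from Scrapy's test utilities:

import warnings

from scrapy.core.downloader.handlers.http10 import HTTP10DownloadHandler
from scrapy.utils.test import get_crawler

crawler = get_crawler()
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Building the handler now emits ScrapyDeprecationWarning (see diff below)
    HTTP10DownloadHandler(crawler.settings, crawler)
print(caught[0].message)  # "HTTP10DownloadHandler is deprecated ..."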
@@ -22,6 +22,7 @@ from scrapy.core.downloader.tls import (
    openssl_methods,
)
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.deprecate import method_is_overridden
from scrapy.utils.misc import build_from_crawler, load_object

if TYPE_CHECKING:
@@ -62,6 +63,13 @@ class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
            self.tls_ciphers = AcceptableCiphers.fromOpenSSLCipherString(tls_ciphers)
        else:
            self.tls_ciphers = DEFAULT_CIPHERS
        if method_is_overridden(type(self), ScrapyClientContextFactory, "getContext"):
            warnings.warn(
                "Overriding ScrapyClientContextFactory.getContext() is deprecated and that method"
                " will be removed in a future Scrapy version. Override creatorForNetloc() instead.",
                category=ScrapyDeprecationWarning,
                stacklevel=2,
            )

    @classmethod
    def from_settings(
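The warning above names the replacement API: override creatorForNetloc() rather than getContext(). A minimal migration sketch (not part of the diff; the subclass name and pass-through body are illustrative):

from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory


class CustomContextFactory(ScrapyClientContextFactory):
    # Hook per-host connection creation here instead of overriding the
    # deprecated getContext(); signature follows Twisted's
    # BrowserLikePolicyForHTTPS interface.
    def creatorForNetloc(self, hostname, port):
        # e.g. inspect hostname/port before delegating (illustrative)
        return super().creatorForNetloc(hostname, port)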
@@ -2,8 +2,10 @@

from __future__ import annotations

import warnings
from typing import TYPE_CHECKING

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.misc import build_from_crawler, load_object
from scrapy.utils.python import to_unicode
@@ -26,6 +28,11 @@ class HTTP10DownloadHandler:
    lazy = False

    def __init__(self, settings: BaseSettings, crawler: Crawler):
        warnings.warn(
            "HTTP10DownloadHandler is deprecated and will be removed in a future Scrapy version.",
            category=ScrapyDeprecationWarning,
            stacklevel=2,
        )
        self.HTTPClientFactory: type[ScrapyHTTPClientFactory] = load_object(
            settings["DOWNLOADER_HTTPCLIENTFACTORY"]
        )
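The deprecated handler is only used when a project opts into it explicitly. A hedged sketch of such an opt-in (illustrative settings module; DOWNLOAD_HANDLERS is the standard Scrapy setting for this), which now triggers the warning above at handler creation:

# settings.py (illustrative) -- a project still forcing the HTTP/1.0 handler
DOWNLOAD_HANDLERS = {
    "http": "scrapy.core.downloader.handlers.http10.HTTP10DownloadHandler",
    "https": "scrapy.core.downloader.handlers.http10.HTTP10DownloadHandler",
}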
@@ -1,6 +1,9 @@
"""Deprecated HTTP/1.0 helper classes used by HTTP10DownloadHandler."""

from __future__ import annotations

import re
import warnings
from time import time
from typing import TYPE_CHECKING
from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse
@@ -9,6 +12,7 @@ from twisted.internet import defer
from twisted.internet.protocol import ClientFactory
from twisted.web.http import HTTPClient

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Headers, Response
from scrapy.responsetypes import responsetypes
from scrapy.utils.httpobj import urlparse_cached
@@ -49,6 +53,14 @@ def _parse(url: str) -> tuple[bytes, bytes, bytes, int, bytes]:
class ScrapyHTTPPageGetter(HTTPClient):
    delimiter = b"\n"

    def __init__(self):
        warnings.warn(
            "ScrapyHTTPPageGetter is deprecated and will be removed in a future Scrapy version.",
            category=ScrapyDeprecationWarning,
            stacklevel=2,
        )
        super().__init__()

    def connectionMade(self):
        self.headers = Headers()  # bucket for response headers
@@ -140,6 +152,12 @@ class ScrapyHTTPClientFactory(ClientFactory):
        self.path = self.url

    def __init__(self, request: Request, timeout: float = 180):
        warnings.warn(
            "ScrapyHTTPClientFactory is deprecated and will be removed in a future Scrapy version.",
            category=ScrapyDeprecationWarning,
            stacklevel=2,
        )

        self._url: str = urldefrag(request.url)[0]
        # converting to bytes to comply with the Twisted interface
        self.url: bytes = to_bytes(self._url, encoding="ascii")
@@ -1,6 +1,33 @@
from __future__ import annotations

import shutil
import warnings
from pathlib import Path
from tempfile import mkdtemp
from typing import Any

import OpenSSL.SSL
import pytest
from twisted.internet import reactor
from twisted.internet.defer import Deferred, inlineCallbacks
from twisted.protocols.policies import WrappingFactory
from twisted.trial import unittest
from twisted.web import server, static
from twisted.web.client import Agent, BrowserLikePolicyForHTTPS, readBody
from twisted.web.client import Response as TxResponse

from scrapy.core.downloader import Slot
from scrapy.core.downloader.contextfactory import (
    ScrapyClientContextFactory,
    load_context_factory_from_settings,
)
from scrapy.core.downloader.handlers.http11 import _RequestBodyProducer
from scrapy.settings import Settings
from scrapy.utils.defer import deferred_f_from_coro_f, maybe_deferred_to_future
from scrapy.utils.misc import build_from_crawler
from scrapy.utils.python import to_bytes
from scrapy.utils.test import get_crawler
from tests.mockserver import PayloadResource, ssl_context_factory


class SlotTest(unittest.TestCase):
@@ -10,3 +37,136 @@ class SlotTest(unittest.TestCase):
            repr(slot),
            "Slot(concurrency=8, delay=0.10, randomize_delay=True)",
        )


class ContextFactoryBaseTestCase(unittest.TestCase):
    context_factory = None

    def _listen(self, site):
        return reactor.listenSSL(
            0,
            site,
            contextFactory=self.context_factory or ssl_context_factory(),
            interface="127.0.0.1",
        )

    def getURL(self, path):
        return f"https://127.0.0.1:{self.portno}/{path}"

    def setUp(self):
        self.tmpname = Path(mkdtemp())
        (self.tmpname / "file").write_bytes(b"0123456789")
        r = static.File(str(self.tmpname))
        r.putChild(b"payload", PayloadResource())
        self.site = server.Site(r, timeout=None)
        self.wrapper = WrappingFactory(self.site)
        self.port = self._listen(self.wrapper)
        self.portno = self.port.getHost().port

    @inlineCallbacks
    def tearDown(self):
        yield self.port.stopListening()
        shutil.rmtree(self.tmpname)

    @staticmethod
    async def get_page(
        url: str,
        client_context_factory: BrowserLikePolicyForHTTPS,
        body: str | None = None,
    ) -> bytes:
        agent = Agent(reactor, contextFactory=client_context_factory)
        body_producer = _RequestBodyProducer(body.encode()) if body else None
        response: TxResponse = await maybe_deferred_to_future(
            agent.request(b"GET", url.encode(), bodyProducer=body_producer)
        )
        with warnings.catch_warnings():
            # https://github.com/twisted/twisted/issues/8227
            warnings.filterwarnings(
                "ignore",
                category=DeprecationWarning,
                message=r".*does not have an abortConnection method",
            )
            d: Deferred[bytes] = readBody(response)  # type: ignore[arg-type]
        return await maybe_deferred_to_future(d)


class ContextFactoryTestCase(ContextFactoryBaseTestCase):
    @deferred_f_from_coro_f
    async def testPayload(self):
        s = "0123456789" * 10
        crawler = get_crawler()
        settings = Settings()
        client_context_factory = load_context_factory_from_settings(settings, crawler)
        body = await self.get_page(
            self.getURL("payload"), client_context_factory, body=s
        )
        self.assertEqual(body, to_bytes(s))

    def test_override_getContext(self):
        class MyFactory(ScrapyClientContextFactory):
            def getContext(
                self, hostname: Any = None, port: Any = None
            ) -> OpenSSL.SSL.Context:
                ctx: OpenSSL.SSL.Context = super().getContext(hostname, port)
                return ctx

        with warnings.catch_warnings(record=True) as w:
            MyFactory()
            self.assertEqual(len(w), 1)
            self.assertIn(
                "Overriding ScrapyClientContextFactory.getContext() is deprecated",
                str(w[0].message),
            )


class ContextFactoryTLSMethodTestCase(ContextFactoryBaseTestCase):
    async def _assert_factory_works(
        self, client_context_factory: ScrapyClientContextFactory
    ) -> None:
        s = "0123456789" * 10
        body = await self.get_page(
            self.getURL("payload"), client_context_factory, body=s
        )
        self.assertEqual(body, to_bytes(s))

    @deferred_f_from_coro_f
    async def test_setting_default(self):
        crawler = get_crawler()
        settings = Settings()
        client_context_factory = load_context_factory_from_settings(settings, crawler)
        assert client_context_factory._ssl_method == OpenSSL.SSL.SSLv23_METHOD
        await self._assert_factory_works(client_context_factory)

    def test_setting_none(self):
        crawler = get_crawler()
        settings = Settings({"DOWNLOADER_CLIENT_TLS_METHOD": None})
        with pytest.raises(KeyError):
            load_context_factory_from_settings(settings, crawler)

    def test_setting_bad(self):
        crawler = get_crawler()
        settings = Settings({"DOWNLOADER_CLIENT_TLS_METHOD": "bad"})
        with pytest.raises(KeyError):
            load_context_factory_from_settings(settings, crawler)

    @deferred_f_from_coro_f
    async def test_setting_explicit(self):
        crawler = get_crawler()
        settings = Settings({"DOWNLOADER_CLIENT_TLS_METHOD": "TLSv1.2"})
        client_context_factory = load_context_factory_from_settings(settings, crawler)
        assert client_context_factory._ssl_method == OpenSSL.SSL.TLSv1_2_METHOD
        await self._assert_factory_works(client_context_factory)

    @deferred_f_from_coro_f
    async def test_direct_from_crawler(self):
        # the setting is ignored
        crawler = get_crawler(settings_dict={"DOWNLOADER_CLIENT_TLS_METHOD": "bad"})
        client_context_factory = build_from_crawler(ScrapyClientContextFactory, crawler)
        assert client_context_factory._ssl_method == OpenSSL.SSL.SSLv23_METHOD
        await self._assert_factory_works(client_context_factory)

    @deferred_f_from_coro_f
    async def test_direct_init(self):
        client_context_factory = ScrapyClientContextFactory(OpenSSL.SSL.TLSv1_2_METHOD)
        assert client_context_factory._ssl_method == OpenSSL.SSL.TLSv1_2_METHOD
        await self._assert_factory_works(client_context_factory)
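As an aside, the same check that test_override_getContext performs with warnings.catch_warnings(record=True) could be written with pytest's own warning helper; a hedged sketch (the test name is illustrative, not from the diff):

import pytest

from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
from scrapy.exceptions import ScrapyDeprecationWarning


def test_override_getContext_warns():
    class MyFactory(ScrapyClientContextFactory):
        def getContext(self, hostname=None, port=None):
            return super().getContext(hostname, port)

    # pytest.warns fails the test if no matching warning is raised
    with pytest.warns(ScrapyDeprecationWarning, match="getContext"):
        MyFactory()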
@@ -422,6 +422,7 @@ class HttpTestCase(unittest.TestCase):
        return self.download_request(request, Spider("foo")).addCallback(_test)


@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
class Http10TestCase(HttpTestCase):
    """HTTP 1.0 test case"""
@@ -780,6 +781,7 @@ class HttpProxyTestCase(unittest.TestCase):
        return self.download_request(request, Spider("foo")).addCallback(_test)


@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
class Http10ProxyTestCase(HttpProxyTestCase):
    download_handler_cls: type = HTTP10DownloadHandler
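The @pytest.mark.filterwarnings marker applied per class above can also be applied module-wide; a hedged sketch using pytest's standard pytestmark convention:

import pytest

# Module-level equivalent of the per-class marker used above (illustrative):
# silences Scrapy deprecation warnings for every test in the module.
pytestmark = pytest.mark.filterwarnings(
    "ignore::scrapy.exceptions.ScrapyDeprecationWarning"
)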
@@ -8,12 +8,11 @@ from __future__ import annotations
import shutil
from pathlib import Path
from tempfile import mkdtemp
from typing import Any

import OpenSSL.SSL
from pytest import raises
import pytest
from twisted.internet import defer, reactor
from twisted.internet.defer import Deferred, inlineCallbacks
from twisted.internet.defer import inlineCallbacks
from twisted.internet.testing import StringTransport
from twisted.protocols.policies import WrappingFactory
from twisted.trial import unittest
@@ -22,10 +21,8 @@ from twisted.web import resource, server, static, util
from scrapy.core.downloader import webclient as client
from scrapy.core.downloader.contextfactory import (
    ScrapyClientContextFactory,
    load_context_factory_from_settings,
)
from scrapy.http import Headers, Request
from scrapy.settings import Settings
from scrapy.utils.misc import build_from_crawler
from scrapy.utils.python import to_bytes, to_unicode
from scrapy.utils.test import get_crawler
@@ -38,6 +35,7 @@ from tests.mockserver import (
    PayloadResource,
    ssl_context_factory,
)
from tests.test_core_downloader import ContextFactoryBaseTestCase


def getPage(url, contextFactory=None, response_transform=None, *args, **kwargs):
@@ -129,6 +127,7 @@ class ParseUrlTestCase(unittest.TestCase):
        self.assertEqual(client._parse(url), test, url)


@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
class ScrapyHTTPPageGetterTests(unittest.TestCase):
    def test_earlyHeaders(self):
        # basic test stolen from Twisted's HTTPPageGetter
@@ -272,6 +271,7 @@ class EncodingResource(resource.Resource):
        return body.encode(self.out_encoding)


@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
class WebClientTestCase(unittest.TestCase):
    def _listen(self, site):
        return reactor.listenTCP(0, site, interface="127.0.0.1")
@@ -427,35 +427,8 @@ class WebClientTestCase(unittest.TestCase):
        )


class WebClientSSLTestCase(unittest.TestCase):
    context_factory = None

    def _listen(self, site):
        return reactor.listenSSL(
            0,
            site,
            contextFactory=self.context_factory or ssl_context_factory(),
            interface="127.0.0.1",
        )

    def getURL(self, path):
        return f"https://127.0.0.1:{self.portno}/{path}"

    def setUp(self):
        self.tmpname = Path(mkdtemp())
        (self.tmpname / "file").write_bytes(b"0123456789")
        r = static.File(str(self.tmpname))
        r.putChild(b"payload", PayloadResource())
        self.site = server.Site(r, timeout=None)
        self.wrapper = WrappingFactory(self.site)
        self.port = self._listen(self.wrapper)
        self.portno = self.port.getHost().port

    @inlineCallbacks
    def tearDown(self):
        yield self.port.stopListening()
        shutil.rmtree(self.tmpname)


@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
class WebClientSSLTestCase(ContextFactoryBaseTestCase):
    def testPayload(self):
        s = "0123456789" * 10
        return getPage(self.getURL("payload"), body=s).addCallback(
@@ -490,51 +463,3 @@ class WebClientCustomCiphersSSLTestCase(WebClientSSLTestCase):
            self.getURL("payload"), body=s, contextFactory=client_context_factory
        )
        return self.assertFailure(d, OpenSSL.SSL.Error)


class WebClientTLSMethodTestCase(WebClientSSLTestCase):
    def _assert_factory_works(
        self, client_context_factory: ScrapyClientContextFactory
    ) -> Deferred[Any]:
        s = "0123456789" * 10
        return getPage(
            self.getURL("payload"), body=s, contextFactory=client_context_factory
        ).addCallback(self.assertEqual, to_bytes(s))

    def test_setting_default(self):
        crawler = get_crawler()
        settings = Settings()
        client_context_factory = load_context_factory_from_settings(settings, crawler)
        assert client_context_factory._ssl_method == OpenSSL.SSL.SSLv23_METHOD
        return self._assert_factory_works(client_context_factory)

    def test_setting_none(self):
        crawler = get_crawler()
        settings = Settings({"DOWNLOADER_CLIENT_TLS_METHOD": None})
        with raises(KeyError):
            load_context_factory_from_settings(settings, crawler)

    def test_setting_bad(self):
        crawler = get_crawler()
        settings = Settings({"DOWNLOADER_CLIENT_TLS_METHOD": "bad"})
        with raises(KeyError):
            load_context_factory_from_settings(settings, crawler)

    def test_setting_explicit(self):
        crawler = get_crawler()
        settings = Settings({"DOWNLOADER_CLIENT_TLS_METHOD": "TLSv1.2"})
        client_context_factory = load_context_factory_from_settings(settings, crawler)
        assert client_context_factory._ssl_method == OpenSSL.SSL.TLSv1_2_METHOD
        return self._assert_factory_works(client_context_factory)

    def test_direct_from_crawler(self):
        # the setting is ignored
        crawler = get_crawler(settings_dict={"DOWNLOADER_CLIENT_TLS_METHOD": "bad"})
        client_context_factory = build_from_crawler(ScrapyClientContextFactory, crawler)
        assert client_context_factory._ssl_method == OpenSSL.SSL.SSLv23_METHOD
        return self._assert_factory_works(client_context_factory)

    def test_direct_init(self):
        client_context_factory = ScrapyClientContextFactory(OpenSSL.SSL.TLSv1_2_METHOD)
        assert client_context_factory._ssl_method == OpenSSL.SSL.TLSv1_2_METHOD
        return self._assert_factory_works(client_context_factory)