1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-06 11:00:46 +00:00

Merge pull request #6634 from wRAR/deprecate-http10

Deprecate HTTP/1.0 code.
This commit is contained in:
Andrey Rakhmatullin 2025-01-28 13:29:25 +04:00 committed by GitHub
commit a8d9746f56
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 202 additions and 82 deletions

View File

@ -22,6 +22,7 @@ from scrapy.core.downloader.tls import (
openssl_methods,
)
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.deprecate import method_is_overridden
from scrapy.utils.misc import build_from_crawler, load_object
if TYPE_CHECKING:
@ -62,6 +63,13 @@ class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
self.tls_ciphers = AcceptableCiphers.fromOpenSSLCipherString(tls_ciphers)
else:
self.tls_ciphers = DEFAULT_CIPHERS
if method_is_overridden(type(self), ScrapyClientContextFactory, "getContext"):
warnings.warn(
"Overriding ScrapyClientContextFactory.getContext() is deprecated and that method"
" will be removed in a future Scrapy version. Override creatorForNetloc() instead.",
category=ScrapyDeprecationWarning,
stacklevel=2,
)
@classmethod
def from_settings(

View File

@ -2,8 +2,10 @@
from __future__ import annotations
import warnings
from typing import TYPE_CHECKING
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.misc import build_from_crawler, load_object
from scrapy.utils.python import to_unicode
@ -26,6 +28,11 @@ class HTTP10DownloadHandler:
lazy = False
def __init__(self, settings: BaseSettings, crawler: Crawler):
warnings.warn(
"HTTP10DownloadHandler is deprecated and will be removed in a future Scrapy version.",
category=ScrapyDeprecationWarning,
stacklevel=2,
)
self.HTTPClientFactory: type[ScrapyHTTPClientFactory] = load_object(
settings["DOWNLOADER_HTTPCLIENTFACTORY"]
)

View File

@ -1,6 +1,9 @@
"""Deprecated HTTP/1.0 helper classes used by HTTP10DownloadHandler."""
from __future__ import annotations
import re
import warnings
from time import time
from typing import TYPE_CHECKING
from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse
@ -9,6 +12,7 @@ from twisted.internet import defer
from twisted.internet.protocol import ClientFactory
from twisted.web.http import HTTPClient
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Headers, Response
from scrapy.responsetypes import responsetypes
from scrapy.utils.httpobj import urlparse_cached
@ -49,6 +53,14 @@ def _parse(url: str) -> tuple[bytes, bytes, bytes, int, bytes]:
class ScrapyHTTPPageGetter(HTTPClient):
delimiter = b"\n"
def __init__(self):
warnings.warn(
"ScrapyHTTPPageGetter is deprecated and will be removed in a future Scrapy version.",
category=ScrapyDeprecationWarning,
stacklevel=2,
)
super().__init__()
def connectionMade(self):
self.headers = Headers() # bucket for response headers
@ -140,6 +152,12 @@ class ScrapyHTTPClientFactory(ClientFactory):
self.path = self.url
def __init__(self, request: Request, timeout: float = 180):
warnings.warn(
"ScrapyHTTPClientFactory is deprecated and will be removed in a future Scrapy version.",
category=ScrapyDeprecationWarning,
stacklevel=2,
)
self._url: str = urldefrag(request.url)[0]
# converting to bytes to comply to Twisted interface
self.url: bytes = to_bytes(self._url, encoding="ascii")

View File

@ -1,6 +1,33 @@
from __future__ import annotations
import shutil
import warnings
from pathlib import Path
from tempfile import mkdtemp
from typing import Any
import OpenSSL.SSL
import pytest
from twisted.internet import reactor
from twisted.internet.defer import Deferred, inlineCallbacks
from twisted.protocols.policies import WrappingFactory
from twisted.trial import unittest
from twisted.web import server, static
from twisted.web.client import Agent, BrowserLikePolicyForHTTPS, readBody
from twisted.web.client import Response as TxResponse
from scrapy.core.downloader import Slot
from scrapy.core.downloader.contextfactory import (
ScrapyClientContextFactory,
load_context_factory_from_settings,
)
from scrapy.core.downloader.handlers.http11 import _RequestBodyProducer
from scrapy.settings import Settings
from scrapy.utils.defer import deferred_f_from_coro_f, maybe_deferred_to_future
from scrapy.utils.misc import build_from_crawler
from scrapy.utils.python import to_bytes
from scrapy.utils.test import get_crawler
from tests.mockserver import PayloadResource, ssl_context_factory
class SlotTest(unittest.TestCase):
@ -10,3 +37,136 @@ class SlotTest(unittest.TestCase):
repr(slot),
"Slot(concurrency=8, delay=0.10, randomize_delay=True)",
)
class ContextFactoryBaseTestCase(unittest.TestCase):
    """Base test case that serves a local HTTPS site for context-factory tests.

    ``setUp`` starts a Twisted SSL server on an OS-assigned local port serving
    a static file plus a ``PayloadResource``; ``tearDown`` stops it and removes
    the temporary directory.
    """

    # Optional server-side context factory; subclasses may override it.
    # When left as None, ssl_context_factory() from tests.mockserver is used.
    context_factory = None

    def _listen(self, site):
        # Listen over SSL on an OS-assigned port (0), bound to localhost only.
        return reactor.listenSSL(
            0,
            site,
            contextFactory=self.context_factory or ssl_context_factory(),
            interface="127.0.0.1",
        )

    def getURL(self, path):
        # Build an HTTPS URL pointing at the server started in setUp().
        return f"https://127.0.0.1:{self.portno}/{path}"

    def setUp(self):
        # Serve a temporary directory containing a small static file, plus a
        # "payload" resource (from tests.mockserver).
        self.tmpname = Path(mkdtemp())
        (self.tmpname / "file").write_bytes(b"0123456789")
        r = static.File(str(self.tmpname))
        r.putChild(b"payload", PayloadResource())
        self.site = server.Site(r, timeout=None)
        self.wrapper = WrappingFactory(self.site)
        self.port = self._listen(self.wrapper)
        self.portno = self.port.getHost().port

    @inlineCallbacks
    def tearDown(self):
        # Stop the listening port before deleting the served directory.
        yield self.port.stopListening()
        shutil.rmtree(self.tmpname)

    @staticmethod
    async def get_page(
        url: str,
        client_context_factory: BrowserLikePolicyForHTTPS,
        body: str | None = None,
    ) -> bytes:
        """Fetch *url* with a Twisted Agent using *client_context_factory*.

        When *body* is given, it is sent as the GET request body via
        ``_RequestBodyProducer``. Returns the response body bytes.
        """
        agent = Agent(reactor, contextFactory=client_context_factory)
        body_producer = _RequestBodyProducer(body.encode()) if body else None
        response: TxResponse = await maybe_deferred_to_future(
            agent.request(b"GET", url.encode(), bodyProducer=body_producer)
        )
        with warnings.catch_warnings():
            # https://github.com/twisted/twisted/issues/8227
            warnings.filterwarnings(
                "ignore",
                category=DeprecationWarning,
                message=r".*does not have an abortConnection method",
            )
            d: Deferred[bytes] = readBody(response)  # type: ignore[arg-type]
            return await maybe_deferred_to_future(d)
class ContextFactoryTestCase(ContextFactoryBaseTestCase):
    """Tests for the client TLS context factory loaded from settings."""

    @deferred_f_from_coro_f
    async def testPayload(self):
        """A factory built from default settings can fetch an HTTPS payload."""
        payload = "0123456789" * 10
        factory = load_context_factory_from_settings(Settings(), get_crawler())
        response_body = await self.get_page(
            self.getURL("payload"), factory, body=payload
        )
        self.assertEqual(response_body, to_bytes(payload))

    def test_override_getContext(self):
        """Subclassing with a getContext() override triggers a deprecation warning."""

        class OverridingFactory(ScrapyClientContextFactory):
            def getContext(
                self, hostname: Any = None, port: Any = None
            ) -> OpenSSL.SSL.Context:
                return super().getContext(hostname, port)

        with warnings.catch_warnings(record=True) as caught:
            OverridingFactory()
            self.assertEqual(len(caught), 1)
            self.assertIn(
                "Overriding ScrapyClientContextFactory.getContext() is deprecated",
                str(caught[0].message),
            )
class ContextFactoryTLSMethodTestCase(ContextFactoryBaseTestCase):
    """Tests for the DOWNLOADER_CLIENT_TLS_METHOD setting and the TLS-method
    constructor argument of ScrapyClientContextFactory."""

    async def _assert_factory_works(
        self, client_context_factory: ScrapyClientContextFactory
    ) -> None:
        # Round-trip a payload through the test HTTPS server to prove the
        # factory can actually complete a TLS handshake.
        s = "0123456789" * 10
        body = await self.get_page(
            self.getURL("payload"), client_context_factory, body=s
        )
        self.assertEqual(body, to_bytes(s))

    @deferred_f_from_coro_f
    async def test_setting_default(self):
        # Default settings select the SSLv23 (negotiate) method.
        crawler = get_crawler()
        settings = Settings()
        client_context_factory = load_context_factory_from_settings(settings, crawler)
        assert client_context_factory._ssl_method == OpenSSL.SSL.SSLv23_METHOD
        await self._assert_factory_works(client_context_factory)

    def test_setting_none(self):
        # None is not a valid TLS method name; loading must fail.
        crawler = get_crawler()
        settings = Settings({"DOWNLOADER_CLIENT_TLS_METHOD": None})
        with pytest.raises(KeyError):
            load_context_factory_from_settings(settings, crawler)

    def test_setting_bad(self):
        # An unknown TLS method name is rejected with KeyError.
        crawler = get_crawler()
        settings = Settings({"DOWNLOADER_CLIENT_TLS_METHOD": "bad"})
        with pytest.raises(KeyError):
            load_context_factory_from_settings(settings, crawler)

    @deferred_f_from_coro_f
    async def test_setting_explicit(self):
        # A valid explicit method name is honored by the loaded factory.
        crawler = get_crawler()
        settings = Settings({"DOWNLOADER_CLIENT_TLS_METHOD": "TLSv1.2"})
        client_context_factory = load_context_factory_from_settings(settings, crawler)
        assert client_context_factory._ssl_method == OpenSSL.SSL.TLSv1_2_METHOD
        await self._assert_factory_works(client_context_factory)

    @deferred_f_from_coro_f
    async def test_direct_from_crawler(self):
        # the setting is ignored
        crawler = get_crawler(settings_dict={"DOWNLOADER_CLIENT_TLS_METHOD": "bad"})
        client_context_factory = build_from_crawler(ScrapyClientContextFactory, crawler)
        assert client_context_factory._ssl_method == OpenSSL.SSL.SSLv23_METHOD
        await self._assert_factory_works(client_context_factory)

    @deferred_f_from_coro_f
    async def test_direct_init(self):
        # The TLS method can also be passed directly to the constructor.
        client_context_factory = ScrapyClientContextFactory(OpenSSL.SSL.TLSv1_2_METHOD)
        assert client_context_factory._ssl_method == OpenSSL.SSL.TLSv1_2_METHOD
        await self._assert_factory_works(client_context_factory)

View File

@ -422,6 +422,7 @@ class HttpTestCase(unittest.TestCase):
return self.download_request(request, Spider("foo")).addCallback(_test)
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
class Http10TestCase(HttpTestCase):
"""HTTP 1.0 test case"""
@ -780,6 +781,7 @@ class HttpProxyTestCase(unittest.TestCase):
return self.download_request(request, Spider("foo")).addCallback(_test)
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
class Http10ProxyTestCase(HttpProxyTestCase):
download_handler_cls: type = HTTP10DownloadHandler

View File

@ -8,12 +8,11 @@ from __future__ import annotations
import shutil
from pathlib import Path
from tempfile import mkdtemp
from typing import Any
import OpenSSL.SSL
from pytest import raises
import pytest
from twisted.internet import defer, reactor
from twisted.internet.defer import Deferred, inlineCallbacks
from twisted.internet.defer import inlineCallbacks
from twisted.internet.testing import StringTransport
from twisted.protocols.policies import WrappingFactory
from twisted.trial import unittest
@ -22,10 +21,8 @@ from twisted.web import resource, server, static, util
from scrapy.core.downloader import webclient as client
from scrapy.core.downloader.contextfactory import (
ScrapyClientContextFactory,
load_context_factory_from_settings,
)
from scrapy.http import Headers, Request
from scrapy.settings import Settings
from scrapy.utils.misc import build_from_crawler
from scrapy.utils.python import to_bytes, to_unicode
from scrapy.utils.test import get_crawler
@ -38,6 +35,7 @@ from tests.mockserver import (
PayloadResource,
ssl_context_factory,
)
from tests.test_core_downloader import ContextFactoryBaseTestCase
def getPage(url, contextFactory=None, response_transform=None, *args, **kwargs):
@ -129,6 +127,7 @@ class ParseUrlTestCase(unittest.TestCase):
self.assertEqual(client._parse(url), test, url)
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
class ScrapyHTTPPageGetterTests(unittest.TestCase):
def test_earlyHeaders(self):
# basic test stolen from twisted HTTPageGetter
@ -272,6 +271,7 @@ class EncodingResource(resource.Resource):
return body.encode(self.out_encoding)
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
class WebClientTestCase(unittest.TestCase):
def _listen(self, site):
return reactor.listenTCP(0, site, interface="127.0.0.1")
@ -427,35 +427,8 @@ class WebClientTestCase(unittest.TestCase):
)
class WebClientSSLTestCase(unittest.TestCase):
    """HTTPS variant of the web-client tests: serves a local SSL site."""

    # Optional server-side context factory; ssl_context_factory() is used
    # when left as None.
    context_factory = None

    def _listen(self, site):
        # Listen over SSL on an OS-assigned local port.
        return reactor.listenSSL(
            0,
            site,
            contextFactory=self.context_factory or ssl_context_factory(),
            interface="127.0.0.1",
        )

    def getURL(self, path):
        # URL for a resource on the test server started in setUp().
        return f"https://127.0.0.1:{self.portno}/{path}"

    def setUp(self):
        # Serve a temp directory with a static file and a payload resource.
        self.tmpname = Path(mkdtemp())
        (self.tmpname / "file").write_bytes(b"0123456789")
        r = static.File(str(self.tmpname))
        r.putChild(b"payload", PayloadResource())
        self.site = server.Site(r, timeout=None)
        self.wrapper = WrappingFactory(self.site)
        self.port = self._listen(self.wrapper)
        self.portno = self.port.getHost().port

    @inlineCallbacks
    def tearDown(self):
        # Stop listening before removing the served files.
        yield self.port.stopListening()
        shutil.rmtree(self.tmpname)
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
class WebClientSSLTestCase(ContextFactoryBaseTestCase):
def testPayload(self):
s = "0123456789" * 10
return getPage(self.getURL("payload"), body=s).addCallback(
@ -490,51 +463,3 @@ class WebClientCustomCiphersSSLTestCase(WebClientSSLTestCase):
self.getURL("payload"), body=s, contextFactory=client_context_factory
)
return self.assertFailure(d, OpenSSL.SSL.Error)
class WebClientTLSMethodTestCase(WebClientSSLTestCase):
    """Tests for DOWNLOADER_CLIENT_TLS_METHOD using the getPage helper."""

    def _assert_factory_works(
        self, client_context_factory: ScrapyClientContextFactory
    ) -> Deferred[Any]:
        # Round-trip a payload to prove the factory completes a TLS handshake.
        s = "0123456789" * 10
        return getPage(
            self.getURL("payload"), body=s, contextFactory=client_context_factory
        ).addCallback(self.assertEqual, to_bytes(s))

    def test_setting_default(self):
        # Default settings select the SSLv23 (negotiate) method.
        crawler = get_crawler()
        settings = Settings()
        client_context_factory = load_context_factory_from_settings(settings, crawler)
        assert client_context_factory._ssl_method == OpenSSL.SSL.SSLv23_METHOD
        return self._assert_factory_works(client_context_factory)

    def test_setting_none(self):
        # None is not a valid TLS method name; loading must fail.
        crawler = get_crawler()
        settings = Settings({"DOWNLOADER_CLIENT_TLS_METHOD": None})
        with raises(KeyError):
            load_context_factory_from_settings(settings, crawler)

    def test_setting_bad(self):
        # An unknown TLS method name is rejected with KeyError.
        crawler = get_crawler()
        settings = Settings({"DOWNLOADER_CLIENT_TLS_METHOD": "bad"})
        with raises(KeyError):
            load_context_factory_from_settings(settings, crawler)

    def test_setting_explicit(self):
        # A valid explicit method name is honored by the loaded factory.
        crawler = get_crawler()
        settings = Settings({"DOWNLOADER_CLIENT_TLS_METHOD": "TLSv1.2"})
        client_context_factory = load_context_factory_from_settings(settings, crawler)
        assert client_context_factory._ssl_method == OpenSSL.SSL.TLSv1_2_METHOD
        return self._assert_factory_works(client_context_factory)

    def test_direct_from_crawler(self):
        # the setting is ignored
        crawler = get_crawler(settings_dict={"DOWNLOADER_CLIENT_TLS_METHOD": "bad"})
        client_context_factory = build_from_crawler(ScrapyClientContextFactory, crawler)
        assert client_context_factory._ssl_method == OpenSSL.SSL.SSLv23_METHOD
        return self._assert_factory_works(client_context_factory)

    def test_direct_init(self):
        # The TLS method can also be passed directly to the constructor.
        client_context_factory = ScrapyClientContextFactory(OpenSSL.SSL.TLSv1_2_METHOD)
        assert client_context_factory._ssl_method == OpenSSL.SSL.TLSv1_2_METHOD
        return self._assert_factory_works(client_context_factory)