
Merge pull request #6635 from wRAR/webclient-cleanup

Remove scrapy.core.downloader.webclient._parse().
Andrey Rakhmatullin 2025-01-28 23:08:06 +04:00 committed by GitHub
commit 340819eff0
5 changed files with 38 additions and 123 deletions
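
For context, the removed helper returned a (scheme, netloc, host, port, path) tuple, all bytes except the int port, with the port defaulted from the scheme. Its callers now get the same values straight from the stdlib urlparse(). A minimal sketch of the equivalence, using a made-up URL; the old return value is taken from the removed implementation shown below:

    from urllib.parse import urlparse

    url = "https://example.com/foo?a=b"

    # Removed helper:
    #   webclient._parse(url) == (b"https", b"example.com", b"example.com", 443, b"/foo?a=b")

    # Stdlib pattern the callers now use, defaulting the port themselves:
    parsed = urlparse(url)
    port = parsed.port or (443 if parsed.scheme == "https" else 80)
    print(parsed.scheme, parsed.hostname, port)  # https example.com 443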

File: scrapy/core/downloader/handlers/http11.py

@@ -9,7 +9,7 @@ from contextlib import suppress
 from io import BytesIO
 from time import time
 from typing import TYPE_CHECKING, Any, TypedDict, TypeVar
-from urllib.parse import urldefrag, urlunparse
+from urllib.parse import urldefrag, urlparse

 from twisted.internet import ssl
 from twisted.internet.defer import CancelledError, Deferred, succeed
@@ -32,11 +32,12 @@ from zope.interface import implementer

 from scrapy import Request, Spider, signals
 from scrapy.core.downloader.contextfactory import load_context_factory_from_settings
-from scrapy.core.downloader.webclient import _parse
 from scrapy.exceptions import StopDownload
 from scrapy.http import Headers, Response
 from scrapy.responsetypes import responsetypes
+from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.python import to_bytes, to_unicode
+from scrapy.utils.url import add_http_if_no_scheme

 if TYPE_CHECKING:
     from twisted.internet.base import ReactorBase
@@ -378,12 +379,15 @@ class ScrapyAgent:
         bindaddress = request.meta.get("bindaddress") or self._bindAddress
         proxy = request.meta.get("proxy")
         if proxy:
-            proxyScheme, proxyNetloc, proxyHost, proxyPort, proxyParams = _parse(proxy)
-            scheme = _parse(request.url)[0]
-            proxyHost_str = to_unicode(proxyHost)
-            if scheme == b"https":
+            proxy = add_http_if_no_scheme(proxy)
+            proxy_parsed = urlparse(proxy)
+            proxy_host = proxy_parsed.hostname
+            proxy_port = proxy_parsed.port
+            if not proxy_port:
+                proxy_port = 443 if proxy_parsed.scheme == "https" else 80
+            if urlparse_cached(request).scheme == "https":
                 proxyAuth = request.headers.get(b"Proxy-Authorization", None)
-                proxyConf = (proxyHost_str, proxyPort, proxyAuth)
+                proxyConf = (proxy_host, proxy_port, proxyAuth)
                 return self._TunnelingAgent(
                     reactor=reactor,
                     proxyConf=proxyConf,
@@ -392,13 +396,9 @@ class ScrapyAgent:
                     bindAddress=bindaddress,
                     pool=self._pool,
                 )
-            proxyScheme = proxyScheme or b"http"
-            proxyURI = urlunparse(
-                (proxyScheme, proxyNetloc, proxyParams, b"", b"", b"")
-            )
             return self._ProxyAgent(
                 reactor=reactor,
-                proxyURI=to_bytes(proxyURI, encoding="ascii"),
+                proxyURI=to_bytes(proxy, encoding="ascii"),
                 connectTimeout=timeout,
                 bindAddress=bindaddress,
                 pool=self._pool,
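
A rough, standalone sketch of the new proxy handling above (assumes Scrapy is installed; the proxy address is made up):

    from urllib.parse import urlparse

    from scrapy.utils.url import add_http_if_no_scheme

    # Mirror of the updated _get_agent() logic: ensure the proxy URL has a
    # scheme, parse it, then default the port from the scheme.
    proxy = add_http_if_no_scheme("proxy.example:3128")  # "http://proxy.example:3128"
    proxy_parsed = urlparse(proxy)
    proxy_port = proxy_parsed.port or (443 if proxy_parsed.scheme == "https" else 80)
    print(proxy_parsed.hostname, proxy_port)  # proxy.example 3128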

File: scrapy/core/downloader/handlers/http2.py

@@ -8,8 +8,8 @@ from twisted.internet.error import TimeoutError
 from twisted.web.client import URI

 from scrapy.core.downloader.contextfactory import load_context_factory_from_settings
-from scrapy.core.downloader.webclient import _parse
 from scrapy.core.http2.agent import H2Agent, H2ConnectionPool, ScrapyProxyH2Agent
+from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.python import to_bytes

 if TYPE_CHECKING:
@@ -75,10 +75,7 @@ class ScrapyH2Agent:
         bind_address = request.meta.get("bindaddress") or self._bind_address
         proxy = request.meta.get("proxy")
        if proxy:
-            _, _, proxy_host, proxy_port, proxy_params = _parse(proxy)
-            scheme = _parse(request.url)[0]
-            if scheme == b"https":
+            if urlparse_cached(request).scheme == "https":
                 # ToDo
                 raise NotImplementedError(
                     "Tunneling via CONNECT method using HTTP/2.0 is not yet supported"

File: scrapy/core/downloader/webclient.py

@@ -2,11 +2,10 @@

 from __future__ import annotations

-import re
 import warnings
 from time import time
 from typing import TYPE_CHECKING
-from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse
+from urllib.parse import urldefrag, urlparse, urlunparse

 from twisted.internet import defer
 from twisted.internet.protocol import ClientFactory
@@ -22,34 +21,6 @@ if TYPE_CHECKING:
     from scrapy import Request


-def _parsed_url_args(parsed: ParseResult) -> tuple[bytes, bytes, bytes, int, bytes]:
-    # Assume parsed is urlparse-d from Request.url,
-    # which was passed via safe_url_string and is ascii-only.
-    path_str = urlunparse(("", "", parsed.path or "/", parsed.params, parsed.query, ""))
-    path = to_bytes(path_str, encoding="ascii")
-    assert parsed.hostname is not None
-    host = to_bytes(parsed.hostname, encoding="ascii")
-    port = parsed.port
-    scheme = to_bytes(parsed.scheme, encoding="ascii")
-    netloc = to_bytes(parsed.netloc, encoding="ascii")
-    if port is None:
-        port = 443 if scheme == b"https" else 80
-    return scheme, netloc, host, port, path
-
-
-def _parse(url: str) -> tuple[bytes, bytes, bytes, int, bytes]:
-    """Return tuple of (scheme, netloc, host, port, path),
-    all in bytes except for port which is int.
-    Assume url is from Request.url, which was passed via safe_url_string
-    and is ascii-only.
-    """
-    url = url.strip()
-    if not re.match(r"^\w+://", url):
-        url = "//" + url
-    parsed = urlparse(url)
-    return _parsed_url_args(parsed)
-
-
 class ScrapyHTTPPageGetter(HTTPClient):

     delimiter = b"\n"
@@ -142,14 +113,29 @@ class ScrapyHTTPClientFactory(ClientFactory):
         )

     def _set_connection_attributes(self, request):
-        parsed = urlparse_cached(request)
-        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(
-            parsed
-        )
         proxy = request.meta.get("proxy")
         if proxy:
-            self.scheme, _, self.host, self.port, _ = _parse(proxy)
+            proxy_parsed = urlparse(to_bytes(proxy, encoding="ascii"))
+            self.scheme = proxy_parsed.scheme
+            self.host = proxy_parsed.hostname
+            self.port = proxy_parsed.port
+            self.netloc = proxy_parsed.netloc
+            if self.port is None:
+                self.port = 443 if proxy_parsed.scheme == b"https" else 80
             self.path = self.url
+        else:
+            parsed = urlparse_cached(request)
+            path_str = urlunparse(
+                ("", "", parsed.path or "/", parsed.params, parsed.query, "")
+            )
+            self.path = to_bytes(path_str, encoding="ascii")
+            assert parsed.hostname is not None
+            self.host = to_bytes(parsed.hostname, encoding="ascii")
+            self.port = parsed.port
+            self.scheme = to_bytes(parsed.scheme, encoding="ascii")
+            self.netloc = to_bytes(parsed.netloc, encoding="ascii")
+            if self.port is None:
+                self.port = 443 if self.scheme == b"https" else 80

     def __init__(self, request: Request, timeout: float = 180):
         warnings.warn(
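
One subtlety in the rewritten _set_connection_attributes(): urlparse() is fed bytes via to_bytes(), and on bytes input it returns bytes components, which is why the scheme comparison here uses b"https" while the handlers above compare against the str "https". An illustrative sketch with a made-up proxy:

    from urllib.parse import urlparse

    proxy_parsed = urlparse(b"https://proxy.example:8443")
    print(proxy_parsed.scheme)    # b'https'
    print(proxy_parsed.hostname)  # b'proxy.example'
    print(proxy_parsed.port)      # 8443 (int; None when the URL has no port)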

File: tests/test_downloader_handlers.py

@@ -785,9 +785,6 @@ class HttpProxyTestCase(unittest.TestCase):
 class Http10ProxyTestCase(HttpProxyTestCase):
     download_handler_cls: type = HTTP10DownloadHandler

-    def test_download_with_proxy_https_noconnect(self):
-        raise unittest.SkipTest("noconnect is not supported in HTTP10DownloadHandler")
-

 class Http11ProxyTestCase(HttpProxyTestCase):
     download_handler_cls: type = HTTP11DownloadHandler

File: tests/test_webclient.py

@@ -8,6 +8,7 @@ from __future__ import annotations
 import shutil
 from pathlib import Path
 from tempfile import mkdtemp
+from urllib.parse import urlparse

 import OpenSSL.SSL
 import pytest
@@ -61,72 +62,6 @@ def getPage(url, contextFactory=None, response_transform=None, *args, **kwargs):
     ).deferred


-class ParseUrlTestCase(unittest.TestCase):
-    """Test URL parsing facility and defaults values."""
-
-    def _parse(self, url):
-        f = client.ScrapyHTTPClientFactory(Request(url))
-        return (f.scheme, f.netloc, f.host, f.port, f.path)
-
-    def testParse(self):
-        lip = "127.0.0.1"
-        tests = (
-            (
-                "http://127.0.0.1?c=v&c2=v2#fragment",
-                ("http", lip, lip, 80, "/?c=v&c2=v2"),
-            ),
-            (
-                "http://127.0.0.1/?c=v&c2=v2#fragment",
-                ("http", lip, lip, 80, "/?c=v&c2=v2"),
-            ),
-            (
-                "http://127.0.0.1/foo?c=v&c2=v2#frag",
-                ("http", lip, lip, 80, "/foo?c=v&c2=v2"),
-            ),
-            (
-                "http://127.0.0.1:100?c=v&c2=v2#fragment",
-                ("http", lip + ":100", lip, 100, "/?c=v&c2=v2"),
-            ),
-            (
-                "http://127.0.0.1:100/?c=v&c2=v2#frag",
-                ("http", lip + ":100", lip, 100, "/?c=v&c2=v2"),
-            ),
-            (
-                "http://127.0.0.1:100/foo?c=v&c2=v2#frag",
-                ("http", lip + ":100", lip, 100, "/foo?c=v&c2=v2"),
-            ),
-            ("http://127.0.0.1", ("http", lip, lip, 80, "/")),
-            ("http://127.0.0.1/", ("http", lip, lip, 80, "/")),
-            ("http://127.0.0.1/foo", ("http", lip, lip, 80, "/foo")),
-            ("http://127.0.0.1?param=value", ("http", lip, lip, 80, "/?param=value")),
-            ("http://127.0.0.1/?param=value", ("http", lip, lip, 80, "/?param=value")),
-            (
-                "http://127.0.0.1:12345/foo",
-                ("http", lip + ":12345", lip, 12345, "/foo"),
-            ),
-            ("http://spam:12345/foo", ("http", "spam:12345", "spam", 12345, "/foo")),
-            (
-                "http://spam.test.org/foo",
-                ("http", "spam.test.org", "spam.test.org", 80, "/foo"),
-            ),
-            ("https://127.0.0.1/foo", ("https", lip, lip, 443, "/foo")),
-            (
-                "https://127.0.0.1/?param=value",
-                ("https", lip, lip, 443, "/?param=value"),
-            ),
-            ("https://127.0.0.1:12345/", ("https", lip + ":12345", lip, 12345, "/")),
-            (
-                "http://scrapytest.org/foo ",
-                ("http", "scrapytest.org", "scrapytest.org", 80, "/foo"),
-            ),
-            ("http://egg:7890 ", ("http", "egg:7890", "egg", 7890, "/")),
-        )
-
-        for url, test in tests:
-            test = tuple(to_bytes(x) if not isinstance(x, int) else x for x in test)
-            self.assertEqual(client._parse(url), test, url)
-
-
 @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
 class ScrapyHTTPPageGetterTests(unittest.TestCase):
     def test_earlyHeaders(self):
@@ -388,9 +323,9 @@ class WebClientTestCase(unittest.TestCase):

     def testFactoryInfo(self):
         url = self.getURL("file")
-        _, _, host, port, _ = client._parse(url)
+        parsed = urlparse(url)
         factory = client.ScrapyHTTPClientFactory(Request(url))
-        reactor.connectTCP(to_unicode(host), port, factory)
+        reactor.connectTCP(parsed.hostname, parsed.port, factory)
         return factory.deferred.addCallback(self._cbFactoryInfo, factory)

     def _cbFactoryInfo(self, ignoredResult, factory):
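
The test change works because urlparse() already yields the str hostname and int port that reactor.connectTCP() expects, so neither client._parse() nor to_unicode() is needed. A standalone sketch with an illustrative URL:

    from urllib.parse import urlparse

    parsed = urlparse("http://127.0.0.1:43523/file")  # made-up test-server URL
    print(parsed.hostname, parsed.port)  # 127.0.0.1 43523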