
Merge pull request #6635 from wRAR/webclient-cleanup

Remove scrapy.core.downloader.webclient._parse().
Andrey Rakhmatullin 2025-01-28 23:08:06 +04:00 committed by GitHub
commit 340819eff0
5 changed files with 38 additions and 123 deletions
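
For context, the removed helper returned a (scheme, netloc, host, port, path) tuple, all bytes except the int port, with the port defaulted from the scheme. Its callers now get the same values straight from the stdlib urlparse(). A minimal sketch of the equivalence, using a made-up URL; the old return value is taken from the removed implementation shown below:

    from urllib.parse import urlparse

    url = "https://example.com/foo?a=b"

    # Removed helper:
    #   webclient._parse(url) == (b"https", b"example.com", b"example.com", 443, b"/foo?a=b")

    # Stdlib pattern the callers now use, defaulting the port themselves:
    parsed = urlparse(url)
    port = parsed.port or (443 if parsed.scheme == "https" else 80)
    print(parsed.scheme, parsed.hostname, port)  # https example.com 443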

File: scrapy/core/downloader/handlers/http11.py

@@ -9,7 +9,7 @@ from contextlib import suppress
 from io import BytesIO
 from time import time
 from typing import TYPE_CHECKING, Any, TypedDict, TypeVar
-from urllib.parse import urldefrag, urlunparse
+from urllib.parse import urldefrag, urlparse

 from twisted.internet import ssl
 from twisted.internet.defer import CancelledError, Deferred, succeed
@@ -32,11 +32,12 @@ from zope.interface import implementer

 from scrapy import Request, Spider, signals
 from scrapy.core.downloader.contextfactory import load_context_factory_from_settings
-from scrapy.core.downloader.webclient import _parse
 from scrapy.exceptions import StopDownload
 from scrapy.http import Headers, Response
 from scrapy.responsetypes import responsetypes
+from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.python import to_bytes, to_unicode
+from scrapy.utils.url import add_http_if_no_scheme

 if TYPE_CHECKING:
     from twisted.internet.base import ReactorBase
@@ -378,12 +379,15 @@ class ScrapyAgent:
         bindaddress = request.meta.get("bindaddress") or self._bindAddress
         proxy = request.meta.get("proxy")
         if proxy:
-            proxyScheme, proxyNetloc, proxyHost, proxyPort, proxyParams = _parse(proxy)
-            scheme = _parse(request.url)[0]
-            proxyHost_str = to_unicode(proxyHost)
-            if scheme == b"https":
+            proxy = add_http_if_no_scheme(proxy)
+            proxy_parsed = urlparse(proxy)
+            proxy_host = proxy_parsed.hostname
+            proxy_port = proxy_parsed.port
+            if not proxy_port:
+                proxy_port = 443 if proxy_parsed.scheme == "https" else 80
+            if urlparse_cached(request).scheme == "https":
                 proxyAuth = request.headers.get(b"Proxy-Authorization", None)
-                proxyConf = (proxyHost_str, proxyPort, proxyAuth)
+                proxyConf = (proxy_host, proxy_port, proxyAuth)
                 return self._TunnelingAgent(
                     reactor=reactor,
                     proxyConf=proxyConf,
@@ -392,13 +396,9 @@ class ScrapyAgent:
                     bindAddress=bindaddress,
                     pool=self._pool,
                 )
-            proxyScheme = proxyScheme or b"http"
-            proxyURI = urlunparse(
-                (proxyScheme, proxyNetloc, proxyParams, b"", b"", b"")
-            )
             return self._ProxyAgent(
                 reactor=reactor,
-                proxyURI=to_bytes(proxyURI, encoding="ascii"),
+                proxyURI=to_bytes(proxy, encoding="ascii"),
                 connectTimeout=timeout,
                 bindAddress=bindaddress,
                 pool=self._pool,
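
A rough, standalone sketch of the new proxy handling above (assumes Scrapy is installed; the proxy address is made up):

    from urllib.parse import urlparse

    from scrapy.utils.url import add_http_if_no_scheme

    # Mirror of the updated _get_agent() logic: ensure the proxy URL has a
    # scheme, parse it, then default the port from the scheme.
    proxy = add_http_if_no_scheme("proxy.example:3128")  # "http://proxy.example:3128"
    proxy_parsed = urlparse(proxy)
    proxy_port = proxy_parsed.port or (443 if proxy_parsed.scheme == "https" else 80)
    print(proxy_parsed.hostname, proxy_port)  # proxy.example 3128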

File: scrapy/core/downloader/handlers/http2.py

@@ -8,8 +8,8 @@ from twisted.internet.error import TimeoutError
 from twisted.web.client import URI

 from scrapy.core.downloader.contextfactory import load_context_factory_from_settings
-from scrapy.core.downloader.webclient import _parse
 from scrapy.core.http2.agent import H2Agent, H2ConnectionPool, ScrapyProxyH2Agent
+from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.python import to_bytes

 if TYPE_CHECKING:
@@ -75,10 +75,7 @@ class ScrapyH2Agent:
         bind_address = request.meta.get("bindaddress") or self._bind_address
         proxy = request.meta.get("proxy")
        if proxy:
-            _, _, proxy_host, proxy_port, proxy_params = _parse(proxy)
-            scheme = _parse(request.url)[0]
-            if scheme == b"https":
+            if urlparse_cached(request).scheme == "https":
                 # ToDo
                 raise NotImplementedError(
                     "Tunneling via CONNECT method using HTTP/2.0 is not yet supported"

File: scrapy/core/downloader/webclient.py

@@ -2,11 +2,10 @@

 from __future__ import annotations

-import re
 import warnings
 from time import time
 from typing import TYPE_CHECKING
-from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse
+from urllib.parse import urldefrag, urlparse, urlunparse

 from twisted.internet import defer
 from twisted.internet.protocol import ClientFactory
@@ -22,34 +21,6 @@ if TYPE_CHECKING:
     from scrapy import Request


-def _parsed_url_args(parsed: ParseResult) -> tuple[bytes, bytes, bytes, int, bytes]:
-    # Assume parsed is urlparse-d from Request.url,
-    # which was passed via safe_url_string and is ascii-only.
-    path_str = urlunparse(("", "", parsed.path or "/", parsed.params, parsed.query, ""))
-    path = to_bytes(path_str, encoding="ascii")
-    assert parsed.hostname is not None
-    host = to_bytes(parsed.hostname, encoding="ascii")
-    port = parsed.port
-    scheme = to_bytes(parsed.scheme, encoding="ascii")
-    netloc = to_bytes(parsed.netloc, encoding="ascii")
-    if port is None:
-        port = 443 if scheme == b"https" else 80
-    return scheme, netloc, host, port, path
-
-
-def _parse(url: str) -> tuple[bytes, bytes, bytes, int, bytes]:
-    """Return tuple of (scheme, netloc, host, port, path),
-    all in bytes except for port which is int.
-    Assume url is from Request.url, which was passed via safe_url_string
-    and is ascii-only.
-    """
-    url = url.strip()
-    if not re.match(r"^\w+://", url):
-        url = "//" + url
-    parsed = urlparse(url)
-    return _parsed_url_args(parsed)
-
-
 class ScrapyHTTPPageGetter(HTTPClient):

     delimiter = b"\n"
@@ -142,14 +113,29 @@ class ScrapyHTTPClientFactory(ClientFactory):
         )

     def _set_connection_attributes(self, request):
-        parsed = urlparse_cached(request)
-        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(
-            parsed
-        )
         proxy = request.meta.get("proxy")
         if proxy:
-            self.scheme, _, self.host, self.port, _ = _parse(proxy)
+            proxy_parsed = urlparse(to_bytes(proxy, encoding="ascii"))
+            self.scheme = proxy_parsed.scheme
+            self.host = proxy_parsed.hostname
+            self.port = proxy_parsed.port
+            self.netloc = proxy_parsed.netloc
+            if self.port is None:
+                self.port = 443 if proxy_parsed.scheme == b"https" else 80
             self.path = self.url
+        else:
+            parsed = urlparse_cached(request)
+            path_str = urlunparse(
+                ("", "", parsed.path or "/", parsed.params, parsed.query, "")
+            )
+            self.path = to_bytes(path_str, encoding="ascii")
+            assert parsed.hostname is not None
+            self.host = to_bytes(parsed.hostname, encoding="ascii")
+            self.port = parsed.port
+            self.scheme = to_bytes(parsed.scheme, encoding="ascii")
+            self.netloc = to_bytes(parsed.netloc, encoding="ascii")
+            if self.port is None:
+                self.port = 443 if self.scheme == b"https" else 80

     def __init__(self, request: Request, timeout: float = 180):
         warnings.warn(
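
One subtlety in the rewritten _set_connection_attributes(): urlparse() is fed bytes via to_bytes(), and on bytes input it returns bytes components, which is why the scheme comparison here uses b"https" while the handlers above compare against the str "https". An illustrative sketch with a made-up proxy:

    from urllib.parse import urlparse

    proxy_parsed = urlparse(b"https://proxy.example:8443")
    print(proxy_parsed.scheme)    # b'https'
    print(proxy_parsed.hostname)  # b'proxy.example'
    print(proxy_parsed.port)      # 8443 (int; None when the URL has no port)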

File: tests/test_downloader_handlers.py

@@ -785,9 +785,6 @@ class HttpProxyTestCase(unittest.TestCase):
 class Http10ProxyTestCase(HttpProxyTestCase):
     download_handler_cls: type = HTTP10DownloadHandler

-    def test_download_with_proxy_https_noconnect(self):
-        raise unittest.SkipTest("noconnect is not supported in HTTP10DownloadHandler")
-

 class Http11ProxyTestCase(HttpProxyTestCase):
     download_handler_cls: type = HTTP11DownloadHandler

File: tests/test_webclient.py

@@ -8,6 +8,7 @@ from __future__ import annotations
 import shutil
 from pathlib import Path
 from tempfile import mkdtemp
+from urllib.parse import urlparse

 import OpenSSL.SSL
 import pytest
@@ -61,72 +62,6 @@ def getPage(url, contextFactory=None, response_transform=None, *args, **kwargs):
     ).deferred


-class ParseUrlTestCase(unittest.TestCase):
-    """Test URL parsing facility and defaults values."""
-
-    def _parse(self, url):
-        f = client.ScrapyHTTPClientFactory(Request(url))
-        return (f.scheme, f.netloc, f.host, f.port, f.path)
-
-    def testParse(self):
-        lip = "127.0.0.1"
-        tests = (
-            (
-                "http://127.0.0.1?c=v&c2=v2#fragment",
-                ("http", lip, lip, 80, "/?c=v&c2=v2"),
-            ),
-            (
-                "http://127.0.0.1/?c=v&c2=v2#fragment",
-                ("http", lip, lip, 80, "/?c=v&c2=v2"),
-            ),
-            (
-                "http://127.0.0.1/foo?c=v&c2=v2#frag",
-                ("http", lip, lip, 80, "/foo?c=v&c2=v2"),
-            ),
-            (
-                "http://127.0.0.1:100?c=v&c2=v2#fragment",
-                ("http", lip + ":100", lip, 100, "/?c=v&c2=v2"),
-            ),
-            (
-                "http://127.0.0.1:100/?c=v&c2=v2#frag",
-                ("http", lip + ":100", lip, 100, "/?c=v&c2=v2"),
-            ),
-            (
-                "http://127.0.0.1:100/foo?c=v&c2=v2#frag",
-                ("http", lip + ":100", lip, 100, "/foo?c=v&c2=v2"),
-            ),
-            ("http://127.0.0.1", ("http", lip, lip, 80, "/")),
-            ("http://127.0.0.1/", ("http", lip, lip, 80, "/")),
-            ("http://127.0.0.1/foo", ("http", lip, lip, 80, "/foo")),
-            ("http://127.0.0.1?param=value", ("http", lip, lip, 80, "/?param=value")),
-            ("http://127.0.0.1/?param=value", ("http", lip, lip, 80, "/?param=value")),
-            (
-                "http://127.0.0.1:12345/foo",
-                ("http", lip + ":12345", lip, 12345, "/foo"),
-            ),
-            ("http://spam:12345/foo", ("http", "spam:12345", "spam", 12345, "/foo")),
-            (
-                "http://spam.test.org/foo",
-                ("http", "spam.test.org", "spam.test.org", 80, "/foo"),
-            ),
-            ("https://127.0.0.1/foo", ("https", lip, lip, 443, "/foo")),
-            (
-                "https://127.0.0.1/?param=value",
-                ("https", lip, lip, 443, "/?param=value"),
-            ),
-            ("https://127.0.0.1:12345/", ("https", lip + ":12345", lip, 12345, "/")),
-            (
-                "http://scrapytest.org/foo ",
-                ("http", "scrapytest.org", "scrapytest.org", 80, "/foo"),
-            ),
-            ("http://egg:7890 ", ("http", "egg:7890", "egg", 7890, "/")),
-        )
-
-        for url, test in tests:
-            test = tuple(to_bytes(x) if not isinstance(x, int) else x for x in test)
-            self.assertEqual(client._parse(url), test, url)
-
-
 @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
 class ScrapyHTTPPageGetterTests(unittest.TestCase):
     def test_earlyHeaders(self):
@@ -388,9 +323,9 @@ class WebClientTestCase(unittest.TestCase):

     def testFactoryInfo(self):
         url = self.getURL("file")
-        _, _, host, port, _ = client._parse(url)
+        parsed = urlparse(url)
         factory = client.ScrapyHTTPClientFactory(Request(url))
-        reactor.connectTCP(to_unicode(host), port, factory)
+        reactor.connectTCP(parsed.hostname, parsed.port, factory)
         return factory.deferred.addCallback(self._cbFactoryInfo, factory)

     def _cbFactoryInfo(self, ignoredResult, factory):
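
The test change works because urlparse() already yields the str hostname and int port that reactor.connectTCP() expects, so neither client._parse() nor to_unicode() is needed. A standalone sketch with an illustrative URL:

    from urllib.parse import urlparse

    parsed = urlparse("http://127.0.0.1:43523/file")  # made-up test-server URL
    print(parsed.hostname, parsed.port)  # 127.0.0.1 43523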