Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-06 11:00:46 +00:00)
Merge pull request #6635 from wRAR/webclient-cleanup
Remove scrapy.core.downloader.webclient._parse().
Commit 340819eff0
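
For context: the removed scrapy.core.downloader.webclient._parse() helper returned a (scheme, netloc, host, port, path) tuple of bytes (port as an int); after this change the call sites use urllib.parse.urlparse directly and apply the default port themselves. A minimal illustrative sketch, not part of the diff (the URL and variable names here are made up):

# Illustrative only -- not part of this patch.
from urllib.parse import urlparse

url = "https://example.com/foo?q=1"  # hypothetical URL
parsed = urlparse(url)
scheme = parsed.scheme  # "https" (str, not bytes)
host = parsed.hostname  # "example.com"
port = parsed.port or (443 if scheme == "https" else 80)  # default port when omitted
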
@@ -9,7 +9,7 @@ from contextlib import suppress
 from io import BytesIO
 from time import time
 from typing import TYPE_CHECKING, Any, TypedDict, TypeVar
-from urllib.parse import urldefrag, urlunparse
+from urllib.parse import urldefrag, urlparse

 from twisted.internet import ssl
 from twisted.internet.defer import CancelledError, Deferred, succeed
@@ -32,11 +32,12 @@ from zope.interface import implementer

 from scrapy import Request, Spider, signals
 from scrapy.core.downloader.contextfactory import load_context_factory_from_settings
-from scrapy.core.downloader.webclient import _parse
 from scrapy.exceptions import StopDownload
 from scrapy.http import Headers, Response
 from scrapy.responsetypes import responsetypes
+from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.python import to_bytes, to_unicode
+from scrapy.utils.url import add_http_if_no_scheme

 if TYPE_CHECKING:
     from twisted.internet.base import ReactorBase
@@ -378,12 +379,15 @@ class ScrapyAgent:
         bindaddress = request.meta.get("bindaddress") or self._bindAddress
         proxy = request.meta.get("proxy")
         if proxy:
-            proxyScheme, proxyNetloc, proxyHost, proxyPort, proxyParams = _parse(proxy)
-            scheme = _parse(request.url)[0]
-            proxyHost_str = to_unicode(proxyHost)
-            if scheme == b"https":
+            proxy = add_http_if_no_scheme(proxy)
+            proxy_parsed = urlparse(proxy)
+            proxy_host = proxy_parsed.hostname
+            proxy_port = proxy_parsed.port
+            if not proxy_port:
+                proxy_port = 443 if proxy_parsed.scheme == "https" else 80
+            if urlparse_cached(request).scheme == "https":
                 proxyAuth = request.headers.get(b"Proxy-Authorization", None)
-                proxyConf = (proxyHost_str, proxyPort, proxyAuth)
+                proxyConf = (proxy_host, proxy_port, proxyAuth)
                 return self._TunnelingAgent(
                     reactor=reactor,
                     proxyConf=proxyConf,
@@ -392,13 +396,9 @@ class ScrapyAgent:
                     bindAddress=bindaddress,
                     pool=self._pool,
                 )
-            proxyScheme = proxyScheme or b"http"
-            proxyURI = urlunparse(
-                (proxyScheme, proxyNetloc, proxyParams, b"", b"", b"")
-            )
             return self._ProxyAgent(
                 reactor=reactor,
-                proxyURI=to_bytes(proxyURI, encoding="ascii"),
+                proxyURI=to_bytes(proxy, encoding="ascii"),
                 connectTimeout=timeout,
                 bindAddress=bindaddress,
                 pool=self._pool,
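
Note on the proxy handling above: a minimal illustrative sketch (not part of the patch; the proxy address is made up) of how add_http_if_no_scheme() plus urlparse() yield the proxy host and port, with the same default-port fallback used in the new code:

# Illustrative only -- not part of this patch; the proxy address is hypothetical.
from urllib.parse import urlparse

from scrapy.utils.url import add_http_if_no_scheme

proxy = add_http_if_no_scheme("proxy.example.com:3128")  # -> "http://proxy.example.com:3128"
proxy_parsed = urlparse(proxy)
proxy_host = proxy_parsed.hostname  # "proxy.example.com"
proxy_port = proxy_parsed.port or (443 if proxy_parsed.scheme == "https" else 80)  # 3128
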
@@ -8,8 +8,8 @@ from twisted.internet.error import TimeoutError
 from twisted.web.client import URI

 from scrapy.core.downloader.contextfactory import load_context_factory_from_settings
-from scrapy.core.downloader.webclient import _parse
 from scrapy.core.http2.agent import H2Agent, H2ConnectionPool, ScrapyProxyH2Agent
+from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.python import to_bytes

 if TYPE_CHECKING:
@@ -75,10 +75,7 @@ class ScrapyH2Agent:
         bind_address = request.meta.get("bindaddress") or self._bind_address
         proxy = request.meta.get("proxy")
         if proxy:
-            _, _, proxy_host, proxy_port, proxy_params = _parse(proxy)
-            scheme = _parse(request.url)[0]
-
-            if scheme == b"https":
+            if urlparse_cached(request).scheme == "https":
                 # ToDo
                 raise NotImplementedError(
                     "Tunneling via CONNECT method using HTTP/2.0 is not yet supported"
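
Note on the scheme check above: urlparse_cached() (from scrapy.utils.httpobj) caches the urllib.parse.urlparse() result for a request object and returns str components, which is why the comparison is now against "https" rather than b"https". A minimal illustrative sketch (the request URL is made up):

# Illustrative only -- not part of this patch; the URL is hypothetical.
from scrapy import Request
from scrapy.utils.httpobj import urlparse_cached

request = Request("https://example.com/page")
if urlparse_cached(request).scheme == "https":
    pass  # the HTTP/2 agent above raises NotImplementedError for proxied HTTPS
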
@@ -2,11 +2,10 @@

 from __future__ import annotations

-import re
 import warnings
 from time import time
 from typing import TYPE_CHECKING
-from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse
+from urllib.parse import urldefrag, urlparse, urlunparse

 from twisted.internet import defer
 from twisted.internet.protocol import ClientFactory
@@ -22,34 +21,6 @@ if TYPE_CHECKING:
     from scrapy import Request


-def _parsed_url_args(parsed: ParseResult) -> tuple[bytes, bytes, bytes, int, bytes]:
-    # Assume parsed is urlparse-d from Request.url,
-    # which was passed via safe_url_string and is ascii-only.
-    path_str = urlunparse(("", "", parsed.path or "/", parsed.params, parsed.query, ""))
-    path = to_bytes(path_str, encoding="ascii")
-    assert parsed.hostname is not None
-    host = to_bytes(parsed.hostname, encoding="ascii")
-    port = parsed.port
-    scheme = to_bytes(parsed.scheme, encoding="ascii")
-    netloc = to_bytes(parsed.netloc, encoding="ascii")
-    if port is None:
-        port = 443 if scheme == b"https" else 80
-    return scheme, netloc, host, port, path
-
-
-def _parse(url: str) -> tuple[bytes, bytes, bytes, int, bytes]:
-    """Return tuple of (scheme, netloc, host, port, path),
-    all in bytes except for port which is int.
-    Assume url is from Request.url, which was passed via safe_url_string
-    and is ascii-only.
-    """
-    url = url.strip()
-    if not re.match(r"^\w+://", url):
-        url = "//" + url
-    parsed = urlparse(url)
-    return _parsed_url_args(parsed)
-
-
 class ScrapyHTTPPageGetter(HTTPClient):
     delimiter = b"\n"

@@ -142,14 +113,29 @@ class ScrapyHTTPClientFactory(ClientFactory):
         )

     def _set_connection_attributes(self, request):
-        parsed = urlparse_cached(request)
-        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(
-            parsed
-        )
         proxy = request.meta.get("proxy")
         if proxy:
-            self.scheme, _, self.host, self.port, _ = _parse(proxy)
+            proxy_parsed = urlparse(to_bytes(proxy, encoding="ascii"))
+            self.scheme = proxy_parsed.scheme
+            self.host = proxy_parsed.hostname
+            self.port = proxy_parsed.port
+            self.netloc = proxy_parsed.netloc
+            if self.port is None:
+                self.port = 443 if proxy_parsed.scheme == b"https" else 80
             self.path = self.url
+        else:
+            parsed = urlparse_cached(request)
+            path_str = urlunparse(
+                ("", "", parsed.path or "/", parsed.params, parsed.query, "")
+            )
+            self.path = to_bytes(path_str, encoding="ascii")
+            assert parsed.hostname is not None
+            self.host = to_bytes(parsed.hostname, encoding="ascii")
+            self.port = parsed.port
+            self.scheme = to_bytes(parsed.scheme, encoding="ascii")
+            self.netloc = to_bytes(parsed.netloc, encoding="ascii")
+            if self.port is None:
+                self.port = 443 if self.scheme == b"https" else 80

     def __init__(self, request: Request, timeout: float = 180):
         warnings.warn(
@@ -785,9 +785,6 @@ class HttpProxyTestCase(unittest.TestCase):
 class Http10ProxyTestCase(HttpProxyTestCase):
     download_handler_cls: type = HTTP10DownloadHandler
-
-    def test_download_with_proxy_https_noconnect(self):
-        raise unittest.SkipTest("noconnect is not supported in HTTP10DownloadHandler")


 class Http11ProxyTestCase(HttpProxyTestCase):
     download_handler_cls: type = HTTP11DownloadHandler
@@ -8,6 +8,7 @@ from __future__ import annotations
 import shutil
 from pathlib import Path
 from tempfile import mkdtemp
+from urllib.parse import urlparse

 import OpenSSL.SSL
 import pytest
@@ -61,72 +62,6 @@ def getPage(url, contextFactory=None, response_transform=None, *args, **kwargs):
     ).deferred


-class ParseUrlTestCase(unittest.TestCase):
-    """Test URL parsing facility and defaults values."""
-
-    def _parse(self, url):
-        f = client.ScrapyHTTPClientFactory(Request(url))
-        return (f.scheme, f.netloc, f.host, f.port, f.path)
-
-    def testParse(self):
-        lip = "127.0.0.1"
-        tests = (
-            (
-                "http://127.0.0.1?c=v&c2=v2#fragment",
-                ("http", lip, lip, 80, "/?c=v&c2=v2"),
-            ),
-            (
-                "http://127.0.0.1/?c=v&c2=v2#fragment",
-                ("http", lip, lip, 80, "/?c=v&c2=v2"),
-            ),
-            (
-                "http://127.0.0.1/foo?c=v&c2=v2#frag",
-                ("http", lip, lip, 80, "/foo?c=v&c2=v2"),
-            ),
-            (
-                "http://127.0.0.1:100?c=v&c2=v2#fragment",
-                ("http", lip + ":100", lip, 100, "/?c=v&c2=v2"),
-            ),
-            (
-                "http://127.0.0.1:100/?c=v&c2=v2#frag",
-                ("http", lip + ":100", lip, 100, "/?c=v&c2=v2"),
-            ),
-            (
-                "http://127.0.0.1:100/foo?c=v&c2=v2#frag",
-                ("http", lip + ":100", lip, 100, "/foo?c=v&c2=v2"),
-            ),
-            ("http://127.0.0.1", ("http", lip, lip, 80, "/")),
-            ("http://127.0.0.1/", ("http", lip, lip, 80, "/")),
-            ("http://127.0.0.1/foo", ("http", lip, lip, 80, "/foo")),
-            ("http://127.0.0.1?param=value", ("http", lip, lip, 80, "/?param=value")),
-            ("http://127.0.0.1/?param=value", ("http", lip, lip, 80, "/?param=value")),
-            (
-                "http://127.0.0.1:12345/foo",
-                ("http", lip + ":12345", lip, 12345, "/foo"),
-            ),
-            ("http://spam:12345/foo", ("http", "spam:12345", "spam", 12345, "/foo")),
-            (
-                "http://spam.test.org/foo",
-                ("http", "spam.test.org", "spam.test.org", 80, "/foo"),
-            ),
-            ("https://127.0.0.1/foo", ("https", lip, lip, 443, "/foo")),
-            (
-                "https://127.0.0.1/?param=value",
-                ("https", lip, lip, 443, "/?param=value"),
-            ),
-            ("https://127.0.0.1:12345/", ("https", lip + ":12345", lip, 12345, "/")),
-            (
-                "http://scrapytest.org/foo ",
-                ("http", "scrapytest.org", "scrapytest.org", 80, "/foo"),
-            ),
-            ("http://egg:7890 ", ("http", "egg:7890", "egg", 7890, "/")),
-        )
-
-        for url, test in tests:
-            test = tuple(to_bytes(x) if not isinstance(x, int) else x for x in test)
-            self.assertEqual(client._parse(url), test, url)
-
-
 @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
 class ScrapyHTTPPageGetterTests(unittest.TestCase):
     def test_earlyHeaders(self):
@@ -388,9 +323,9 @@ class WebClientTestCase(unittest.TestCase):

     def testFactoryInfo(self):
         url = self.getURL("file")
-        _, _, host, port, _ = client._parse(url)
+        parsed = urlparse(url)
         factory = client.ScrapyHTTPClientFactory(Request(url))
-        reactor.connectTCP(to_unicode(host), port, factory)
+        reactor.connectTCP(parsed.hostname, parsed.port, factory)
         return factory.deferred.addCallback(self._cbFactoryInfo, factory)

     def _cbFactoryInfo(self, ignoredResult, factory):