Refactor downloader tests (#6647)
* Make download handler test base classes abstract.
* Small cleanup.
* Don't run the full test suite for special HTTP cases.
* Don't run tests in imported base classes.
* Remove an obsolete service_identity check.
* Move FTP imports back to the top level.
* Simplify the H2DownloadHandler import.
* Forbid pytest 8.2.x.
* Revert "Simplify the H2DownloadHandler import." (reverts commit ed187046ac53c395c7423c0f5e6fb2bc7c27838f)
This commit is contained in:
parent 16e39661e9
commit ba5df629a2
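The central pattern of the refactor is to turn the shared download-handler test classes into abstract bases: the old `download_handler_cls: type = ...` class attribute becomes an abstract property, so each concrete test case states explicitly which handler it exercises. A condensed sketch of that pattern, assembled from the diff below (the server setup and the actual test methods are omitted):

    from abc import ABC, abstractmethod

    from twisted.trial import unittest

    from scrapy.core.downloader.handlers import DownloadHandlerProtocol
    from scrapy.core.downloader.handlers.http10 import HTTP10DownloadHandler
    from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler


    class HttpTestCase(unittest.TestCase, ABC):
        """Shared HTTP download-handler tests, no longer tied to one handler."""

        scheme = "http"

        @property
        @abstractmethod
        def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
            # Concrete test cases must say which download handler they exercise.
            raise NotImplementedError


    class Http10TestCase(HttpTestCase):
        """HTTP 1.0 test case"""

        @property
        def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
            return HTTP10DownloadHandler


    class Http11TestCase(HttpTestCase):
        """HTTP 1.1 test case"""

        @property
        def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
            return HTTP11DownloadHandler

The one-off HTTPS scenarios (wrong hostname, IP-only certificates, custom ciphers) are likewise consolidated under a new SimpleHttpsTest base that issues a single request instead of running the full suite.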
@@ -180,6 +180,7 @@ disable = [
    "unused-argument",
    "unused-import",
    "unused-variable",
    "useless-import-alias", # used as a hint to mypy
    "useless-return", # https://github.com/pylint-dev/pylint/issues/6530
    "wrong-import-position",

@@ -319,6 +320,8 @@ ignore = [
    "D403",
    # `try`-`except` within a loop incurs performance overhead
    "PERF203",
    # Import alias does not rename original package
    "PLC0414",
    # Too many return statements
    "PLR0911",
    # Too many branches

@@ -4,6 +4,7 @@ import contextlib
import os
import shutil
import sys
from abc import ABC, abstractmethod
from pathlib import Path
from tempfile import mkdtemp, mkstemp
from unittest import SkipTest, mock
@@ -12,17 +13,18 @@ import pytest
from testfixtures import LogCapture
from twisted.cred import checkers, credentials, portal
from twisted.internet import defer, error, reactor
from twisted.protocols.ftp import FTPFactory, FTPRealm
from twisted.protocols.policies import WrappingFactory
from twisted.trial import unittest
from twisted.web import resource, server, static, util
from twisted.web._newclient import ResponseFailed
from twisted.web.client import ResponseFailed
from twisted.web.http import _DataLoss
from w3lib.url import path_to_file_uri

from scrapy.core.downloader.handlers import DownloadHandlers
from scrapy.core.downloader.handlers import DownloadHandlerProtocol, DownloadHandlers
from scrapy.core.downloader.handlers.datauri import DataURIDownloadHandler
from scrapy.core.downloader.handlers.file import FileDownloadHandler
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler
from scrapy.core.downloader.handlers.http10 import HTTP10DownloadHandler
from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler
from scrapy.core.downloader.handlers.s3 import S3DownloadHandler
@@ -183,10 +185,7 @@ class BrokenDownloadResource(resource.Resource):
def closeConnection(request):
    # We have to force a disconnection for HTTP/1.1 clients. Otherwise
    # client keeps the connection open waiting for more data.
    if hasattr(request.channel, "loseConnection"): # twisted >=16.3.0
        request.channel.loseConnection()
    else:
        request.channel.transport.loseConnection()
    request.channel.loseConnection()
    request.finish()

@@ -218,14 +217,18 @@ class DuplicateHeaderResource(resource.Resource):
        return b""


class HttpTestCase(unittest.TestCase):
class HttpTestCase(unittest.TestCase, ABC):
    scheme = "http"
    download_handler_cls: type = HTTPDownloadHandler

    # only used for HTTPS tests
    keyfile = "keys/localhost.key"
    certfile = "keys/localhost.crt"

    @property
    @abstractmethod
    def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
        raise NotImplementedError

    def setUp(self):
        self.tmpname = Path(mkdtemp())
        (self.tmpname / "file").write_bytes(b"0123456789")
@@ -426,7 +429,9 @@ class HttpTestCase(unittest.TestCase):
class Http10TestCase(HttpTestCase):
    """HTTP 1.0 test case"""

    download_handler_cls: type = HTTP10DownloadHandler
    @property
    def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
        return HTTP10DownloadHandler

    def test_protocol(self):
        request = Request(self.getURL("host"), method="GET")
@@ -443,7 +448,9 @@ class Https10TestCase(Http10TestCase):
class Http11TestCase(HttpTestCase):
    """HTTP 1.1 test case"""

    download_handler_cls: type = HTTP11DownloadHandler
    @property
    def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
        return HTTP11DownloadHandler

    def test_download_without_maxsize_limit(self):
        request = Request(self.getURL("file"))
@@ -604,50 +611,16 @@ class Https11TestCase(Http11TestCase):
        yield download_handler.close()


class Https11WrongHostnameTestCase(Http11TestCase):
    scheme = "https"

    # above tests use a server certificate for "localhost",
    # client connection to "localhost" too.
    # here we test that even if the server certificate is for another domain,
    # "www.example.com" in this case,
    # the tests still pass
    keyfile = "keys/example-com.key.pem"
    certfile = "keys/example-com.cert.pem"


class Https11InvalidDNSId(Https11TestCase):
    """Connect to HTTPS hosts with IP while certificate uses domain names IDs."""

    def setUp(self):
        super().setUp()
        self.host = "127.0.0.1"


class Https11InvalidDNSPattern(Https11TestCase):
    """Connect to HTTPS hosts where the certificate are issued to an ip instead of a domain."""

    keyfile = "keys/localhost.ip.key"
    certfile = "keys/localhost.ip.crt"

    def setUp(self):
        try:
            from service_identity.exceptions import CertificateError # noqa: F401
        except ImportError:
            raise unittest.SkipTest("cryptography lib is too old")
        self.tls_log_message = (
            'SSL connection certificate: issuer "/C=IE/O=Scrapy/CN=127.0.0.1", '
            'subject "/C=IE/O=Scrapy/CN=127.0.0.1"'
        )
        super().setUp()


class Https11CustomCiphers(unittest.TestCase):
    scheme = "https"
    download_handler_cls: type = HTTP11DownloadHandler
class SimpleHttpsTest(unittest.TestCase):
    """Base class for special cases tested with just one simple request"""

    keyfile = "keys/localhost.key"
    certfile = "keys/localhost.crt"
    cipher_string: str | None = None

    @property
    def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
        return HTTP11DownloadHandler

    def setUp(self):
        self.tmpname = Path(mkdtemp())
@@ -659,14 +632,16 @@ class Https11CustomCiphers(unittest.TestCase):
            0,
            self.site,
            ssl_context_factory(
                self.keyfile, self.certfile, cipher_string="CAMELLIA256-SHA"
                self.keyfile, self.certfile, cipher_string=self.cipher_string
            ),
            interface=self.host,
        )
        self.portno = self.port.getHost().port
        crawler = get_crawler(
            settings_dict={"DOWNLOADER_CLIENT_TLS_CIPHERS": "CAMELLIA256-SHA"}
        )
        if self.cipher_string is not None:
            settings_dict = {"DOWNLOADER_CLIENT_TLS_CIPHERS": self.cipher_string}
        else:
            settings_dict = None
        crawler = get_crawler(settings_dict=settings_dict)
        self.download_handler = build_from_crawler(self.download_handler_cls, crawler)
        self.download_request = self.download_handler.download_request

@@ -678,7 +653,7 @@ class Https11CustomCiphers(unittest.TestCase):
        shutil.rmtree(self.tmpname)

    def getURL(self, path):
        return f"{self.scheme}://{self.host}:{self.portno}/{path}"
        return f"https://{self.host}:{self.portno}/{path}"

    def test_download(self):
        request = Request(self.getURL("file"))
@@ -688,10 +663,40 @@ class Https11CustomCiphers(unittest.TestCase):
        return d


class Https11WrongHostnameTestCase(SimpleHttpsTest):
    # above tests use a server certificate for "localhost",
    # client connection to "localhost" too.
    # here we test that even if the server certificate is for another domain,
    # "www.example.com" in this case,
    # the tests still pass
    keyfile = "keys/example-com.key.pem"
    certfile = "keys/example-com.cert.pem"


class Https11InvalidDNSId(SimpleHttpsTest):
    """Connect to HTTPS hosts with IP while certificate uses domain names IDs."""

    def setUp(self):
        super().setUp()
        self.host = "127.0.0.1"


class Https11InvalidDNSPattern(SimpleHttpsTest):
    """Connect to HTTPS hosts where the certificate are issued to an ip instead of a domain."""

    keyfile = "keys/localhost.ip.key"
    certfile = "keys/localhost.ip.crt"


class Https11CustomCiphers(SimpleHttpsTest):
    cipher_string = "CAMELLIA256-SHA"


class Http11MockServerTestCase(unittest.TestCase):
    """HTTP 1.1 test case with MockServer"""

    settings_dict: dict | None = None
    is_secure = False

    @classmethod
    def setUpClass(cls):
@@ -709,7 +714,8 @@ class Http11MockServerTestCase(unittest.TestCase):
        # download it
        yield crawler.crawl(
            seed=Request(
                url=self.mockserver.url("/partial"), meta={"download_maxsize": 1000}
                url=self.mockserver.url("/partial", is_secure=self.is_secure),
                meta={"download_maxsize": 1000},
            )
        )
        failure = crawler.spider.meta["failure"]
@@ -718,7 +724,9 @@ class Http11MockServerTestCase(unittest.TestCase):
    @defer.inlineCallbacks
    def test_download(self):
        crawler = get_crawler(SingleRequestSpider, self.settings_dict)
        yield crawler.crawl(seed=Request(url=self.mockserver.url("")))
        yield crawler.crawl(
            seed=Request(url=self.mockserver.url("", is_secure=self.is_secure))
        )
        failure = crawler.spider.meta.get("failure")
        self.assertTrue(failure is None)
        reason = crawler.spider.meta["close_reason"]
@@ -740,10 +748,14 @@ class UriResource(resource.Resource):
        return b""


class HttpProxyTestCase(unittest.TestCase):
    download_handler_cls: type = HTTPDownloadHandler
class HttpProxyTestCase(unittest.TestCase, ABC):
    expected_http_proxy_request_body = b"http://example.com"

    @property
    @abstractmethod
    def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
        raise NotImplementedError

    def setUp(self):
        site = server.Site(UriResource(), timeout=None)
        wrapper = WrappingFactory(site)
@@ -785,11 +797,15 @@ class HttpProxyTestCase(unittest.TestCase):

@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
class Http10ProxyTestCase(HttpProxyTestCase):
    download_handler_cls: type = HTTP10DownloadHandler
    @property
    def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
        return HTTP10DownloadHandler


class Http11ProxyTestCase(HttpProxyTestCase):
    download_handler_cls: type = HTTP11DownloadHandler
    @property
    def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
        return HTTP11DownloadHandler

    @defer.inlineCallbacks
    def test_download_with_proxy_https_timeout(self):
@@ -1008,10 +1024,6 @@ class BaseFTPTestCase(unittest.TestCase):
    )

    def setUp(self):
        from twisted.protocols.ftp import FTPFactory, FTPRealm

        from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler

        # setup dirs and test file
        self.directory = Path(mkdtemp())
        userdir = self.directory / self.username
@@ -1155,10 +1167,6 @@ class AnonymousFTPTestCase(BaseFTPTestCase):
    req_meta = {}

    def setUp(self):
        from twisted.protocols.ftp import FTPFactory, FTPRealm

        from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler

        # setup dir and test file
        self.directory = Path(mkdtemp())
        for filename, content in self.test_files:
@@ -1,7 +1,7 @@
import json
from unittest import mock, skipIf
from unittest import mock

from pytest import mark
import pytest
from testfixtures import LogCapture
from twisted.internet import defer, error, reactor
from twisted.trial import unittest
@@ -9,30 +9,60 @@ from twisted.web import server
from twisted.web.error import SchemeNotSupported
from twisted.web.http import H2_ENABLED

from scrapy.core.downloader.handlers import DownloadHandlerProtocol
from scrapy.http import Request
from scrapy.spiders import Spider
from scrapy.utils.misc import build_from_crawler
from scrapy.utils.test import get_crawler
from tests.mockserver import ssl_context_factory
from tests.test_downloader_handlers import (
    Http11MockServerTestCase,
    Http11ProxyTestCase,
    Https11CustomCiphers,
    Https11TestCase,
    UriResource,
)

pytestmark = pytest.mark.skipif(
    not H2_ENABLED, reason="HTTP/2 support in Twisted is not enabled"
)

@skipIf(not H2_ENABLED, "HTTP/2 support in Twisted is not enabled")
class Https2TestCase(Https11TestCase):

class BaseTestClasses:
    # A hack to prevent tests from the imported classes to run here too.
    # See https://stackoverflow.com/q/1323455/113586 for other ways.
    from tests.test_downloader_handlers import (
        Http11MockServerTestCase as Http11MockServerTestCase,
    )
    from tests.test_downloader_handlers import (
        Http11ProxyTestCase as Http11ProxyTestCase,
    )
    from tests.test_downloader_handlers import (
        Https11CustomCiphers as Https11CustomCiphers,
    )
    from tests.test_downloader_handlers import (
        Https11InvalidDNSId as Https11InvalidDNSId,
    )
    from tests.test_downloader_handlers import (
        Https11InvalidDNSPattern as Https11InvalidDNSPattern,
    )
    from tests.test_downloader_handlers import (
        Https11TestCase as Https11TestCase,
    )
    from tests.test_downloader_handlers import (
        Https11WrongHostnameTestCase as Https11WrongHostnameTestCase,
    )


def _get_dh() -> type[DownloadHandlerProtocol]:
    from scrapy.core.downloader.handlers.http2 import H2DownloadHandler

    return H2DownloadHandler


class Https2TestCase(BaseTestClasses.Https11TestCase):
    scheme = "https"
    HTTP2_DATALOSS_SKIP_REASON = "Content-Length mismatch raises InvalidBodyLengthError"

    @classmethod
    def setUpClass(cls):
        from scrapy.core.downloader.handlers.http2 import H2DownloadHandler

        cls.download_handler_cls = H2DownloadHandler
    @property
    def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
        return _get_dh()

    def test_protocol(self):
        request = Request(self.getURL("host"), method="GET")
@@ -99,7 +129,7 @@ class Https2TestCase(Https11TestCase):

        return defer.DeferredList([d1, d2])

    @mark.xfail(reason="https://github.com/python-hyper/h2/issues/1247")
    @pytest.mark.xfail(reason="https://github.com/python-hyper/h2/issues/1247")
    def test_connect_request(self):
        request = Request(self.getURL("file"), method="CONNECT")
        d = self.download_request(request, Spider("foo"))
@@ -150,61 +180,31 @@ class Https2TestCase(Https11TestCase):
        return d


class Https2WrongHostnameTestCase(Https2TestCase):
    tls_log_message = (
        'SSL connection certificate: issuer "/C=XW/ST=XW/L=The '
        'Internet/O=Scrapy/CN=www.example.com/emailAddress=test@example.com", '
        'subject "/C=XW/ST=XW/L=The '
        'Internet/O=Scrapy/CN=www.example.com/emailAddress=test@example.com"'
    )

    # above tests use a server certificate for "localhost",
    # client connection to "localhost" too.
    # here we test that even if the server certificate is for another domain,
    # "www.example.com" in this case,
    # the tests still pass
    keyfile = "keys/example-com.key.pem"
    certfile = "keys/example-com.cert.pem"
class Https2WrongHostnameTestCase(BaseTestClasses.Https11WrongHostnameTestCase):
    @property
    def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
        return _get_dh()


class Https2InvalidDNSId(Https2TestCase):
    """Connect to HTTPS hosts with IP while certificate uses domain names IDs."""

    def setUp(self):
        super().setUp()
        self.host = "127.0.0.1"
class Https2InvalidDNSId(BaseTestClasses.Https11InvalidDNSId):
    @property
    def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
        return _get_dh()


class Https2InvalidDNSPattern(Https2TestCase):
    """Connect to HTTPS hosts where the certificate are issued to an ip instead of a domain."""

    keyfile = "keys/localhost.ip.key"
    certfile = "keys/localhost.ip.crt"

    def setUp(self):
        try:
            from service_identity.exceptions import CertificateError # noqa: F401
        except ImportError:
            raise unittest.SkipTest("cryptography lib is too old")
        self.tls_log_message = (
            'SSL connection certificate: issuer "/C=IE/O=Scrapy/CN=127.0.0.1", '
            'subject "/C=IE/O=Scrapy/CN=127.0.0.1"'
        )
        super().setUp()
class Https2InvalidDNSPattern(BaseTestClasses.Https11InvalidDNSPattern):
    @property
    def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
        return _get_dh()


@skipIf(not H2_ENABLED, "HTTP/2 support in Twisted is not enabled")
class Https2CustomCiphers(Https11CustomCiphers):
    scheme = "https"

    @classmethod
    def setUpClass(cls):
        from scrapy.core.downloader.handlers.http2 import H2DownloadHandler

        cls.download_handler_cls = H2DownloadHandler
class Https2CustomCiphers(BaseTestClasses.Https11CustomCiphers):
    @property
    def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
        return _get_dh()


class Http2MockServerTestCase(Http11MockServerTestCase):
class Http2MockServerTestCase(BaseTestClasses.Http11MockServerTestCase):
    """HTTP 2.0 test case with MockServer"""

    settings_dict = {
@@ -212,10 +212,10 @@ class Http2MockServerTestCase(Http11MockServerTestCase):
            "https": "scrapy.core.downloader.handlers.http2.H2DownloadHandler"
        }
    }
    is_secure = True


@skipIf(not H2_ENABLED, "HTTP/2 support in Twisted is not enabled")
class Https2ProxyTestCase(Http11ProxyTestCase):
class Https2ProxyTestCase(BaseTestClasses.Http11ProxyTestCase):
    # only used for HTTPS tests
    keyfile = "keys/localhost.key"
    certfile = "keys/localhost.crt"
@@ -225,11 +225,9 @@ class Https2ProxyTestCase(Http11ProxyTestCase):

    expected_http_proxy_request_body = b"/"

    @classmethod
    def setUpClass(cls):
        from scrapy.core.downloader.handlers.http2 import H2DownloadHandler

        cls.download_handler_cls = H2DownloadHandler
    @property
    def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
        return _get_dh()

    def setUp(self):
        site = server.Site(UriResource(), timeout=None)
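Two smaller patterns in the HTTP/2 module above are easy to miss: a module-level pytestmark skips the whole module when Twisted lacks HTTP/2 support (replacing the repeated @skipIf decorators), and the reused base test cases are imported inside a plain BaseTestClasses holder class so that pytest does not collect and run them a second time from this module. A minimal sketch condensed from the diff (only Https11TestCase is shown; the other bases follow the same shape):

    import pytest
    from twisted.web.http import H2_ENABLED

    from scrapy.core.downloader.handlers import DownloadHandlerProtocol

    # Skip every test in this module when Twisted's HTTP/2 support is unavailable.
    pytestmark = pytest.mark.skipif(
        not H2_ENABLED, reason="HTTP/2 support in Twisted is not enabled"
    )


    class BaseTestClasses:
        # A module-level import of a TestCase subclass would be collected and
        # run again here; tucking the import inside a plain class hides it from
        # test collection while keeping the base available for subclassing.
        # The `X as X` alias marks the import as an intentional re-export
        # (hence the useless-import-alias/PLC0414 ignores added above).
        from tests.test_downloader_handlers import (
            Https11TestCase as Https11TestCase,
        )


    def _get_dh() -> type[DownloadHandlerProtocol]:
        # Imported lazily, as in the diff; the HTTP/2 handler module is only
        # touched when a test actually needs it.
        from scrapy.core.downloader.handlers.http2 import H2DownloadHandler

        return H2DownloadHandler


    class Https2TestCase(BaseTestClasses.Https11TestCase):
        scheme = "https"

        @property
        def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
            return _get_dh()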