1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-06 08:49:32 +00:00

Refactor downloader tests (#6647)

* Make download handler test base classes abstract.

* Small cleanup.

* Don't run the full test suite for special HTTP cases.

* Don't run tests in imported base classes.

* Remove an obsolete service_identity check.

* Move FTP imports back to the top level.

* Simplify the H2DownloadHandler import.

* Forbid pytest 8.2.x.

* Revert "Simplify the H2DownloadHandler import."

This reverts commit ed187046ac53c395c7423c0f5e6fb2bc7c27838f.
This commit is contained in:
Andrey Rakhmatullin 2025-02-03 19:11:47 +04:00 committed by GitHub
parent 16e39661e9
commit ba5df629a2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 149 additions and 140 deletions

View File

@ -180,6 +180,7 @@ disable = [
"unused-argument",
"unused-import",
"unused-variable",
"useless-import-alias", # used as a hint to mypy
"useless-return", # https://github.com/pylint-dev/pylint/issues/6530
"wrong-import-position",
@ -319,6 +320,8 @@ ignore = [
"D403",
# `try`-`except` within a loop incurs performance overhead
"PERF203",
# Import alias does not rename original package
"PLC0414",
# Too many return statements
"PLR0911",
# Too many branches

View File

@ -4,6 +4,7 @@ import contextlib
import os
import shutil
import sys
from abc import ABC, abstractmethod
from pathlib import Path
from tempfile import mkdtemp, mkstemp
from unittest import SkipTest, mock
@ -12,17 +13,18 @@ import pytest
from testfixtures import LogCapture
from twisted.cred import checkers, credentials, portal
from twisted.internet import defer, error, reactor
from twisted.protocols.ftp import FTPFactory, FTPRealm
from twisted.protocols.policies import WrappingFactory
from twisted.trial import unittest
from twisted.web import resource, server, static, util
from twisted.web._newclient import ResponseFailed
from twisted.web.client import ResponseFailed
from twisted.web.http import _DataLoss
from w3lib.url import path_to_file_uri
from scrapy.core.downloader.handlers import DownloadHandlers
from scrapy.core.downloader.handlers import DownloadHandlerProtocol, DownloadHandlers
from scrapy.core.downloader.handlers.datauri import DataURIDownloadHandler
from scrapy.core.downloader.handlers.file import FileDownloadHandler
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler
from scrapy.core.downloader.handlers.http10 import HTTP10DownloadHandler
from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler
from scrapy.core.downloader.handlers.s3 import S3DownloadHandler
@ -183,10 +185,7 @@ class BrokenDownloadResource(resource.Resource):
def closeConnection(request):
# We have to force a disconnection for HTTP/1.1 clients. Otherwise
# client keeps the connection open waiting for more data.
if hasattr(request.channel, "loseConnection"): # twisted >=16.3.0
request.channel.loseConnection()
else:
request.channel.transport.loseConnection()
request.channel.loseConnection()
request.finish()
@ -218,14 +217,18 @@ class DuplicateHeaderResource(resource.Resource):
return b""
class HttpTestCase(unittest.TestCase):
class HttpTestCase(unittest.TestCase, ABC):
scheme = "http"
download_handler_cls: type = HTTPDownloadHandler
# only used for HTTPS tests
keyfile = "keys/localhost.key"
certfile = "keys/localhost.crt"
@property
@abstractmethod
def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
raise NotImplementedError
def setUp(self):
self.tmpname = Path(mkdtemp())
(self.tmpname / "file").write_bytes(b"0123456789")
@ -426,7 +429,9 @@ class HttpTestCase(unittest.TestCase):
class Http10TestCase(HttpTestCase):
"""HTTP 1.0 test case"""
download_handler_cls: type = HTTP10DownloadHandler
@property
def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
return HTTP10DownloadHandler
def test_protocol(self):
request = Request(self.getURL("host"), method="GET")
@ -443,7 +448,9 @@ class Https10TestCase(Http10TestCase):
class Http11TestCase(HttpTestCase):
"""HTTP 1.1 test case"""
download_handler_cls: type = HTTP11DownloadHandler
@property
def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
return HTTP11DownloadHandler
def test_download_without_maxsize_limit(self):
request = Request(self.getURL("file"))
@ -604,50 +611,16 @@ class Https11TestCase(Http11TestCase):
yield download_handler.close()
class Https11WrongHostnameTestCase(Http11TestCase):
scheme = "https"
# above tests use a server certificate for "localhost",
# client connection to "localhost" too.
# here we test that even if the server certificate is for another domain,
# "www.example.com" in this case,
# the tests still pass
keyfile = "keys/example-com.key.pem"
certfile = "keys/example-com.cert.pem"
class Https11InvalidDNSId(Https11TestCase):
"""Connect to HTTPS hosts with IP while certificate uses domain names IDs."""
def setUp(self):
super().setUp()
self.host = "127.0.0.1"
class Https11InvalidDNSPattern(Https11TestCase):
"""Connect to HTTPS hosts where the certificate are issued to an ip instead of a domain."""
keyfile = "keys/localhost.ip.key"
certfile = "keys/localhost.ip.crt"
def setUp(self):
try:
from service_identity.exceptions import CertificateError # noqa: F401
except ImportError:
raise unittest.SkipTest("cryptography lib is too old")
self.tls_log_message = (
'SSL connection certificate: issuer "/C=IE/O=Scrapy/CN=127.0.0.1", '
'subject "/C=IE/O=Scrapy/CN=127.0.0.1"'
)
super().setUp()
class Https11CustomCiphers(unittest.TestCase):
scheme = "https"
download_handler_cls: type = HTTP11DownloadHandler
class SimpleHttpsTest(unittest.TestCase):
"""Base class for special cases tested with just one simple request"""
keyfile = "keys/localhost.key"
certfile = "keys/localhost.crt"
cipher_string: str | None = None
@property
def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
return HTTP11DownloadHandler
def setUp(self):
self.tmpname = Path(mkdtemp())
@ -659,14 +632,16 @@ class Https11CustomCiphers(unittest.TestCase):
0,
self.site,
ssl_context_factory(
self.keyfile, self.certfile, cipher_string="CAMELLIA256-SHA"
self.keyfile, self.certfile, cipher_string=self.cipher_string
),
interface=self.host,
)
self.portno = self.port.getHost().port
crawler = get_crawler(
settings_dict={"DOWNLOADER_CLIENT_TLS_CIPHERS": "CAMELLIA256-SHA"}
)
if self.cipher_string is not None:
settings_dict = {"DOWNLOADER_CLIENT_TLS_CIPHERS": self.cipher_string}
else:
settings_dict = None
crawler = get_crawler(settings_dict=settings_dict)
self.download_handler = build_from_crawler(self.download_handler_cls, crawler)
self.download_request = self.download_handler.download_request
@ -678,7 +653,7 @@ class Https11CustomCiphers(unittest.TestCase):
shutil.rmtree(self.tmpname)
def getURL(self, path):
return f"{self.scheme}://{self.host}:{self.portno}/{path}"
return f"https://{self.host}:{self.portno}/{path}"
def test_download(self):
request = Request(self.getURL("file"))
@ -688,10 +663,40 @@ class Https11CustomCiphers(unittest.TestCase):
return d
class Https11WrongHostnameTestCase(SimpleHttpsTest):
# above tests use a server certificate for "localhost",
# client connection to "localhost" too.
# here we test that even if the server certificate is for another domain,
# "www.example.com" in this case,
# the tests still pass
keyfile = "keys/example-com.key.pem"
certfile = "keys/example-com.cert.pem"
class Https11InvalidDNSId(SimpleHttpsTest):
"""Connect to HTTPS hosts with IP while certificate uses domain names IDs."""
def setUp(self):
super().setUp()
self.host = "127.0.0.1"
class Https11InvalidDNSPattern(SimpleHttpsTest):
"""Connect to HTTPS hosts where the certificate are issued to an ip instead of a domain."""
keyfile = "keys/localhost.ip.key"
certfile = "keys/localhost.ip.crt"
class Https11CustomCiphers(SimpleHttpsTest):
cipher_string = "CAMELLIA256-SHA"
class Http11MockServerTestCase(unittest.TestCase):
"""HTTP 1.1 test case with MockServer"""
settings_dict: dict | None = None
is_secure = False
@classmethod
def setUpClass(cls):
@ -709,7 +714,8 @@ class Http11MockServerTestCase(unittest.TestCase):
# download it
yield crawler.crawl(
seed=Request(
url=self.mockserver.url("/partial"), meta={"download_maxsize": 1000}
url=self.mockserver.url("/partial", is_secure=self.is_secure),
meta={"download_maxsize": 1000},
)
)
failure = crawler.spider.meta["failure"]
@ -718,7 +724,9 @@ class Http11MockServerTestCase(unittest.TestCase):
@defer.inlineCallbacks
def test_download(self):
crawler = get_crawler(SingleRequestSpider, self.settings_dict)
yield crawler.crawl(seed=Request(url=self.mockserver.url("")))
yield crawler.crawl(
seed=Request(url=self.mockserver.url("", is_secure=self.is_secure))
)
failure = crawler.spider.meta.get("failure")
self.assertTrue(failure is None)
reason = crawler.spider.meta["close_reason"]
@ -740,10 +748,14 @@ class UriResource(resource.Resource):
return b""
class HttpProxyTestCase(unittest.TestCase):
download_handler_cls: type = HTTPDownloadHandler
class HttpProxyTestCase(unittest.TestCase, ABC):
expected_http_proxy_request_body = b"http://example.com"
@property
@abstractmethod
def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
raise NotImplementedError
def setUp(self):
site = server.Site(UriResource(), timeout=None)
wrapper = WrappingFactory(site)
@ -785,11 +797,15 @@ class HttpProxyTestCase(unittest.TestCase):
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
class Http10ProxyTestCase(HttpProxyTestCase):
download_handler_cls: type = HTTP10DownloadHandler
@property
def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
return HTTP10DownloadHandler
class Http11ProxyTestCase(HttpProxyTestCase):
download_handler_cls: type = HTTP11DownloadHandler
@property
def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
return HTTP11DownloadHandler
@defer.inlineCallbacks
def test_download_with_proxy_https_timeout(self):
@ -1008,10 +1024,6 @@ class BaseFTPTestCase(unittest.TestCase):
)
def setUp(self):
from twisted.protocols.ftp import FTPFactory, FTPRealm
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler
# setup dirs and test file
self.directory = Path(mkdtemp())
userdir = self.directory / self.username
@ -1155,10 +1167,6 @@ class AnonymousFTPTestCase(BaseFTPTestCase):
req_meta = {}
def setUp(self):
from twisted.protocols.ftp import FTPFactory, FTPRealm
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler
# setup dir and test file
self.directory = Path(mkdtemp())
for filename, content in self.test_files:

View File

@ -1,7 +1,7 @@
import json
from unittest import mock, skipIf
from unittest import mock
from pytest import mark
import pytest
from testfixtures import LogCapture
from twisted.internet import defer, error, reactor
from twisted.trial import unittest
@ -9,30 +9,60 @@ from twisted.web import server
from twisted.web.error import SchemeNotSupported
from twisted.web.http import H2_ENABLED
from scrapy.core.downloader.handlers import DownloadHandlerProtocol
from scrapy.http import Request
from scrapy.spiders import Spider
from scrapy.utils.misc import build_from_crawler
from scrapy.utils.test import get_crawler
from tests.mockserver import ssl_context_factory
from tests.test_downloader_handlers import (
Http11MockServerTestCase,
Http11ProxyTestCase,
Https11CustomCiphers,
Https11TestCase,
UriResource,
)
pytestmark = pytest.mark.skipif(
not H2_ENABLED, reason="HTTP/2 support in Twisted is not enabled"
)
@skipIf(not H2_ENABLED, "HTTP/2 support in Twisted is not enabled")
class Https2TestCase(Https11TestCase):
class BaseTestClasses:
# A hack to prevent tests from the imported classes to run here too.
# See https://stackoverflow.com/q/1323455/113586 for other ways.
from tests.test_downloader_handlers import (
Http11MockServerTestCase as Http11MockServerTestCase,
)
from tests.test_downloader_handlers import (
Http11ProxyTestCase as Http11ProxyTestCase,
)
from tests.test_downloader_handlers import (
Https11CustomCiphers as Https11CustomCiphers,
)
from tests.test_downloader_handlers import (
Https11InvalidDNSId as Https11InvalidDNSId,
)
from tests.test_downloader_handlers import (
Https11InvalidDNSPattern as Https11InvalidDNSPattern,
)
from tests.test_downloader_handlers import (
Https11TestCase as Https11TestCase,
)
from tests.test_downloader_handlers import (
Https11WrongHostnameTestCase as Https11WrongHostnameTestCase,
)
def _get_dh() -> type[DownloadHandlerProtocol]:
from scrapy.core.downloader.handlers.http2 import H2DownloadHandler
return H2DownloadHandler
class Https2TestCase(BaseTestClasses.Https11TestCase):
scheme = "https"
HTTP2_DATALOSS_SKIP_REASON = "Content-Length mismatch raises InvalidBodyLengthError"
@classmethod
def setUpClass(cls):
from scrapy.core.downloader.handlers.http2 import H2DownloadHandler
cls.download_handler_cls = H2DownloadHandler
@property
def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
return _get_dh()
def test_protocol(self):
request = Request(self.getURL("host"), method="GET")
@ -99,7 +129,7 @@ class Https2TestCase(Https11TestCase):
return defer.DeferredList([d1, d2])
@mark.xfail(reason="https://github.com/python-hyper/h2/issues/1247")
@pytest.mark.xfail(reason="https://github.com/python-hyper/h2/issues/1247")
def test_connect_request(self):
request = Request(self.getURL("file"), method="CONNECT")
d = self.download_request(request, Spider("foo"))
@ -150,61 +180,31 @@ class Https2TestCase(Https11TestCase):
return d
class Https2WrongHostnameTestCase(Https2TestCase):
tls_log_message = (
'SSL connection certificate: issuer "/C=XW/ST=XW/L=The '
'Internet/O=Scrapy/CN=www.example.com/emailAddress=test@example.com", '
'subject "/C=XW/ST=XW/L=The '
'Internet/O=Scrapy/CN=www.example.com/emailAddress=test@example.com"'
)
# above tests use a server certificate for "localhost",
# client connection to "localhost" too.
# here we test that even if the server certificate is for another domain,
# "www.example.com" in this case,
# the tests still pass
keyfile = "keys/example-com.key.pem"
certfile = "keys/example-com.cert.pem"
class Https2WrongHostnameTestCase(BaseTestClasses.Https11WrongHostnameTestCase):
@property
def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
return _get_dh()
class Https2InvalidDNSId(Https2TestCase):
"""Connect to HTTPS hosts with IP while certificate uses domain names IDs."""
def setUp(self):
super().setUp()
self.host = "127.0.0.1"
class Https2InvalidDNSId(BaseTestClasses.Https11InvalidDNSId):
@property
def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
return _get_dh()
class Https2InvalidDNSPattern(Https2TestCase):
"""Connect to HTTPS hosts where the certificate are issued to an ip instead of a domain."""
keyfile = "keys/localhost.ip.key"
certfile = "keys/localhost.ip.crt"
def setUp(self):
try:
from service_identity.exceptions import CertificateError # noqa: F401
except ImportError:
raise unittest.SkipTest("cryptography lib is too old")
self.tls_log_message = (
'SSL connection certificate: issuer "/C=IE/O=Scrapy/CN=127.0.0.1", '
'subject "/C=IE/O=Scrapy/CN=127.0.0.1"'
)
super().setUp()
class Https2InvalidDNSPattern(BaseTestClasses.Https11InvalidDNSPattern):
@property
def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
return _get_dh()
@skipIf(not H2_ENABLED, "HTTP/2 support in Twisted is not enabled")
class Https2CustomCiphers(Https11CustomCiphers):
scheme = "https"
@classmethod
def setUpClass(cls):
from scrapy.core.downloader.handlers.http2 import H2DownloadHandler
cls.download_handler_cls = H2DownloadHandler
class Https2CustomCiphers(BaseTestClasses.Https11CustomCiphers):
@property
def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
return _get_dh()
class Http2MockServerTestCase(Http11MockServerTestCase):
class Http2MockServerTestCase(BaseTestClasses.Http11MockServerTestCase):
"""HTTP 2.0 test case with MockServer"""
settings_dict = {
@ -212,10 +212,10 @@ class Http2MockServerTestCase(Http11MockServerTestCase):
"https": "scrapy.core.downloader.handlers.http2.H2DownloadHandler"
}
}
is_secure = True
@skipIf(not H2_ENABLED, "HTTP/2 support in Twisted is not enabled")
class Https2ProxyTestCase(Http11ProxyTestCase):
class Https2ProxyTestCase(BaseTestClasses.Http11ProxyTestCase):
# only used for HTTPS tests
keyfile = "keys/localhost.key"
certfile = "keys/localhost.crt"
@ -225,11 +225,9 @@ class Https2ProxyTestCase(Http11ProxyTestCase):
expected_http_proxy_request_body = b"/"
@classmethod
def setUpClass(cls):
from scrapy.core.downloader.handlers.http2 import H2DownloadHandler
cls.download_handler_cls = H2DownloadHandler
@property
def download_handler_cls(self) -> type[DownloadHandlerProtocol]:
return _get_dh()
def setUp(self):
site = server.Site(UriResource(), timeout=None)

View File

@ -14,7 +14,7 @@ deps =
pexpect >= 4.8.0
pyftpdlib >= 2.0.1
pygments
pytest
pytest != 8.2.* # https://github.com/pytest-dev/pytest/issues/12275
pytest-cov >= 4.0.0
pytest-xdist
sybil >= 1.3.0 # https://github.com/cjw296/sybil/issues/20#issuecomment-605433422