mirror of https://github.com/scrapy/scrapy.git synced 2025-02-06 11:00:46 +00:00

Remove --keep-runtime-typing from pyupgrade.

Andrey Rakhmatullin 2024-10-17 21:22:34 +05:00
parent c8e87ab21a
commit c9095ef927
122 changed files with 947 additions and 981 deletions

View File

@ -30,7 +30,7 @@ repos:
additional_dependencies:
- black==24.4.2
- repo: https://github.com/asottile/pyupgrade
rev: v3.16.0
rev: v3.18.0
hooks:
- id: pyupgrade
args: [--py39-plus, --keep-runtime-typing]
args: [--py39-plus]
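
With --keep-runtime-typing dropped, the --py39-plus pyupgrade hook now rewrites typing.Optional[X] and typing.Union[X, Y] annotations to the PEP 604 forms X | None and X | Y in any module that carries from __future__ import annotations (which is why the rest of this commit adds that import where it was missing). A minimal before/after sketch of the rewrite, using a hypothetical module rather than code taken from the commit:

from __future__ import annotations  # lets PEP 604 syntax appear in annotations on Python 3.9

from typing import Union

# Before: from typing import Optional, Union
#         def fetch(url: str, timeout: Optional[float] = None) -> Union[bytes, None]: ...
# After running the hook (e.g. pre-commit run pyupgrade --all-files):
def fetch(url: str, timeout: float | None = None) -> bytes | None:
    """Illustrative stub; only the annotations matter here."""
    return None

# Runtime-evaluated expressions such as type aliases are left untouched,
# because X | Y union objects only exist at runtime from Python 3.10 on;
# this is why some modules in the diff below keep Union imported for their aliases.
FetchResult = Union[bytes, None]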

View File

@ -6,7 +6,7 @@ import inspect
import os
import sys
from importlib.metadata import entry_points
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING
import scrapy
from scrapy.commands import BaseRunSpiderCommand, ScrapyCommand, ScrapyHelpFormatter
@ -30,7 +30,7 @@ if TYPE_CHECKING:
class ScrapyArgumentParser(argparse.ArgumentParser):
def _parse_optional(
self, arg_string: str
) -> Optional[tuple[Optional[argparse.Action], str, Optional[str]]]:
) -> tuple[argparse.Action | None, str, str | None] | None:
# if starts with -: it means that is a parameter not a argument
if arg_string[:2] == "-:":
return None
@ -89,7 +89,7 @@ def _get_commands_dict(
return cmds
def _pop_command_name(argv: list[str]) -> Optional[str]:
def _pop_command_name(argv: list[str]) -> str | None:
i = 0
for arg in argv[1:]:
if not arg.startswith("-"):
@ -147,9 +147,7 @@ def _run_print_help(
sys.exit(2)
def execute(
argv: Optional[list[str]] = None, settings: Optional[Settings] = None
) -> None:
def execute(argv: list[str] | None = None, settings: Settings | None = None) -> None:
if argv is None:
argv = sys.argv

View File

@ -8,7 +8,7 @@ import argparse
import builtins
import os
from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any
from twisted.python import failure
@ -23,7 +23,7 @@ if TYPE_CHECKING:
class ScrapyCommand:
requires_project: bool = False
crawler_process: Optional[CrawlerProcess] = None
crawler_process: CrawlerProcess | None = None
# default settings to be used for this command instead of global defaults
default_settings: dict[str, Any] = {}
@ -195,7 +195,7 @@ class ScrapyHelpFormatter(argparse.HelpFormatter):
prog: str,
indent_increment: int = 2,
max_help_position: int = 24,
width: Optional[int] = None,
width: int | None = None,
):
super().__init__(
prog,

View File

@ -1,10 +1,12 @@
from __future__ import annotations
import argparse
import os
import shutil
import string
from importlib import import_module
from pathlib import Path
from typing import Any, Optional, Union, cast
from typing import Any, cast
from urllib.parse import urlparse
import scrapy
@ -140,7 +142,7 @@ class Command(ScrapyCommand):
name: str,
url: str,
template_name: str,
template_file: Union[str, os.PathLike],
template_file: str | os.PathLike,
) -> None:
"""Generate the spider module, based on the given template"""
tvars = self._generate_template_variables(module, name, url, template_name)
@ -161,7 +163,7 @@ class Command(ScrapyCommand):
if spiders_module:
print(f"in module:\n {spiders_module.__name__}.{module}")
def _find_template(self, template: str) -> Optional[Path]:
def _find_template(self, template: str) -> Path | None:
template_file = Path(self.templates_dir, f"{template}.tmpl")
if template_file.exists():
return template_file

View File

@ -5,7 +5,7 @@ import functools
import inspect
import json
import logging
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, overload
from typing import TYPE_CHECKING, Any, TypeVar, overload
from itemadapter import ItemAdapter, is_item
from twisted.internet.defer import Deferred, maybeDeferred
@ -38,10 +38,10 @@ _T = TypeVar("_T")
class Command(BaseRunSpiderCommand):
requires_project = True
spider: Optional[Spider] = None
spider: Spider | None = None
items: dict[int, list[Any]] = {}
requests: dict[int, list[Request]] = {}
spidercls: Optional[type[Spider]]
spidercls: type[Spider] | None
first_response = None
@ -137,13 +137,13 @@ class Command(BaseRunSpiderCommand):
@overload
def iterate_spider_output(
self, result: Union[AsyncGenerator[_T, None], Coroutine[Any, Any, _T]]
self, result: AsyncGenerator[_T] | Coroutine[Any, Any, _T]
) -> Deferred[_T]: ...
@overload
def iterate_spider_output(self, result: _T) -> Iterable[Any]: ...
def iterate_spider_output(self, result: Any) -> Union[Iterable[Any], Deferred[Any]]:
def iterate_spider_output(self, result: Any) -> Iterable[Any] | Deferred[Any]:
if inspect.isasyncgen(result):
d = deferred_from_coro(
collect_asyncgen(aiter_errback(result, self.handle_exception))
@ -164,7 +164,7 @@ class Command(BaseRunSpiderCommand):
old_reqs = self.requests.get(lvl, [])
self.requests[lvl] = old_reqs + new_reqs
def print_items(self, lvl: Optional[int] = None, colour: bool = True) -> None:
def print_items(self, lvl: int | None = None, colour: bool = True) -> None:
if lvl is None:
items = [item for lst in self.items.values() for item in lst]
else:
@ -173,7 +173,7 @@ class Command(BaseRunSpiderCommand):
print("# Scraped Items ", "-" * 60)
display.pprint([ItemAdapter(x).asdict() for x in items], colorize=colour)
def print_requests(self, lvl: Optional[int] = None, colour: bool = True) -> None:
def print_requests(self, lvl: int | None = None, colour: bool = True) -> None:
if lvl is None:
if self.requests:
requests = self.requests[max(self.requests)]
@ -222,7 +222,7 @@ class Command(BaseRunSpiderCommand):
self,
response: Response,
callback: CallbackT,
cb_kwargs: Optional[dict[str, Any]] = None,
cb_kwargs: dict[str, Any] | None = None,
) -> Deferred[Any]:
cb_kwargs = cb_kwargs or {}
d = maybeDeferred(self.iterate_spider_output, callback(response, **cb_kwargs))
@ -230,7 +230,7 @@ class Command(BaseRunSpiderCommand):
def get_callback_from_rules(
self, spider: Spider, response: Response
) -> Union[CallbackT, str, None]:
) -> CallbackT | str | None:
if getattr(spider, "rules", None):
for rule in spider.rules: # type: ignore[attr-defined]
if rule.link_extractor.matches(response.url):
@ -303,9 +303,9 @@ class Command(BaseRunSpiderCommand):
*,
spider: Spider,
opts: argparse.Namespace,
response: Optional[Response] = None,
response: Response | None = None,
) -> CallbackT:
cb: Union[str, CallbackT, None] = None
cb: str | CallbackT | None = None
if response:
cb = response.meta["_callback"]
if not cb:

View File

@ -4,7 +4,7 @@ import argparse
import sys
from importlib import import_module
from pathlib import Path
from typing import TYPE_CHECKING, Union
from typing import TYPE_CHECKING
from scrapy.commands import BaseRunSpiderCommand
from scrapy.exceptions import UsageError
@ -15,7 +15,7 @@ if TYPE_CHECKING:
from types import ModuleType
def _import_file(filepath: Union[str, PathLike[str]]) -> ModuleType:
def _import_file(filepath: str | PathLike[str]) -> ModuleType:
abspath = Path(filepath).resolve()
if abspath.suffix not in (".py", ".pyw"):
raise ValueError(f"Not a Python source file: {abspath}")

View File

@ -1,3 +1,5 @@
from __future__ import annotations
import argparse
import os
import re
@ -6,7 +8,6 @@ from importlib.util import find_spec
from pathlib import Path
from shutil import copy2, copystat, ignore_patterns, move
from stat import S_IWUSR as OWNER_WRITE_PERMISSION
from typing import Union
import scrapy
from scrapy.commands import ScrapyCommand
@ -24,7 +25,7 @@ TEMPLATES_TO_RENDER: tuple[tuple[str, ...], ...] = (
IGNORE = ignore_patterns("*.pyc", "__pycache__", ".svn")
def _make_writable(path: Union[str, os.PathLike]) -> None:
def _make_writable(path: str | os.PathLike) -> None:
current_permissions = os.stat(path).st_mode
os.chmod(path, current_permissions | OWNER_WRITE_PERMISSION)

View File

@ -6,7 +6,7 @@ from collections.abc import AsyncGenerator, Iterable
from functools import wraps
from inspect import getmembers
from types import CoroutineType
from typing import TYPE_CHECKING, Any, Optional, cast
from typing import TYPE_CHECKING, Any, cast
from unittest import TestCase, TestResult
from scrapy.http import Request, Response
@ -24,7 +24,7 @@ if TYPE_CHECKING:
class Contract:
"""Abstract class for contracts"""
request_cls: Optional[type[Request]] = None
request_cls: type[Request] | None = None
name: str
def __init__(self, method: Callable, *args: Any):
@ -126,10 +126,8 @@ class ContractsManager:
return contracts
def from_spider(
self, spider: Spider, results: TestResult
) -> list[Optional[Request]]:
requests: list[Optional[Request]] = []
def from_spider(self, spider: Spider, results: TestResult) -> list[Request | None]:
requests: list[Request | None] = []
for method in self.tested_methods_from_spidercls(type(spider)):
bound_method = spider.__getattribute__(method)
try:
@ -140,7 +138,7 @@ class ContractsManager:
return requests
def from_method(self, method: Callable, results: TestResult) -> Optional[Request]:
def from_method(self, method: Callable, results: TestResult) -> Request | None:
contracts = self.extract_contracts(method)
if contracts:
request_cls = Request

View File

@ -1,5 +1,7 @@
from __future__ import annotations
import json
from typing import Any, Callable, Optional
from typing import Any, Callable
from itemadapter import ItemAdapter, is_item
@ -63,7 +65,7 @@ class ReturnsContract(Contract):
"""
name = "returns"
object_type_verifiers: dict[Optional[str], Callable[[Any], bool]] = {
object_type_verifiers: dict[str | None, Callable[[Any], bool]] = {
"request": lambda x: isinstance(x, Request),
"requests": lambda x: isinstance(x, Request),
"item": is_item,

View File

@ -5,7 +5,7 @@ import warnings
from collections import deque
from datetime import datetime
from time import time
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
from typing import TYPE_CHECKING, Any, TypeVar, cast
from twisted.internet import task
from twisted.internet.defer import Deferred
@ -37,7 +37,7 @@ class Slot:
delay: float,
randomize_delay: bool,
*,
throttle: Optional[bool] = None,
throttle: bool | None = None,
):
self.concurrency: int = concurrency
self.delay: float = delay
@ -119,15 +119,13 @@ class Downloader:
"DOWNLOAD_SLOTS", {}
)
def fetch(
self, request: Request, spider: Spider
) -> Deferred[Union[Response, Request]]:
def fetch(self, request: Request, spider: Spider) -> Deferred[Response | Request]:
def _deactivate(response: _T) -> _T:
self.active.remove(request)
return response
self.active.add(request)
dfd: Deferred[Union[Response, Request]] = self.middleware.download(
dfd: Deferred[Response | Request] = self.middleware.download(
self._enqueue_request, request, spider
)
return dfd.addBoth(_deactivate)
@ -164,7 +162,7 @@ class Downloader:
return key
def _get_slot_key(self, request: Request, spider: Optional[Spider]) -> str:
def _get_slot_key(self, request: Request, spider: Spider | None) -> str:
warnings.warn(
"Use of this protected method is deprecated. Consider using its corresponding public method get_slot_key() instead.",
ScrapyDeprecationWarning,

View File

@ -1,7 +1,7 @@
from __future__ import annotations
import warnings
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any
from OpenSSL import SSL
from twisted.internet._sslverify import _setAcceptableProtocols
@ -49,7 +49,7 @@ class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
self,
method: int = SSL.SSLv23_METHOD,
tls_verbose_logging: bool = False,
tls_ciphers: Optional[str] = None,
tls_ciphers: str | None = None,
*args: Any,
**kwargs: Any,
):
@ -73,7 +73,7 @@ class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
tls_verbose_logging: bool = settings.getbool(
"DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING"
)
tls_ciphers: Optional[str] = settings["DOWNLOADER_CLIENT_TLS_CIPHERS"]
tls_ciphers: str | None = settings["DOWNLOADER_CLIENT_TLS_CIPHERS"]
return cls( # type: ignore[misc]
method=method,
tls_verbose_logging=tls_verbose_logging,

View File

@ -4,7 +4,7 @@ from __future__ import annotations
import logging
from collections.abc import Callable
from typing import TYPE_CHECKING, Any, Optional, Protocol, Union, cast
from typing import TYPE_CHECKING, Any, Protocol, cast
from twisted.internet import defer
@ -35,16 +35,16 @@ class DownloadHandlerProtocol(Protocol):
class DownloadHandlers:
def __init__(self, crawler: Crawler):
self._crawler: Crawler = crawler
self._schemes: dict[str, Union[str, Callable[..., Any]]] = (
self._schemes: dict[str, str | Callable[..., Any]] = (
{}
) # stores acceptable schemes on instancing
self._handlers: dict[str, DownloadHandlerProtocol] = (
{}
) # stores instanced handlers for schemes
self._notconfigured: dict[str, str] = {} # remembers failed handlers
handlers: dict[str, Union[str, Callable[..., Any]]] = without_none_values(
handlers: dict[str, str | Callable[..., Any]] = without_none_values(
cast(
dict[str, Union[str, Callable[..., Any]]],
"dict[str, str | Callable[..., Any]]",
crawler.settings.getwithbase("DOWNLOAD_HANDLERS"),
)
)
@ -54,7 +54,7 @@ class DownloadHandlers:
crawler.signals.connect(self._close, signals.engine_stopped)
def _get_handler(self, scheme: str) -> Optional[DownloadHandlerProtocol]:
def _get_handler(self, scheme: str) -> DownloadHandlerProtocol | None:
"""Lazy-load the downloadhandler for a scheme
only on the first request for that scheme.
"""
@ -70,7 +70,7 @@ class DownloadHandlers:
def _load_handler(
self, scheme: str, skip_lazy: bool = False
) -> Optional[DownloadHandlerProtocol]:
) -> DownloadHandlerProtocol | None:
path = self._schemes[scheme]
try:
dhcls: type[DownloadHandlerProtocol] = load_object(path)

View File

@ -32,7 +32,7 @@ from __future__ import annotations
import re
from io import BytesIO
from typing import TYPE_CHECKING, Any, BinaryIO, Optional
from typing import TYPE_CHECKING, Any, BinaryIO
from urllib.parse import unquote
from twisted.internet.protocol import ClientCreator, Protocol
@ -56,8 +56,8 @@ if TYPE_CHECKING:
class ReceivedDataProtocol(Protocol):
def __init__(self, filename: Optional[str] = None):
self.__filename: Optional[str] = filename
def __init__(self, filename: str | None = None):
self.__filename: str | None = filename
self.body: BinaryIO = open(filename, "wb") if filename else BytesIO()
self.size: int = 0
@ -66,7 +66,7 @@ class ReceivedDataProtocol(Protocol):
self.size += len(data)
@property
def filename(self) -> Optional[str]:
def filename(self) -> str | None:
return self.__filename
def close(self) -> None:

View File

@ -8,7 +8,7 @@ import re
from contextlib import suppress
from io import BytesIO
from time import time
from typing import TYPE_CHECKING, Any, Optional, TypedDict, TypeVar, Union
from typing import TYPE_CHECKING, Any, TypedDict, TypeVar
from urllib.parse import urldefrag, urlunparse
from twisted.internet import ssl
@ -52,10 +52,10 @@ _T = TypeVar("_T")
class _ResultT(TypedDict):
txresponse: TxResponse
body: bytes
flags: Optional[list[str]]
certificate: Optional[ssl.Certificate]
ip_address: Union[ipaddress.IPv4Address, ipaddress.IPv6Address, None]
failure: NotRequired[Optional[Failure]]
flags: list[str] | None
certificate: ssl.Certificate | None
ip_address: ipaddress.IPv4Address | ipaddress.IPv6Address | None
failure: NotRequired[Failure | None]
class HTTP11DownloadHandler:
@ -143,10 +143,10 @@ class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
reactor: ReactorBase,
host: str,
port: int,
proxyConf: tuple[str, int, Optional[bytes]],
proxyConf: tuple[str, int, bytes | None],
contextFactory: IPolicyForHTTPS,
timeout: float = 30,
bindAddress: Optional[tuple[str, int]] = None,
bindAddress: tuple[str, int] | None = None,
):
proxyHost, proxyPort, self._proxyAuthHeader = proxyConf
super().__init__(reactor, proxyHost, proxyPort, timeout, bindAddress)
@ -220,7 +220,7 @@ class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
def tunnel_request_data(
host: str, port: int, proxy_auth_header: Optional[bytes] = None
host: str, port: int, proxy_auth_header: bytes | None = None
) -> bytes:
r"""
Return binary content of a CONNECT request.
@ -254,14 +254,14 @@ class TunnelingAgent(Agent):
self,
*,
reactor: ReactorBase,
proxyConf: tuple[str, int, Optional[bytes]],
proxyConf: tuple[str, int, bytes | None],
contextFactory: IPolicyForHTTPS,
connectTimeout: Optional[float] = None,
bindAddress: Optional[bytes] = None,
pool: Optional[HTTPConnectionPool] = None,
connectTimeout: float | None = None,
bindAddress: bytes | None = None,
pool: HTTPConnectionPool | None = None,
):
super().__init__(reactor, contextFactory, connectTimeout, bindAddress, pool)
self._proxyConf: tuple[str, int, Optional[bytes]] = proxyConf
self._proxyConf: tuple[str, int, bytes | None] = proxyConf
self._contextFactory: IPolicyForHTTPS = contextFactory
def _getEndpoint(self, uri: URI) -> TunnelingTCP4ClientEndpoint:
@ -281,8 +281,8 @@ class TunnelingAgent(Agent):
endpoint: TCP4ClientEndpoint,
method: bytes,
parsedURI: bytes,
headers: Optional[TxHeaders],
bodyProducer: Optional[IBodyProducer],
headers: TxHeaders | None,
bodyProducer: IBodyProducer | None,
requestPath: bytes,
) -> Deferred[TxResponse]:
# proxy host and port are required for HTTP pool `key`
@ -305,9 +305,9 @@ class ScrapyProxyAgent(Agent):
self,
reactor: ReactorBase,
proxyURI: bytes,
connectTimeout: Optional[float] = None,
bindAddress: Optional[bytes] = None,
pool: Optional[HTTPConnectionPool] = None,
connectTimeout: float | None = None,
bindAddress: bytes | None = None,
pool: HTTPConnectionPool | None = None,
):
super().__init__(
reactor=reactor,
@ -321,8 +321,8 @@ class ScrapyProxyAgent(Agent):
self,
method: bytes,
uri: bytes,
headers: Optional[TxHeaders] = None,
bodyProducer: Optional[IBodyProducer] = None,
headers: TxHeaders | None = None,
bodyProducer: IBodyProducer | None = None,
) -> Deferred[TxResponse]:
"""
Issue a new request via the configured proxy.
@ -350,8 +350,8 @@ class ScrapyAgent:
*,
contextFactory: IPolicyForHTTPS,
connectTimeout: float = 10,
bindAddress: Optional[bytes] = None,
pool: Optional[HTTPConnectionPool] = None,
bindAddress: bytes | None = None,
pool: HTTPConnectionPool | None = None,
maxsize: int = 0,
warnsize: int = 0,
fail_on_dataloss: bool = True,
@ -359,12 +359,12 @@ class ScrapyAgent:
):
self._contextFactory: IPolicyForHTTPS = contextFactory
self._connectTimeout: float = connectTimeout
self._bindAddress: Optional[bytes] = bindAddress
self._pool: Optional[HTTPConnectionPool] = pool
self._bindAddress: bytes | None = bindAddress
self._pool: HTTPConnectionPool | None = pool
self._maxsize: int = maxsize
self._warnsize: int = warnsize
self._fail_on_dataloss: bool = fail_on_dataloss
self._txresponse: Optional[TxResponse] = None
self._txresponse: TxResponse | None = None
self._crawler: Crawler = crawler
def _get_agent(self, request: Request, timeout: float) -> Agent:
@ -462,7 +462,7 @@ class ScrapyAgent:
def _cb_bodyready(
self, txresponse: TxResponse, request: Request
) -> Union[_ResultT, Deferred[_ResultT]]:
) -> _ResultT | Deferred[_ResultT]:
headers_received_result = self._crawler.signals.send_catch_log(
signal=signals.headers_received,
headers=self._headers_from_twisted_response(txresponse),
@ -551,7 +551,7 @@ class ScrapyAgent:
def _cb_bodydone(
self, result: _ResultT, request: Request, url: str
) -> Union[Response, Failure]:
) -> Response | Failure:
headers = self._headers_from_twisted_response(result["txresponse"])
respcls = responsetypes.from_args(headers=headers, url=url, body=result["body"])
try:
@ -614,14 +614,12 @@ class _ResponseReader(Protocol):
self._fail_on_dataloss_warned: bool = False
self._reached_warnsize: bool = False
self._bytes_received: int = 0
self._certificate: Optional[ssl.Certificate] = None
self._ip_address: Union[ipaddress.IPv4Address, ipaddress.IPv6Address, None] = (
None
)
self._certificate: ssl.Certificate | None = None
self._ip_address: ipaddress.IPv4Address | ipaddress.IPv6Address | None = None
self._crawler: Crawler = crawler
def _finish_response(
self, flags: Optional[list[str]] = None, failure: Optional[Failure] = None
self, flags: list[str] | None = None, failure: Failure | None = None
) -> None:
self._finished.callback(
{

View File

@ -1,7 +1,7 @@
from __future__ import annotations
from time import time
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING
from urllib.parse import urldefrag
from twisted.internet.error import TimeoutError
@ -60,8 +60,8 @@ class ScrapyH2Agent:
context_factory: IPolicyForHTTPS,
pool: H2ConnectionPool,
connect_timeout: int = 10,
bind_address: Optional[bytes] = None,
crawler: Optional[Crawler] = None,
bind_address: bytes | None = None,
crawler: Crawler | None = None,
) -> None:
self._context_factory = context_factory
self._connect_timeout = connect_timeout
@ -69,7 +69,7 @@ class ScrapyH2Agent:
self._pool = pool
self._crawler = crawler
def _get_agent(self, request: Request, timeout: Optional[float]) -> H2Agent:
def _get_agent(self, request: Request, timeout: float | None) -> H2Agent:
from twisted.internet import reactor
bind_address = request.meta.get("bindaddress") or self._bind_address

View File

@ -1,6 +1,6 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.exceptions import NotConfigured
@ -26,9 +26,9 @@ class S3DownloadHandler:
settings: BaseSettings,
*,
crawler: Crawler,
aws_access_key_id: Optional[str] = None,
aws_secret_access_key: Optional[str] = None,
aws_session_token: Optional[str] = None,
aws_access_key_id: str | None = None,
aws_secret_access_key: str | None = None,
aws_session_token: str | None = None,
httpdownloadhandler: type[HTTPDownloadHandler] = HTTPDownloadHandler,
**kw: Any,
):

View File

@ -7,7 +7,7 @@ See documentation in docs/topics/downloader-middleware.rst
from __future__ import annotations
from collections.abc import Callable
from typing import TYPE_CHECKING, Any, Union, cast
from typing import TYPE_CHECKING, Any, cast
from twisted.internet.defer import Deferred, inlineCallbacks
@ -46,11 +46,11 @@ class DownloaderMiddlewareManager(MiddlewareManager):
download_func: Callable[[Request, Spider], Deferred[Response]],
request: Request,
spider: Spider,
) -> Deferred[Union[Response, Request]]:
) -> Deferred[Response | Request]:
@inlineCallbacks
def process_request(
request: Request,
) -> Generator[Deferred[Any], Any, Union[Response, Request]]:
) -> Generator[Deferred[Any], Any, Response | Request]:
for method in self.methods["process_request"]:
method = cast(Callable, method)
response = yield deferred_from_coro(
@ -69,8 +69,8 @@ class DownloaderMiddlewareManager(MiddlewareManager):
@inlineCallbacks
def process_response(
response: Union[Response, Request]
) -> Generator[Deferred[Any], Any, Union[Response, Request]]:
response: Response | Request,
) -> Generator[Deferred[Any], Any, Response | Request]:
if response is None:
raise TypeError("Received None in process_response")
elif isinstance(response, Request):
@ -93,7 +93,7 @@ class DownloaderMiddlewareManager(MiddlewareManager):
@inlineCallbacks
def process_exception(
failure: Failure,
) -> Generator[Deferred[Any], Any, Union[Failure, Response, Request]]:
) -> Generator[Deferred[Any], Any, Failure | Response | Request]:
exception = failure.value
for method in self.methods["process_exception"]:
method = cast(Callable, method)
@ -111,7 +111,7 @@ class DownloaderMiddlewareManager(MiddlewareManager):
return response
return failure
deferred: Deferred[Union[Response, Request]] = mustbe_deferred(
deferred: Deferred[Response | Request] = mustbe_deferred(
process_request, request
)
deferred.addErrback(process_exception)

View File

@ -2,7 +2,7 @@ from __future__ import annotations
import re
from time import time
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING
from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse
from twisted.internet import defer
@ -144,9 +144,9 @@ class ScrapyHTTPClientFactory(ClientFactory):
# converting to bytes to comply to Twisted interface
self.url: bytes = to_bytes(self._url, encoding="ascii")
self.method: bytes = to_bytes(request.method, encoding="ascii")
self.body: Optional[bytes] = request.body or None
self.body: bytes | None = request.body or None
self.headers: Headers = Headers(request.headers)
self.response_headers: Optional[Headers] = None
self.response_headers: Headers | None = None
self.timeout: float = request.meta.get("download_timeout") or timeout
self.start_time: float = time()
self.deferred: defer.Deferred[Response] = defer.Deferred().addCallback(

View File

@ -9,7 +9,7 @@ from __future__ import annotations
import logging
from time import time
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
from typing import TYPE_CHECKING, Any, TypeVar, cast
from itemadapter import is_item
from twisted.internet.defer import Deferred, inlineCallbacks, succeed
@ -18,7 +18,7 @@ from twisted.python.failure import Failure
from scrapy import signals
from scrapy.core.downloader import Downloader
from scrapy.core.scraper import Scraper
from scrapy.core.scraper import Scraper, _HandleOutputDeferred
from scrapy.exceptions import CloseSpider, DontCloseSpider, IgnoreRequest
from scrapy.http import Request, Response
from scrapy.logformatter import LogFormatter
@ -32,7 +32,6 @@ if TYPE_CHECKING:
from collections.abc import Callable, Generator, Iterable, Iterator
from scrapy.core.scheduler import BaseScheduler
from scrapy.core.scraper import _HandleOutputDeferred
from scrapy.crawler import Crawler
from scrapy.settings import BaseSettings
from scrapy.spiders import Spider
@ -51,9 +50,9 @@ class Slot:
nextcall: CallLaterOnce[None],
scheduler: BaseScheduler,
) -> None:
self.closing: Optional[Deferred[None]] = None
self.closing: Deferred[None] | None = None
self.inprogress: set[Request] = set()
self.start_requests: Optional[Iterator[Request]] = iter(start_requests)
self.start_requests: Iterator[Request] | None = iter(start_requests)
self.close_if_idle: bool = close_if_idle
self.nextcall: CallLaterOnce[None] = nextcall
self.scheduler: BaseScheduler = scheduler
@ -84,15 +83,15 @@ class ExecutionEngine:
def __init__(
self,
crawler: Crawler,
spider_closed_callback: Callable[[Spider], Optional[Deferred[None]]],
spider_closed_callback: Callable[[Spider], Deferred[None] | None],
) -> None:
self.crawler: Crawler = crawler
self.settings: Settings = crawler.settings
self.signals: SignalManager = crawler.signals
assert crawler.logformatter
self.logformatter: LogFormatter = crawler.logformatter
self.slot: Optional[Slot] = None
self.spider: Optional[Spider] = None
self.slot: Slot | None = None
self.spider: Spider | None = None
self.running: bool = False
self.paused: bool = False
self.scheduler_cls: type[BaseScheduler] = self._get_scheduler_class(
@ -101,10 +100,10 @@ class ExecutionEngine:
downloader_cls: type[Downloader] = load_object(self.settings["DOWNLOADER"])
self.downloader: Downloader = downloader_cls(crawler)
self.scraper: Scraper = Scraper(crawler)
self._spider_closed_callback: Callable[[Spider], Optional[Deferred[None]]] = (
self._spider_closed_callback: Callable[[Spider], Deferred[None] | None] = (
spider_closed_callback
)
self.start_time: Optional[float] = None
self.start_time: float | None = None
def _get_scheduler_class(self, settings: BaseSettings) -> type[BaseScheduler]:
from scrapy.core.scheduler import BaseScheduler
@ -218,7 +217,7 @@ class ExecutionEngine:
or self.scraper.slot.needs_backout()
)
def _next_request_from_scheduler(self) -> Optional[Deferred[None]]:
def _next_request_from_scheduler(self) -> Deferred[None] | None:
assert self.slot is not None # typing
assert self.spider is not None # typing
@ -226,7 +225,7 @@ class ExecutionEngine:
if request is None:
return None
d: Deferred[Union[Response, Request]] = self._download(request)
d: Deferred[Response | Request] = self._download(request)
d.addBoth(self._handle_downloader_output, request)
d.addErrback(
lambda f: logger.info(
@ -260,8 +259,8 @@ class ExecutionEngine:
return d2
def _handle_downloader_output(
self, result: Union[Request, Response, Failure], request: Request
) -> Optional[_HandleOutputDeferred]:
self, result: Request | Response | Failure, request: Request
) -> _HandleOutputDeferred | None:
assert self.spider is not None # typing
if not isinstance(result, (Request, Response, Failure)):
@ -323,24 +322,24 @@ class ExecutionEngine:
"""Return a Deferred which fires with a Response as result, only downloader middlewares are applied"""
if self.spider is None:
raise RuntimeError(f"No open spider to crawl: {request}")
d: Deferred[Union[Response, Request]] = self._download(request)
d: Deferred[Response | Request] = self._download(request)
# Deferred.addBoth() overloads don't seem to support a Union[_T, Deferred[_T]] return type
d2: Deferred[Response] = d.addBoth(self._downloaded, request) # type: ignore[call-overload]
return d2
def _downloaded(
self, result: Union[Response, Request, Failure], request: Request
) -> Union[Deferred[Response], Response, Failure]:
self, result: Response | Request | Failure, request: Request
) -> Deferred[Response] | Response | Failure:
assert self.slot is not None # typing
self.slot.remove_request(request)
return self.download(result) if isinstance(result, Request) else result
def _download(self, request: Request) -> Deferred[Union[Response, Request]]:
def _download(self, request: Request) -> Deferred[Response | Request]:
assert self.slot is not None # typing
self.slot.add_request(request)
def _on_success(result: Union[Response, Request]) -> Union[Response, Request]:
def _on_success(result: Response | Request) -> Response | Request:
if not isinstance(result, (Response, Request)):
raise TypeError(
f"Incorrect type: expected Response or Request, got {type(result)}: {result!r}"
@ -368,9 +367,7 @@ class ExecutionEngine:
return _
assert self.spider is not None
dwld: Deferred[Union[Response, Request]] = self.downloader.fetch(
request, self.spider
)
dwld: Deferred[Response | Request] = self.downloader.fetch(request, self.spider)
dwld.addCallback(_on_success)
dwld.addBoth(_on_complete)
return dwld

View File

@ -1,7 +1,7 @@
from __future__ import annotations
from collections import deque
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING
from twisted.internet import defer
from twisted.internet.defer import Deferred
@ -121,8 +121,8 @@ class H2Agent:
reactor: ReactorBase,
pool: H2ConnectionPool,
context_factory: BrowserLikePolicyForHTTPS = BrowserLikePolicyForHTTPS(),
connect_timeout: Optional[float] = None,
bind_address: Optional[bytes] = None,
connect_timeout: float | None = None,
bind_address: bytes | None = None,
) -> None:
self._reactor = reactor
self._pool = pool
@ -165,8 +165,8 @@ class ScrapyProxyH2Agent(H2Agent):
proxy_uri: URI,
pool: H2ConnectionPool,
context_factory: BrowserLikePolicyForHTTPS = BrowserLikePolicyForHTTPS(),
connect_timeout: Optional[float] = None,
bind_address: Optional[bytes] = None,
connect_timeout: float | None = None,
bind_address: bytes | None = None,
) -> None:
super().__init__(
reactor=reactor,

View File

@ -4,7 +4,7 @@ import ipaddress
import itertools
import logging
from collections import deque
from typing import TYPE_CHECKING, Any, Optional, Union
from typing import TYPE_CHECKING, Any
from h2.config import H2Configuration
from h2.connection import H2Connection
@ -63,7 +63,7 @@ class InvalidNegotiatedProtocol(H2Error):
class RemoteTerminatedConnection(H2Error):
def __init__(
self,
remote_ip_address: Optional[Union[IPv4Address, IPv6Address]],
remote_ip_address: IPv4Address | IPv6Address | None,
event: ConnectionTerminated,
) -> None:
self.remote_ip_address = remote_ip_address
@ -74,9 +74,7 @@ class RemoteTerminatedConnection(H2Error):
class MethodNotAllowed405(H2Error):
def __init__(
self, remote_ip_address: Optional[Union[IPv4Address, IPv6Address]]
) -> None:
def __init__(self, remote_ip_address: IPv4Address | IPv6Address | None) -> None:
self.remote_ip_address = remote_ip_address
def __str__(self) -> str:

View File

@ -3,7 +3,7 @@ from __future__ import annotations
import logging
from enum import Enum
from io import BytesIO
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any
from h2.errors import ErrorCodes
from h2.exceptions import H2Error, ProtocolError, StreamClosedError
@ -382,7 +382,7 @@ class Stream:
def close(
self,
reason: StreamCloseReason,
errors: Optional[list[BaseException]] = None,
errors: list[BaseException] | None = None,
from_protocol: bool = False,
) -> None:
"""Based on the reason sent we will handle each case."""

View File

@ -4,7 +4,7 @@ import json
import logging
from abc import abstractmethod
from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional, cast
from typing import TYPE_CHECKING, Any, cast
# working around https://github.com/sphinx-doc/sphinx/issues/10400
from twisted.internet.defer import Deferred # noqa: TC002
@ -73,7 +73,7 @@ class BaseScheduler(metaclass=BaseSchedulerMeta):
"""
return cls()
def open(self, spider: Spider) -> Optional[Deferred[None]]:
def open(self, spider: Spider) -> Deferred[None] | None:
"""
Called when the spider is opened by the engine. It receives the spider
instance as argument and it's useful to execute initialization code.
@ -83,7 +83,7 @@ class BaseScheduler(metaclass=BaseSchedulerMeta):
"""
pass
def close(self, reason: str) -> Optional[Deferred[None]]:
def close(self, reason: str) -> Deferred[None] | None:
"""
Called when the spider is closed by the engine. It receives the reason why the crawl
finished as argument and it's useful to execute cleaning code.
@ -115,7 +115,7 @@ class BaseScheduler(metaclass=BaseSchedulerMeta):
raise NotImplementedError()
@abstractmethod
def next_request(self) -> Optional[Request]:
def next_request(self) -> Request | None:
"""
Return the next :class:`~scrapy.http.Request` to be processed, or ``None``
to indicate that there are no requests to be considered ready at the moment.
@ -181,22 +181,22 @@ class Scheduler(BaseScheduler):
def __init__(
self,
dupefilter: BaseDupeFilter,
jobdir: Optional[str] = None,
dqclass: Optional[type[BaseQueue]] = None,
mqclass: Optional[type[BaseQueue]] = None,
jobdir: str | None = None,
dqclass: type[BaseQueue] | None = None,
mqclass: type[BaseQueue] | None = None,
logunser: bool = False,
stats: Optional[StatsCollector] = None,
pqclass: Optional[type[ScrapyPriorityQueue]] = None,
crawler: Optional[Crawler] = None,
stats: StatsCollector | None = None,
pqclass: type[ScrapyPriorityQueue] | None = None,
crawler: Crawler | None = None,
):
self.df: BaseDupeFilter = dupefilter
self.dqdir: Optional[str] = self._dqdir(jobdir)
self.pqclass: Optional[type[ScrapyPriorityQueue]] = pqclass
self.dqclass: Optional[type[BaseQueue]] = dqclass
self.mqclass: Optional[type[BaseQueue]] = mqclass
self.dqdir: str | None = self._dqdir(jobdir)
self.pqclass: type[ScrapyPriorityQueue] | None = pqclass
self.dqclass: type[BaseQueue] | None = dqclass
self.mqclass: type[BaseQueue] | None = mqclass
self.logunser: bool = logunser
self.stats: Optional[StatsCollector] = stats
self.crawler: Optional[Crawler] = crawler
self.stats: StatsCollector | None = stats
self.crawler: Crawler | None = crawler
@classmethod
def from_crawler(cls, crawler: Crawler) -> Self:
@ -218,7 +218,7 @@ class Scheduler(BaseScheduler):
def has_pending_requests(self) -> bool:
return len(self) > 0
def open(self, spider: Spider) -> Optional[Deferred[None]]:
def open(self, spider: Spider) -> Deferred[None] | None:
"""
(1) initialize the memory queue
(2) initialize the disk queue if the ``jobdir`` attribute is a valid directory
@ -226,10 +226,10 @@ class Scheduler(BaseScheduler):
"""
self.spider: Spider = spider
self.mqs: ScrapyPriorityQueue = self._mq()
self.dqs: Optional[ScrapyPriorityQueue] = self._dq() if self.dqdir else None
self.dqs: ScrapyPriorityQueue | None = self._dq() if self.dqdir else None
return self.df.open()
def close(self, reason: str) -> Optional[Deferred[None]]:
def close(self, reason: str) -> Deferred[None] | None:
"""
(1) dump pending requests to disk if there is a disk queue
(2) return the result of the dupefilter's ``close`` method
@ -263,7 +263,7 @@ class Scheduler(BaseScheduler):
self.stats.inc_value("scheduler/enqueued", spider=self.spider)
return True
def next_request(self) -> Optional[Request]:
def next_request(self) -> Request | None:
"""
Return a :class:`~scrapy.http.Request` object from the memory queue,
falling back to the disk queue if the memory queue is empty.
@ -272,7 +272,7 @@ class Scheduler(BaseScheduler):
Increment the appropriate stats, such as: ``scheduler/dequeued``,
``scheduler/dequeued/disk``, ``scheduler/dequeued/memory``.
"""
request: Optional[Request] = self.mqs.pop()
request: Request | None = self.mqs.pop()
assert self.stats is not None
if request is not None:
self.stats.inc_value("scheduler/dequeued/memory", spider=self.spider)
@ -318,7 +318,7 @@ class Scheduler(BaseScheduler):
def _mqpush(self, request: Request) -> None:
self.mqs.push(request)
def _dqpop(self) -> Optional[Request]:
def _dqpop(self) -> Request | None:
if self.dqs is not None:
return self.dqs.pop()
return None
@ -355,7 +355,7 @@ class Scheduler(BaseScheduler):
)
return q
def _dqdir(self, jobdir: Optional[str]) -> Optional[str]:
def _dqdir(self, jobdir: str | None) -> str | None:
"""Return a folder name to keep disk queue state at"""
if jobdir:
dqdir = Path(jobdir, "requests.queue")

View File

@ -6,7 +6,7 @@ from __future__ import annotations
import logging
from collections import deque
from collections.abc import AsyncIterable, Iterator
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
from typing import TYPE_CHECKING, Any, TypeVar, Union, cast
from itemadapter import is_item
from twisted.internet.defer import Deferred, inlineCallbacks
@ -42,11 +42,8 @@ logger = logging.getLogger(__name__)
_T = TypeVar("_T")
_ParallelResult = list[tuple[bool, Iterator[Any]]]
if TYPE_CHECKING:
# parameterized Deferreds require Twisted 21.7.0
_HandleOutputDeferred = Deferred[Union[_ParallelResult, None]]
QueueTuple = tuple[Union[Response, Failure], Request, _HandleOutputDeferred]
_HandleOutputDeferred = Deferred[Union[_ParallelResult, None]]
QueueTuple = tuple[Union[Response, Failure], Request, _HandleOutputDeferred]
class Slot:
@ -60,10 +57,10 @@ class Slot:
self.active: set[Request] = set()
self.active_size: int = 0
self.itemproc_size: int = 0
self.closing: Optional[Deferred[Spider]] = None
self.closing: Deferred[Spider] | None = None
def add_response_request(
self, result: Union[Response, Failure], request: Request
self, result: Response | Failure, request: Request
) -> _HandleOutputDeferred:
deferred: _HandleOutputDeferred = Deferred()
self.queue.append((result, request, deferred))
@ -78,9 +75,7 @@ class Slot:
self.active.add(request)
return response, request, deferred
def finish_response(
self, result: Union[Response, Failure], request: Request
) -> None:
def finish_response(self, result: Response | Failure, request: Request) -> None:
self.active.remove(request)
if isinstance(result, Response):
self.active_size -= max(len(result.body), self.MIN_RESPONSE_SIZE)
@ -96,7 +91,7 @@ class Slot:
class Scraper:
def __init__(self, crawler: Crawler) -> None:
self.slot: Optional[Slot] = None
self.slot: Slot | None = None
self.spidermw: SpiderMiddlewareManager = SpiderMiddlewareManager.from_crawler(
crawler
)
@ -135,7 +130,7 @@ class Scraper:
self.slot.closing.callback(spider)
def enqueue_scrape(
self, result: Union[Response, Failure], request: Request, spider: Spider
self, result: Response | Failure, request: Request, spider: Spider
) -> _HandleOutputDeferred:
if self.slot is None:
raise RuntimeError("Scraper slot not assigned")
@ -167,7 +162,7 @@ class Scraper:
self._scrape(response, request, spider).chainDeferred(deferred)
def _scrape(
self, result: Union[Response, Failure], request: Request, spider: Spider
self, result: Response | Failure, request: Request, spider: Spider
) -> _HandleOutputDeferred:
"""
Handle the downloaded response or failure through the spider callback/errback
@ -176,7 +171,7 @@ class Scraper:
raise TypeError(
f"Incorrect type: expected Response or Failure, got {type(result)}: {result!r}"
)
dfd: Deferred[Union[Iterable[Any], AsyncIterable[Any]]] = self._scrape2(
dfd: Deferred[Iterable[Any] | AsyncIterable[Any]] = self._scrape2(
result, request, spider
) # returns spider's processed output
dfd.addErrback(self.handle_spider_error, request, result, spider)
@ -186,8 +181,8 @@ class Scraper:
return dfd2
def _scrape2(
self, result: Union[Response, Failure], request: Request, spider: Spider
) -> Deferred[Union[Iterable[Any], AsyncIterable[Any]]]:
self, result: Response | Failure, request: Request, spider: Spider
) -> Deferred[Iterable[Any] | AsyncIterable[Any]]:
"""
Handle the different cases of request's result been a Response or a Failure
"""
@ -202,8 +197,8 @@ class Scraper:
return dfd
def call_spider(
self, result: Union[Response, Failure], request: Request, spider: Spider
) -> Deferred[Union[Iterable[Any], AsyncIterable[Any]]]:
self, result: Response | Failure, request: Request, spider: Spider
) -> Deferred[Iterable[Any] | AsyncIterable[Any]]:
dfd: Deferred[Any]
if isinstance(result, Response):
if getattr(result, "request", None) is None:
@ -222,7 +217,7 @@ class Scraper:
if request.errback:
warn_on_generator_with_return_value(spider, request.errback)
dfd.addErrback(request.errback)
dfd2: Deferred[Union[Iterable[Any], AsyncIterable[Any]]] = dfd.addCallback(
dfd2: Deferred[Iterable[Any] | AsyncIterable[Any]] = dfd.addCallback(
iterate_spider_output
)
return dfd2
@ -231,7 +226,7 @@ class Scraper:
self,
_failure: Failure,
request: Request,
response: Union[Response, Failure],
response: Response | Failure,
spider: Spider,
) -> None:
exc = _failure.value
@ -258,14 +253,14 @@ class Scraper:
def handle_spider_output(
self,
result: Union[Iterable[_T], AsyncIterable[_T]],
result: Iterable[_T] | AsyncIterable[_T],
request: Request,
response: Response,
spider: Spider,
) -> _HandleOutputDeferred:
if not result:
return defer_succeed(None)
it: Union[Iterable[_T], AsyncIterable[_T]]
it: Iterable[_T] | AsyncIterable[_T]
dfd: Deferred[_ParallelResult]
if isinstance(result, AsyncIterable):
it = aiter_errback(
@ -296,7 +291,7 @@ class Scraper:
def _process_spidermw_output(
self, output: Any, request: Request, response: Response, spider: Spider
) -> Optional[Deferred[Any]]:
) -> Deferred[Any] | None:
"""Process each Request/Item (given in the output parameter) returned
from the given spider
"""
@ -316,9 +311,7 @@ class Scraper:
)
return None
def start_itemproc(
self, item: Any, *, response: Optional[Response]
) -> Deferred[Any]:
def start_itemproc(self, item: Any, *, response: Response | None) -> Deferred[Any]:
"""Send *item* to the item pipelines for processing.
*response* is the source of the item data. If the item does not come
@ -337,7 +330,7 @@ class Scraper:
download_failure: Failure,
request: Request,
spider: Spider,
) -> Union[Failure, None]:
) -> Failure | None:
"""Log and silence errors that come from the engine (typically download
errors that got propagated thru here).
@ -371,7 +364,7 @@ class Scraper:
return None
def _itemproc_finished(
self, output: Any, item: Any, response: Optional[Response], spider: Spider
self, output: Any, item: Any, response: Response | None, spider: Spider
) -> Deferred[Any]:
"""ItemProcessor finished for the given ``item`` and returned ``output``"""
assert self.slot is not None # typing

View File

@ -10,7 +10,7 @@ import logging
from collections.abc import AsyncIterable, Callable, Iterable
from inspect import isasyncgenfunction, iscoroutine
from itertools import islice
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
from typing import TYPE_CHECKING, Any, TypeVar, Union, cast
from twisted.internet.defer import Deferred, inlineCallbacks
from twisted.python.failure import Failure
@ -76,7 +76,7 @@ class SpiderMiddlewareManager(MiddlewareManager):
response: Response,
request: Request,
spider: Spider,
) -> Union[Iterable[_T], AsyncIterable[_T]]:
) -> Iterable[_T] | AsyncIterable[_T]:
for method in self.methods["process_spider_input"]:
method = cast(Callable, method)
try:
@ -97,10 +97,10 @@ class SpiderMiddlewareManager(MiddlewareManager):
self,
response: Response,
spider: Spider,
iterable: Union[Iterable[_T], AsyncIterable[_T]],
iterable: Iterable[_T] | AsyncIterable[_T],
exception_processor_index: int,
recover_to: Union[MutableChain[_T], MutableAsyncChain[_T]],
) -> Union[Iterable[_T], AsyncIterable[_T]]:
recover_to: MutableChain[_T] | MutableAsyncChain[_T],
) -> Iterable[_T] | AsyncIterable[_T]:
def process_sync(iterable: Iterable[_T]) -> Iterable[_T]:
try:
yield from iterable
@ -142,7 +142,7 @@ class SpiderMiddlewareManager(MiddlewareManager):
spider: Spider,
_failure: Failure,
start_index: int = 0,
) -> Union[Failure, MutableChain[_T], MutableAsyncChain[_T]]:
) -> Failure | MutableChain[_T] | MutableAsyncChain[_T]:
exception = _failure.value
# don't handle _InvalidOutput exception
if isinstance(exception, _InvalidOutput):
@ -158,7 +158,7 @@ class SpiderMiddlewareManager(MiddlewareManager):
if _isiterable(result):
# stop exception handling by handing control over to the
# process_spider_output chain if an iterable has been returned
dfd: Deferred[Union[MutableChain[_T], MutableAsyncChain[_T]]] = (
dfd: Deferred[MutableChain[_T] | MutableAsyncChain[_T]] = (
self._process_spider_output(
response, spider, result, method_index + 1
)
@ -192,12 +192,12 @@ class SpiderMiddlewareManager(MiddlewareManager):
self,
response: Response,
spider: Spider,
result: Union[Iterable[_T], AsyncIterable[_T]],
result: Iterable[_T] | AsyncIterable[_T],
start_index: int = 0,
) -> Generator[Deferred[Any], Any, Union[MutableChain[_T], MutableAsyncChain[_T]]]:
) -> Generator[Deferred[Any], Any, MutableChain[_T] | MutableAsyncChain[_T]]:
# items in this iterable do not need to go through the process_spider_output
# chain, they went through it already from the process_spider_exception method
recovered: Union[MutableChain[_T], MutableAsyncChain[_T]]
recovered: MutableChain[_T] | MutableAsyncChain[_T]
last_result_is_async = isinstance(result, AsyncIterable)
if last_result_is_async:
recovered = MutableAsyncChain()
@ -248,10 +248,10 @@ class SpiderMiddlewareManager(MiddlewareManager):
# might fail directly if the output value is not a generator
result = method(response=response, result=result, spider=spider)
except Exception as ex:
exception_result: Union[
Failure, MutableChain[_T], MutableAsyncChain[_T]
] = self._process_spider_exception(
response, spider, Failure(ex), method_index + 1
exception_result: Failure | MutableChain[_T] | MutableAsyncChain[_T] = (
self._process_spider_exception(
response, spider, Failure(ex), method_index + 1
)
)
if isinstance(exception_result, Failure):
raise
@ -283,9 +283,9 @@ class SpiderMiddlewareManager(MiddlewareManager):
self,
response: Response,
spider: Spider,
result: Union[Iterable[_T], AsyncIterable[_T]],
) -> Union[MutableChain[_T], MutableAsyncChain[_T]]:
recovered: Union[MutableChain[_T], MutableAsyncChain[_T]]
result: Iterable[_T] | AsyncIterable[_T],
) -> MutableChain[_T] | MutableAsyncChain[_T]:
recovered: MutableChain[_T] | MutableAsyncChain[_T]
if isinstance(result, AsyncIterable):
recovered = MutableAsyncChain()
else:
@ -293,7 +293,7 @@ class SpiderMiddlewareManager(MiddlewareManager):
result = self._evaluate_iterable(response, spider, result, 0, recovered)
result = await maybe_deferred_to_future(
cast(
"Deferred[Union[Iterable[_T], AsyncIterable[_T]]]",
"Deferred[Iterable[_T] | AsyncIterable[_T]]",
self._process_spider_output(response, spider, result),
)
)
@ -310,22 +310,22 @@ class SpiderMiddlewareManager(MiddlewareManager):
response: Response,
request: Request,
spider: Spider,
) -> Deferred[Union[MutableChain[_T], MutableAsyncChain[_T]]]:
) -> Deferred[MutableChain[_T] | MutableAsyncChain[_T]]:
async def process_callback_output(
result: Union[Iterable[_T], AsyncIterable[_T]]
) -> Union[MutableChain[_T], MutableAsyncChain[_T]]:
result: Iterable[_T] | AsyncIterable[_T],
) -> MutableChain[_T] | MutableAsyncChain[_T]:
return await self._process_callback_output(response, spider, result)
def process_spider_exception(
_failure: Failure,
) -> Union[Failure, MutableChain[_T], MutableAsyncChain[_T]]:
) -> Failure | MutableChain[_T] | MutableAsyncChain[_T]:
return self._process_spider_exception(response, spider, _failure)
dfd: Deferred[Union[Iterable[_T], AsyncIterable[_T]]] = mustbe_deferred(
dfd: Deferred[Iterable[_T] | AsyncIterable[_T]] = mustbe_deferred(
self._process_spider_input, scrape_func, response, request, spider
)
dfd2: Deferred[Union[MutableChain[_T], MutableAsyncChain[_T]]] = (
dfd.addCallback(deferred_f_from_coro_f(process_callback_output))
dfd2: Deferred[MutableChain[_T] | MutableAsyncChain[_T]] = dfd.addCallback(
deferred_f_from_coro_f(process_callback_output)
)
dfd2.addErrback(process_spider_exception)
return dfd2
@ -339,10 +339,10 @@ class SpiderMiddlewareManager(MiddlewareManager):
@staticmethod
def _get_async_method_pair(
mw: Any, methodname: str
) -> Union[None, Callable, tuple[Callable, Callable]]:
normal_method: Optional[Callable] = getattr(mw, methodname, None)
) -> None | Callable | tuple[Callable, Callable]:
normal_method: Callable | None = getattr(mw, methodname, None)
methodname_async = methodname + "_async"
async_method: Optional[Callable] = getattr(mw, methodname_async, None)
async_method: Callable | None = getattr(mw, methodname_async, None)
if not async_method:
return normal_method
if not normal_method:

View File

@ -4,7 +4,7 @@ import logging
import pprint
import signal
import warnings
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
from typing import TYPE_CHECKING, Any, TypeVar, cast
from twisted.internet.defer import (
Deferred,
@ -57,7 +57,7 @@ class Crawler:
def __init__(
self,
spidercls: type[Spider],
settings: Union[None, dict[str, Any], Settings] = None,
settings: None | dict[str, Any] | Settings = None,
init_reactor: bool = False,
):
if isinstance(spidercls, Spider):
@ -78,12 +78,12 @@ class Crawler:
self.crawling: bool = False
self._started: bool = False
self.extensions: Optional[ExtensionManager] = None
self.stats: Optional[StatsCollector] = None
self.logformatter: Optional[LogFormatter] = None
self.request_fingerprinter: Optional[RequestFingerprinter] = None
self.spider: Optional[Spider] = None
self.engine: Optional[ExecutionEngine] = None
self.extensions: ExtensionManager | None = None
self.stats: StatsCollector | None = None
self.logformatter: LogFormatter | None = None
self.request_fingerprinter: RequestFingerprinter | None = None
self.spider: Spider | None = None
self.engine: ExecutionEngine | None = None
def _update_root_log_handler(self) -> None:
if get_scrapy_root_handler() is not None:
@ -181,16 +181,16 @@ class Crawler:
@staticmethod
def _get_component(
component_class: type[_T], components: Iterable[Any]
) -> Optional[_T]:
) -> _T | None:
for component in components:
if isinstance(component, component_class):
return component
return None
def get_addon(self, cls: type[_T]) -> Optional[_T]:
def get_addon(self, cls: type[_T]) -> _T | None:
return self._get_component(cls, self.addons.addons)
def get_downloader_middleware(self, cls: type[_T]) -> Optional[_T]:
def get_downloader_middleware(self, cls: type[_T]) -> _T | None:
if not self.engine:
raise RuntimeError(
"Crawler.get_downloader_middleware() can only be called after "
@ -198,7 +198,7 @@ class Crawler:
)
return self._get_component(cls, self.engine.downloader.middleware.middlewares)
def get_extension(self, cls: type[_T]) -> Optional[_T]:
def get_extension(self, cls: type[_T]) -> _T | None:
if not self.extensions:
raise RuntimeError(
"Crawler.get_extension() can only be called after the "
@ -206,7 +206,7 @@ class Crawler:
)
return self._get_component(cls, self.extensions.middlewares)
def get_item_pipeline(self, cls: type[_T]) -> Optional[_T]:
def get_item_pipeline(self, cls: type[_T]) -> _T | None:
if not self.engine:
raise RuntimeError(
"Crawler.get_item_pipeline() can only be called after the "
@ -214,7 +214,7 @@ class Crawler:
)
return self._get_component(cls, self.engine.scraper.itemproc.middlewares)
def get_spider_middleware(self, cls: type[_T]) -> Optional[_T]:
def get_spider_middleware(self, cls: type[_T]) -> _T | None:
if not self.engine:
raise RuntimeError(
"Crawler.get_spider_middleware() can only be called after the "
@ -250,7 +250,7 @@ class CrawlerRunner:
verifyClass(ISpiderLoader, loader_cls)
return cast("SpiderLoader", loader_cls.from_settings(settings.frozencopy()))
def __init__(self, settings: Union[dict[str, Any], Settings, None] = None):
def __init__(self, settings: dict[str, Any] | Settings | None = None):
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
self.settings: Settings = settings
@ -261,7 +261,7 @@ class CrawlerRunner:
def crawl(
self,
crawler_or_spidercls: Union[type[Spider], str, Crawler],
crawler_or_spidercls: type[Spider] | str | Crawler,
*args: Any,
**kwargs: Any,
) -> Deferred[None]:
@ -308,7 +308,7 @@ class CrawlerRunner:
return d.addBoth(_done)
def create_crawler(
self, crawler_or_spidercls: Union[type[Spider], str, Crawler]
self, crawler_or_spidercls: type[Spider] | str | Crawler
) -> Crawler:
"""
Return a :class:`~scrapy.crawler.Crawler` object.
@ -329,7 +329,7 @@ class CrawlerRunner:
return crawler_or_spidercls
return self._create_crawler(crawler_or_spidercls)
def _create_crawler(self, spidercls: Union[str, type[Spider]]) -> Crawler:
def _create_crawler(self, spidercls: str | type[Spider]) -> Crawler:
if isinstance(spidercls, str):
spidercls = self.spider_loader.load(spidercls)
return Crawler(spidercls, self.settings)
@ -380,7 +380,7 @@ class CrawlerProcess(CrawlerRunner):
def __init__(
self,
settings: Union[dict[str, Any], Settings, None] = None,
settings: dict[str, Any] | Settings | None = None,
install_root_handler: bool = True,
):
super().__init__(settings)
@ -409,7 +409,7 @@ class CrawlerProcess(CrawlerRunner):
)
reactor.callFromThread(self._stop_reactor)
def _create_crawler(self, spidercls: Union[type[Spider], str]) -> Crawler:
def _create_crawler(self, spidercls: type[Spider] | str) -> Crawler:
if isinstance(spidercls, str):
spidercls = self.spider_loader.load(spidercls)
init_reactor = not self._initialized_reactor

View File

@ -2,7 +2,7 @@ from __future__ import annotations
import logging
import re
from typing import TYPE_CHECKING, Union
from typing import TYPE_CHECKING
from w3lib import html
@ -43,7 +43,7 @@ class AjaxCrawlMiddleware:
def process_response(
self, request: Request, response: Response, spider: Spider
) -> Union[Request, Response]:
) -> Request | Response:
if not isinstance(response, HtmlResponse) or response.status != 200:
return response

View File

@ -2,7 +2,7 @@ from __future__ import annotations
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Optional, Union
from typing import TYPE_CHECKING, Any
from tldextract import TLDExtract
@ -70,7 +70,7 @@ class CookiesMiddleware:
def process_request(
self, request: Request, spider: Spider
) -> Union[Request, Response, None]:
) -> Request | Response | None:
if request.meta.get("dont_merge_cookies", False):
return None
@ -87,7 +87,7 @@ class CookiesMiddleware:
def process_response(
self, request: Request, response: Response, spider: Spider
) -> Union[Request, Response]:
) -> Request | Response:
if request.meta.get("dont_merge_cookies", False):
return response
@ -123,7 +123,7 @@ class CookiesMiddleware:
msg = f"Received cookies from: {response}\n{cookies}"
logger.debug(msg, extra={"spider": spider})
def _format_cookie(self, cookie: VerboseCookie, request: Request) -> Optional[str]:
def _format_cookie(self, cookie: VerboseCookie, request: Request) -> str | None:
"""
Given a dict consisting of cookie components, return its string representation.
Decode from bytes if necessary.

View File

@ -6,7 +6,7 @@ See documentation in docs/topics/downloader-middleware.rst
from __future__ import annotations
from typing import TYPE_CHECKING, Union
from typing import TYPE_CHECKING
from scrapy.utils.python import without_none_values
@ -32,7 +32,7 @@ class DefaultHeadersMiddleware:
def process_request(
self, request: Request, spider: Spider
) -> Union[Request, Response, None]:
) -> Request | Response | None:
for k, v in self._headers:
request.headers.setdefault(k, v)
return None

View File

@ -6,7 +6,7 @@ See documentation in docs/topics/downloader-middleware.rst
from __future__ import annotations
from typing import TYPE_CHECKING, Union
from typing import TYPE_CHECKING
from scrapy import Request, Spider, signals
@ -33,7 +33,7 @@ class DownloadTimeoutMiddleware:
def process_request(
self, request: Request, spider: Spider
) -> Union[Request, Response, None]:
) -> Request | Response | None:
if self._timeout:
request.meta.setdefault("download_timeout", self._timeout)
return None

View File

@ -6,7 +6,7 @@ See documentation in docs/topics/downloader-middleware.rst
from __future__ import annotations
from typing import TYPE_CHECKING, Union
from typing import TYPE_CHECKING
from w3lib.http import basic_auth_header
@ -40,7 +40,7 @@ class HttpAuthMiddleware:
def process_request(
self, request: Request, spider: Spider
) -> Union[Request, Response, None]:
) -> Request | Response | None:
auth = getattr(self, "auth", None)
if auth and b"Authorization" not in request.headers:
if not self.domain or url_is_from_any_domain(request.url, [self.domain]):

View File

@ -1,7 +1,7 @@
from __future__ import annotations
from email.utils import formatdate
from typing import TYPE_CHECKING, Optional, Union
from typing import TYPE_CHECKING
from twisted.internet import defer
from twisted.internet.error import (
@ -69,7 +69,7 @@ class HttpCacheMiddleware:
def process_request(
self, request: Request, spider: Spider
) -> Union[Request, Response, None]:
) -> Request | Response | None:
if request.meta.get("dont_cache", False):
return None
@ -79,7 +79,7 @@ class HttpCacheMiddleware:
return None
# Look for cached response and check if expired
cachedresponse: Optional[Response] = self.storage.retrieve_response(
cachedresponse: Response | None = self.storage.retrieve_response(
spider, request
)
if cachedresponse is None:
@ -103,7 +103,7 @@ class HttpCacheMiddleware:
def process_response(
self, request: Request, response: Response, spider: Spider
) -> Union[Request, Response]:
) -> Request | Response:
if request.meta.get("dont_cache", False):
return response
@ -118,7 +118,7 @@ class HttpCacheMiddleware:
response.headers["Date"] = formatdate(usegmt=True)
# Do not validate first-hand responses
cachedresponse: Optional[Response] = request.meta.pop("cached_response", None)
cachedresponse: Response | None = request.meta.pop("cached_response", None)
if cachedresponse is None:
self.stats.inc_value("httpcache/firsthand", spider=spider)
self._cache_response(spider, response, request, cachedresponse)
@ -134,8 +134,8 @@ class HttpCacheMiddleware:
def process_exception(
self, request: Request, exception: Exception, spider: Spider
) -> Union[Request, Response, None]:
cachedresponse: Optional[Response] = request.meta.pop("cached_response", None)
) -> Request | Response | None:
cachedresponse: Response | None = request.meta.pop("cached_response", None)
if cachedresponse is not None and isinstance(
exception, self.DOWNLOAD_EXCEPTIONS
):
@ -148,7 +148,7 @@ class HttpCacheMiddleware:
spider: Spider,
response: Response,
request: Request,
cachedresponse: Optional[Response],
cachedresponse: Response | None,
) -> None:
if self.policy.should_cache_response(response, request):
self.stats.inc_value("httpcache/store", spider=spider)

View File

@ -3,7 +3,7 @@ from __future__ import annotations
import warnings
from itertools import chain
from logging import getLogger
from typing import TYPE_CHECKING, Any, Optional, Union
from typing import TYPE_CHECKING, Any
from scrapy import Request, Spider, signals
from scrapy.exceptions import IgnoreRequest, NotConfigured
@ -54,9 +54,9 @@ class HttpCompressionMiddleware:
def __init__(
self,
stats: Optional[StatsCollector] = None,
stats: StatsCollector | None = None,
*,
crawler: Optional[Crawler] = None,
crawler: Crawler | None = None,
):
if not crawler:
self.stats = stats
@ -96,13 +96,13 @@ class HttpCompressionMiddleware:
def process_request(
self, request: Request, spider: Spider
) -> Union[Request, Response, None]:
) -> Request | Response | None:
request.headers.setdefault("Accept-Encoding", b", ".join(ACCEPTED_ENCODINGS))
return None
def process_response(
self, request: Request, response: Response, spider: Spider
) -> Union[Request, Response]:
) -> Request | Response:
if request.method == "HEAD":
return response
if isinstance(response, Response):

View File

@ -1,7 +1,7 @@
from __future__ import annotations
import base64
from typing import TYPE_CHECKING, Optional, Union
from typing import TYPE_CHECKING
from urllib.parse import unquote, urlunparse
from urllib.request import ( # type: ignore[attr-defined]
_parse_proxy,
@ -23,9 +23,9 @@ if TYPE_CHECKING:
class HttpProxyMiddleware:
def __init__(self, auth_encoding: Optional[str] = "latin-1"):
self.auth_encoding: Optional[str] = auth_encoding
self.proxies: dict[str, tuple[Optional[bytes], str]] = {}
def __init__(self, auth_encoding: str | None = "latin-1"):
self.auth_encoding: str | None = auth_encoding
self.proxies: dict[str, tuple[bytes | None, str]] = {}
for type_, url in getproxies().items():
try:
self.proxies[type_] = self._get_proxy(url, type_)
@ -38,7 +38,7 @@ class HttpProxyMiddleware:
def from_crawler(cls, crawler: Crawler) -> Self:
if not crawler.settings.getbool("HTTPPROXY_ENABLED"):
raise NotConfigured
auth_encoding: Optional[str] = crawler.settings.get("HTTPPROXY_AUTH_ENCODING")
auth_encoding: str | None = crawler.settings.get("HTTPPROXY_AUTH_ENCODING")
return cls(auth_encoding)
def _basic_auth_header(self, username: str, password: str) -> bytes:
@ -47,7 +47,7 @@ class HttpProxyMiddleware:
)
return base64.b64encode(user_pass)
def _get_proxy(self, url: str, orig_type: str) -> tuple[Optional[bytes], str]:
def _get_proxy(self, url: str, orig_type: str) -> tuple[bytes | None, str]:
proxy_type, user, password, hostport = _parse_proxy(url)
proxy_url = urlunparse((proxy_type or orig_type, hostport, "", "", "", ""))
@ -60,7 +60,7 @@ class HttpProxyMiddleware:
def process_request(
self, request: Request, spider: Spider
) -> Union[Request, Response, None]:
) -> Request | Response | None:
creds, proxy_url, scheme = None, None, None
if "proxy" in request.meta:
if request.meta["proxy"] is not None:
@ -82,9 +82,9 @@ class HttpProxyMiddleware:
def _set_proxy_and_creds(
self,
request: Request,
proxy_url: Optional[str],
creds: Optional[bytes],
scheme: Optional[str],
proxy_url: str | None,
creds: bytes | None,
scheme: str | None,
) -> None:
if scheme:
request.meta["_scheme_proxy"] = True

View File

@ -1,7 +1,7 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Any, Union, cast
from typing import TYPE_CHECKING, Any, cast
from urllib.parse import urljoin
from w3lib.url import safe_url_string
@ -144,7 +144,7 @@ class RedirectMiddleware(BaseRedirectMiddleware):
def process_response(
self, request: Request, response: Response, spider: Spider
) -> Union[Request, Response]:
) -> Request | Response:
if (
request.meta.get("dont_redirect", False)
or response.status in getattr(spider, "handle_httpstatus_list", [])
@ -185,7 +185,7 @@ class MetaRefreshMiddleware(BaseRedirectMiddleware):
def process_response(
self, request: Request, response: Response, spider: Spider
) -> Union[Request, Response]:
) -> Request | Response:
if (
request.meta.get("dont_redirect", False)
or request.method == "HEAD"

View File

@ -14,7 +14,7 @@ from __future__ import annotations
import warnings
from logging import Logger, getLogger
from typing import TYPE_CHECKING, Any, Optional, Union
from typing import TYPE_CHECKING, Any
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
from scrapy.settings import BaseSettings, Settings
@ -60,12 +60,12 @@ def get_retry_request(
request: Request,
*,
spider: Spider,
reason: Union[str, Exception, type[Exception]] = "unspecified",
max_retry_times: Optional[int] = None,
priority_adjust: Optional[int] = None,
reason: str | Exception | type[Exception] = "unspecified",
max_retry_times: int | None = None,
priority_adjust: int | None = None,
logger: Logger = retry_logger,
stats_base_key: str = "retry",
) -> Optional[Request]:
) -> Request | None:
"""
Returns a new :class:`~scrapy.Request` object to retry the specified
request, or ``None`` if retries of the specified request have been
@ -167,7 +167,7 @@ class RetryMiddleware(metaclass=BackwardsCompatibilityMetaclass):
def process_response(
self, request: Request, response: Response, spider: Spider
) -> Union[Request, Response]:
) -> Request | Response:
if request.meta.get("dont_retry", False):
return response
if response.status in self.retry_http_codes:
@ -177,7 +177,7 @@ class RetryMiddleware(metaclass=BackwardsCompatibilityMetaclass):
def process_exception(
self, request: Request, exception: Exception, spider: Spider
) -> Union[Request, Response, None]:
) -> Request | Response | None:
if isinstance(exception, self.exceptions_to_retry) and not request.meta.get(
"dont_retry", False
):
@ -187,9 +187,9 @@ class RetryMiddleware(metaclass=BackwardsCompatibilityMetaclass):
def _retry(
self,
request: Request,
reason: Union[str, Exception, type[Exception]],
reason: str | Exception | type[Exception],
spider: Spider,
) -> Optional[Request]:
) -> Request | None:
max_retry_times = request.meta.get("max_retry_times", self.max_retry_times)
priority_adjust = request.meta.get("priority_adjust", self.priority_adjust)
return get_retry_request(
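``get_retry_request`` above can also be called directly from spider callbacks. A minimal usage sketch; the spider, the emptiness check, and the reason string are illustrative:

    import scrapy
    from scrapy.downloadermiddlewares.retry import get_retry_request

    class ExampleSpider(scrapy.Spider):
        name = "example"  # hypothetical spider, for illustration only

        def parse(self, response):
            if not response.text:
                # Returns a new Request with retry bookkeeping applied,
                # or None once the retry budget is exhausted.
                return get_retry_request(
                    response.request, spider=self, reason="empty_response"
                )
            # ... normal parsing otherwise ...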

View File

@ -7,7 +7,7 @@ enable this middleware and enable the ROBOTSTXT_OBEY setting.
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Optional, TypeVar, Union
from typing import TYPE_CHECKING, TypeVar
from twisted.internet.defer import Deferred, maybeDeferred
@ -41,13 +41,11 @@ class RobotsTxtMiddleware:
if not crawler.settings.getbool("ROBOTSTXT_OBEY"):
raise NotConfigured
self._default_useragent: str = crawler.settings.get("USER_AGENT", "Scrapy")
self._robotstxt_useragent: Optional[str] = crawler.settings.get(
self._robotstxt_useragent: str | None = crawler.settings.get(
"ROBOTSTXT_USER_AGENT", None
)
self.crawler: Crawler = crawler
self._parsers: dict[
str, Union[RobotParser, Deferred[Optional[RobotParser]], None]
] = {}
self._parsers: dict[str, RobotParser | Deferred[RobotParser | None] | None] = {}
self._parserimpl: RobotParser = load_object(
crawler.settings.get("ROBOTSTXT_PARSER")
)
@ -61,24 +59,24 @@ class RobotsTxtMiddleware:
def process_request(
self, request: Request, spider: Spider
) -> Optional[Deferred[None]]:
) -> Deferred[None] | None:
if request.meta.get("dont_obey_robotstxt"):
return None
if request.url.startswith("data:") or request.url.startswith("file:"):
return None
d: Deferred[Optional[RobotParser]] = maybeDeferred(
d: Deferred[RobotParser | None] = maybeDeferred(
self.robot_parser, request, spider # type: ignore[call-overload]
)
d2: Deferred[None] = d.addCallback(self.process_request_2, request, spider)
return d2
def process_request_2(
self, rp: Optional[RobotParser], request: Request, spider: Spider
self, rp: RobotParser | None, request: Request, spider: Spider
) -> None:
if rp is None:
return
useragent: Union[str, bytes, None] = self._robotstxt_useragent
useragent: str | bytes | None = self._robotstxt_useragent
if not useragent:
useragent = request.headers.get(b"User-Agent", self._default_useragent)
assert useragent is not None
@ -94,7 +92,7 @@ class RobotsTxtMiddleware:
def robot_parser(
self, request: Request, spider: Spider
) -> Union[RobotParser, Deferred[Optional[RobotParser]], None]:
) -> RobotParser | Deferred[RobotParser | None] | None:
url = urlparse_cached(request)
netloc = url.netloc
@ -117,9 +115,9 @@ class RobotsTxtMiddleware:
parser = self._parsers[netloc]
if isinstance(parser, Deferred):
d: Deferred[Optional[RobotParser]] = Deferred()
d: Deferred[RobotParser | None] = Deferred()
def cb(result: Optional[RobotParser]) -> Optional[RobotParser]:
def cb(result: RobotParser | None) -> RobotParser | None:
d.callback(result)
return result

View File

@ -1,6 +1,6 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Union
from typing import TYPE_CHECKING
from twisted.web import http
@ -19,7 +19,7 @@ if TYPE_CHECKING:
def get_header_size(
headers: dict[str, Union[list[Union[str, bytes]], tuple[Union[str, bytes], ...]]]
headers: dict[str, list[str | bytes] | tuple[str | bytes, ...]]
) -> int:
size = 0
for key, value in headers.items():
@ -47,7 +47,7 @@ class DownloaderStats:
def process_request(
self, request: Request, spider: Spider
) -> Union[Request, Response, None]:
) -> Request | Response | None:
self.stats.inc_value("downloader/request_count", spider=spider)
self.stats.inc_value(
f"downloader/request_method_count/{request.method}", spider=spider
@ -58,7 +58,7 @@ class DownloaderStats:
def process_response(
self, request: Request, response: Response, spider: Spider
) -> Union[Request, Response]:
) -> Request | Response:
self.stats.inc_value("downloader/response_count", spider=spider)
self.stats.inc_value(
f"downloader/response_status_count/{response.status}", spider=spider
@ -75,7 +75,7 @@ class DownloaderStats:
def process_exception(
self, request: Request, exception: Exception, spider: Spider
) -> Union[Request, Response, None]:
) -> Request | Response | None:
ex_class = global_object_name(exception.__class__)
self.stats.inc_value("downloader/exception_count", spider=spider)
self.stats.inc_value(

View File

@ -2,7 +2,7 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Union
from typing import TYPE_CHECKING
from scrapy import Request, Spider, signals
@ -31,7 +31,7 @@ class UserAgentMiddleware:
def process_request(
self, request: Request, spider: Spider
) -> Union[Request, Response, None]:
) -> Request | Response | None:
if self.user_agent:
request.headers.setdefault(b"User-Agent", self.user_agent)
return None

View File

@ -2,7 +2,7 @@ from __future__ import annotations
import logging
from pathlib import Path
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING
from scrapy.utils.job import job_dir
from scrapy.utils.request import (
@ -31,10 +31,10 @@ class BaseDupeFilter:
def request_seen(self, request: Request) -> bool:
return False
def open(self) -> Optional[Deferred[None]]:
def open(self) -> Deferred[None] | None:
pass
def close(self, reason: str) -> Optional[Deferred[None]]:
def close(self, reason: str) -> Deferred[None] | None:
pass
def log(self, request: Request, spider: Spider) -> None:
@ -47,10 +47,10 @@ class RFPDupeFilter(BaseDupeFilter):
def __init__(
self,
path: Optional[str] = None,
path: str | None = None,
debug: bool = False,
*,
fingerprinter: Optional[RequestFingerprinterProtocol] = None,
fingerprinter: RequestFingerprinterProtocol | None = None,
) -> None:
self.file = None
self.fingerprinter: RequestFingerprinterProtocol = (
@ -70,7 +70,7 @@ class RFPDupeFilter(BaseDupeFilter):
cls,
settings: BaseSettings,
*,
fingerprinter: Optional[RequestFingerprinterProtocol] = None,
fingerprinter: RequestFingerprinterProtocol | None = None,
) -> Self:
debug = settings.getbool("DUPEFILTER_DEBUG")
return cls(job_dir(settings), debug, fingerprinter=fingerprinter)

View File

@ -2,6 +2,8 @@
Item Exporters are used to export/serialize items into different formats.
"""
from __future__ import annotations
import csv
import marshal
import pickle # nosec
@ -9,7 +11,7 @@ import pprint
from collections.abc import Callable, Iterable, Mapping
from io import BytesIO, TextIOWrapper
from json import JSONEncoder
from typing import Any, Optional, Union
from typing import Any
from xml.sax.saxutils import XMLGenerator # nosec
from xml.sax.xmlreader import AttributesImpl # nosec
@ -41,12 +43,12 @@ class BaseItemExporter:
If dont_fail is set, it won't raise an exception on unexpected options
(useful for using with keyword arguments in subclasses ``__init__`` methods)
"""
self.encoding: Optional[str] = options.pop("encoding", None)
self.fields_to_export: Union[Mapping[str, str], Iterable[str], None] = (
options.pop("fields_to_export", None)
self.encoding: str | None = options.pop("encoding", None)
self.fields_to_export: Mapping[str, str] | Iterable[str] | None = options.pop(
"fields_to_export", None
)
self.export_empty_fields: bool = options.pop("export_empty_fields", False)
self.indent: Optional[int] = options.pop("indent", None)
self.indent: int | None = options.pop("indent", None)
if not dont_fail and options:
raise TypeError(f"Unexpected options: {', '.join(options.keys())}")
@ -54,7 +56,7 @@ class BaseItemExporter:
raise NotImplementedError
def serialize_field(
self, field: Union[Mapping[str, Any], Field], name: str, value: Any
self, field: Mapping[str, Any] | Field, name: str, value: Any
) -> Any:
serializer: Callable[[Any], Any] = field.get("serializer", lambda x: x)
return serializer(value)
@ -66,7 +68,7 @@ class BaseItemExporter:
pass
def _get_serialized_fields(
self, item: Any, default_value: Any = None, include_empty: Optional[bool] = None
self, item: Any, default_value: Any = None, include_empty: bool | None = None
) -> Iterable[tuple[str, Any]]:
"""Return the fields to export as an iterable of tuples
(name, serialized_value)
@ -225,7 +227,7 @@ class CsvItemExporter(BaseItemExporter):
file: BytesIO,
include_headers_line: bool = True,
join_multivalued: str = ",",
errors: Optional[str] = None,
errors: str | None = None,
**kwargs: Any,
):
super().__init__(dont_fail=True, **kwargs)
@ -245,7 +247,7 @@ class CsvItemExporter(BaseItemExporter):
self._join_multivalued = join_multivalued
def serialize_field(
self, field: Union[Mapping[str, Any], Field], name: str, value: Any
self, field: Mapping[str, Any] | Field, name: str, value: Any
) -> Any:
serializer: Callable[[Any], Any] = field.get("serializer", self._join_if_needed)
return serializer(value)
@ -346,7 +348,7 @@ class PythonItemExporter(BaseItemExporter):
self.encoding = "utf-8"
def serialize_field(
self, field: Union[Mapping[str, Any], Field], name: str, value: Any
self, field: Mapping[str, Any] | Field, name: str, value: Any
) -> Any:
serializer: Callable[[Any], Any] = field.get(
"serializer", self._serialize_value
@ -364,10 +366,10 @@ class PythonItemExporter(BaseItemExporter):
return to_unicode(value, encoding=self.encoding)
return value
def _serialize_item(self, item: Any) -> Iterable[tuple[Union[str, bytes], Any]]:
def _serialize_item(self, item: Any) -> Iterable[tuple[str | bytes, Any]]:
for key, value in ItemAdapter(item).items():
yield key, self._serialize_value(value)
def export_item(self, item: Any) -> dict[Union[str, bytes], Any]: # type: ignore[override]
result: dict[Union[str, bytes], Any] = dict(self._get_serialized_fields(item))
def export_item(self, item: Any) -> dict[str | bytes, Any]: # type: ignore[override]
result: dict[str | bytes, Any] = dict(self._get_serialized_fields(item))
return result
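The ``serialize_field`` implementations above all look up a per-field ``serializer`` callable before falling back to a default. A hedged sketch of declaring one on an item; the item and field names are hypothetical:

    import scrapy

    def serialize_price(value):
        # Invoked by the exporters via field.get("serializer", ...)
        return f"$ {float(value):.2f}"

    class Product(scrapy.Item):
        name = scrapy.Field()
        price = scrapy.Field(serializer=serialize_price)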

View File

@ -5,7 +5,7 @@ Extension for collecting core stats like items scraped and start/finish times
from __future__ import annotations
from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any
from scrapy import Spider, signals
@ -20,7 +20,7 @@ if TYPE_CHECKING:
class CoreStats:
def __init__(self, stats: StatsCollector):
self.stats: StatsCollector = stats
self.start_time: Optional[datetime] = None
self.start_time: datetime | None = None
@classmethod
def from_crawler(cls, crawler: Crawler) -> Self:

View File

@ -12,7 +12,7 @@ import sys
import threading
import traceback
from pdb import Pdb
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING
from scrapy.utils.engine import format_engine_status
from scrapy.utils.trackref import format_live_refs
@ -43,7 +43,7 @@ class StackTraceDump:
def from_crawler(cls, crawler: Crawler) -> Self:
return cls(crawler)
def dump_stacktrace(self, signum: int, frame: Optional[FrameType]) -> None:
def dump_stacktrace(self, signum: int, frame: FrameType | None) -> None:
assert self.crawler.engine
log_args = {
"stackdumps": self._thread_stacks(),
@ -75,6 +75,6 @@ class Debugger:
# win32 platforms don't support SIGUSR signals
pass
def _enter_debugger(self, signum: int, frame: Optional[FrameType]) -> None:
def _enter_debugger(self, signum: int, frame: FrameType | None) -> None:
assert frame
Pdb().set_trace(frame.f_back) # noqa: T100

View File

@ -14,7 +14,7 @@ from collections.abc import Callable
from datetime import datetime, timezone
from pathlib import Path, PureWindowsPath
from tempfile import NamedTemporaryFile
from typing import IO, TYPE_CHECKING, Any, Optional, Protocol, TypeVar, Union, cast
from typing import IO, TYPE_CHECKING, Any, Optional, Protocol, TypeVar, cast
from urllib.parse import unquote, urlparse
from twisted.internet.defer import Deferred, DeferredList, maybeDeferred
@ -67,7 +67,7 @@ def build_storage(
builder: Callable[..., _StorageT],
uri: str,
*args: Any,
feed_options: Optional[dict[str, Any]] = None,
feed_options: dict[str, Any] | None = None,
preargs: Iterable[Any] = (),
**kwargs: Any,
) -> _StorageT:
@ -84,10 +84,10 @@ class ItemFilter:
:type feed_options: dict
"""
feed_options: Optional[dict[str, Any]]
feed_options: dict[str, Any] | None
item_classes: tuple[type, ...]
def __init__(self, feed_options: Optional[dict[str, Any]]) -> None:
def __init__(self, feed_options: dict[str, Any] | None) -> None:
self.feed_options = feed_options
if feed_options is not None:
self.item_classes = tuple(
@ -129,7 +129,7 @@ class IFeedStorage(Interface):
class FeedStorageProtocol(Protocol):
"""Reimplementation of ``IFeedStorage`` that can be used in type hints."""
def __init__(self, uri: str, *, feed_options: Optional[dict[str, Any]] = None):
def __init__(self, uri: str, *, feed_options: dict[str, Any] | None = None):
"""Initialize the storage with the parameters given in the URI and the
feed-specific options (see :setting:`FEEDS`)"""
@ -137,7 +137,7 @@ class FeedStorageProtocol(Protocol):
"""Open the storage for the given spider. It must return a file-like
object that will be used for the exporters"""
def store(self, file: IO[bytes]) -> Optional[Deferred[None]]:
def store(self, file: IO[bytes]) -> Deferred[None] | None:
"""Store the given file stream"""
@ -150,7 +150,7 @@ class BlockingFeedStorage:
return NamedTemporaryFile(prefix="feed-", dir=path)
def store(self, file: IO[bytes]) -> Optional[Deferred[None]]:
def store(self, file: IO[bytes]) -> Deferred[None] | None:
return deferToThread(self._store_in_thread, file)
def _store_in_thread(self, file: IO[bytes]) -> None:
@ -162,9 +162,9 @@ class StdoutFeedStorage:
def __init__(
self,
uri: str,
_stdout: Optional[IO[bytes]] = None,
_stdout: IO[bytes] | None = None,
*,
feed_options: Optional[dict[str, Any]] = None,
feed_options: dict[str, Any] | None = None,
):
if not _stdout:
_stdout = sys.stdout.buffer
@ -180,13 +180,13 @@ class StdoutFeedStorage:
def open(self, spider: Spider) -> IO[bytes]:
return self._stdout
def store(self, file: IO[bytes]) -> Optional[Deferred[None]]:
def store(self, file: IO[bytes]) -> Deferred[None] | None:
pass
@implementer(IFeedStorage)
class FileFeedStorage:
def __init__(self, uri: str, *, feed_options: Optional[dict[str, Any]] = None):
def __init__(self, uri: str, *, feed_options: dict[str, Any] | None = None):
self.path: str = file_uri_to_path(uri)
feed_options = feed_options or {}
self.write_mode: OpenBinaryMode = (
@ -199,7 +199,7 @@ class FileFeedStorage:
dirname.mkdir(parents=True)
return Path(self.path).open(self.write_mode)
def store(self, file: IO[bytes]) -> Optional[Deferred[None]]:
def store(self, file: IO[bytes]) -> Deferred[None] | None:
file.close()
return None
@ -208,27 +208,27 @@ class S3FeedStorage(BlockingFeedStorage):
def __init__(
self,
uri: str,
access_key: Optional[str] = None,
secret_key: Optional[str] = None,
acl: Optional[str] = None,
endpoint_url: Optional[str] = None,
access_key: str | None = None,
secret_key: str | None = None,
acl: str | None = None,
endpoint_url: str | None = None,
*,
feed_options: Optional[dict[str, Any]] = None,
session_token: Optional[str] = None,
region_name: Optional[str] = None,
feed_options: dict[str, Any] | None = None,
session_token: str | None = None,
region_name: str | None = None,
):
if not is_botocore_available():
raise NotConfigured("missing botocore library")
u = urlparse(uri)
assert u.hostname
self.bucketname: str = u.hostname
self.access_key: Optional[str] = u.username or access_key
self.secret_key: Optional[str] = u.password or secret_key
self.session_token: Optional[str] = session_token
self.access_key: str | None = u.username or access_key
self.secret_key: str | None = u.password or secret_key
self.session_token: str | None = session_token
self.keyname: str = u.path[1:] # remove first "/"
self.acl: Optional[str] = acl
self.endpoint_url: Optional[str] = endpoint_url
self.region_name: Optional[str] = region_name
self.acl: str | None = acl
self.endpoint_url: str | None = endpoint_url
self.region_name: str | None = region_name
# It can be either botocore.client.BaseClient or mypy_boto3_s3.S3Client,
# there seems to be no good way to infer it statically.
self.s3_client: Any
@ -279,7 +279,7 @@ class S3FeedStorage(BlockingFeedStorage):
crawler: Crawler,
uri: str,
*,
feed_options: Optional[dict[str, Any]] = None,
feed_options: dict[str, Any] | None = None,
) -> Self:
return build_storage(
cls,
@ -310,9 +310,9 @@ class S3FeedStorage(BlockingFeedStorage):
class GCSFeedStorage(BlockingFeedStorage):
def __init__(self, uri: str, project_id: Optional[str], acl: Optional[str]):
self.project_id: Optional[str] = project_id
self.acl: Optional[str] = acl
def __init__(self, uri: str, project_id: str | None, acl: str | None):
self.project_id: str | None = project_id
self.acl: str | None = acl
u = urlparse(uri)
assert u.hostname
self.bucket_name: str = u.hostname
@ -342,7 +342,7 @@ class FTPFeedStorage(BlockingFeedStorage):
uri: str,
use_active_mode: bool = False,
*,
feed_options: Optional[dict[str, Any]] = None,
feed_options: dict[str, Any] | None = None,
):
u = urlparse(uri)
if not u.hostname:
@ -361,7 +361,7 @@ class FTPFeedStorage(BlockingFeedStorage):
crawler: Crawler,
uri: str,
*,
feed_options: Optional[dict[str, Any]] = None,
feed_options: dict[str, Any] | None = None,
) -> Self:
return build_storage(
cls,
@ -399,8 +399,8 @@ class FeedSlot:
settings: BaseSettings,
crawler: Crawler,
):
self.file: Optional[IO[bytes]] = None
self.exporter: Optional[BaseItemExporter] = None
self.file: IO[bytes] | None = None
self.exporter: BaseItemExporter | None = None
self.storage: FeedStorageProtocol = storage
# feed params
self.batch_id: int = batch_id
@ -558,7 +558,7 @@ class FeedExporter:
self.crawler.signals.send_catch_log_deferred(signals.feed_exporter_closed)
)
def _close_slot(self, slot: FeedSlot, spider: Spider) -> Optional[Deferred[None]]:
def _close_slot(self, slot: FeedSlot, spider: Spider) -> Deferred[None] | None:
def get_file(slot_: FeedSlot) -> IO[bytes]:
assert slot_.file
if isinstance(slot_.file, PostProcessingManager):
@ -770,8 +770,8 @@ class FeedExporter:
def _get_uri_params(
self,
spider: Spider,
uri_params_function: Union[str, UriParamsCallableT, None],
slot: Optional[FeedSlot] = None,
uri_params_function: str | UriParamsCallableT | None,
slot: FeedSlot | None = None,
) -> dict[str, Any]:
params = {}
for k in dir(spider):
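``_get_uri_params`` above resolves the printf-style placeholders allowed in feed URIs, drawing on spider attributes plus the current time. A hedged example of a FEEDS entry that relies on them; the path and options are illustrative:

    # settings.py (illustrative)
    FEEDS = {
        # %(name)s expands to the spider name and %(time)s to a timestamp taken
        # when the feed is created, both filled in from _get_uri_params().
        "exports/%(name)s/%(time)s.json": {
            "format": "json",
            "encoding": "utf8",
            "overwrite": True,
        },
    }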

View File

@ -9,7 +9,7 @@ from importlib import import_module
from pathlib import Path
from time import time
from types import ModuleType
from typing import IO, TYPE_CHECKING, Any, Optional, Union, cast
from typing import IO, TYPE_CHECKING, Any, cast
from weakref import WeakKeyDictionary
from w3lib.http import headers_dict_to_raw, headers_raw_to_dict
@ -66,16 +66,14 @@ class RFC2616Policy:
self.always_store: bool = settings.getbool("HTTPCACHE_ALWAYS_STORE")
self.ignore_schemes: list[str] = settings.getlist("HTTPCACHE_IGNORE_SCHEMES")
self._cc_parsed: WeakKeyDictionary[
Union[Request, Response], dict[bytes, Optional[bytes]]
Request | Response, dict[bytes, bytes | None]
] = WeakKeyDictionary()
self.ignore_response_cache_controls: list[bytes] = [
to_bytes(cc)
for cc in settings.getlist("HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS")
]
def _parse_cachecontrol(
self, r: Union[Request, Response]
) -> dict[bytes, Optional[bytes]]:
def _parse_cachecontrol(self, r: Request | Response) -> dict[bytes, bytes | None]:
if r not in self._cc_parsed:
cch = r.headers.get(b"Cache-Control", b"")
assert cch is not None
@ -191,7 +189,7 @@ class RFC2616Policy:
if b"ETag" in cachedresponse.headers:
request.headers[b"If-None-Match"] = cachedresponse.headers[b"ETag"]
def _get_max_age(self, cc: dict[bytes, Optional[bytes]]) -> Optional[int]:
def _get_max_age(self, cc: dict[bytes, bytes | None]) -> int | None:
try:
return max(0, int(cc[b"max-age"])) # type: ignore[arg-type]
except (KeyError, ValueError):
@ -275,7 +273,7 @@ class DbmCacheStorage:
def close_spider(self, spider: Spider) -> None:
self.db.close()
def retrieve_response(self, spider: Spider, request: Request) -> Optional[Response]:
def retrieve_response(self, spider: Spider, request: Request) -> Response | None:
data = self._read_data(spider, request)
if data is None:
return None # not cached
@ -300,7 +298,7 @@ class DbmCacheStorage:
self.db[f"{key}_data"] = pickle.dumps(data, protocol=4)
self.db[f"{key}_time"] = str(time())
def _read_data(self, spider: Spider, request: Request) -> Optional[dict[str, Any]]:
def _read_data(self, spider: Spider, request: Request) -> dict[str, Any] | None:
key = self._fingerprinter.fingerprint(request).hex()
db = self.db
tkey = f"{key}_time"
@ -320,9 +318,7 @@ class FilesystemCacheStorage:
self.expiration_secs: int = settings.getint("HTTPCACHE_EXPIRATION_SECS")
self.use_gzip: bool = settings.getbool("HTTPCACHE_GZIP")
# https://github.com/python/mypy/issues/10740
self._open: Callable[
Concatenate[Union[str, os.PathLike], str, ...], IO[bytes]
] = (
self._open: Callable[Concatenate[str | os.PathLike, str, ...], IO[bytes]] = (
gzip.open if self.use_gzip else open # type: ignore[assignment]
)
@ -339,7 +335,7 @@ class FilesystemCacheStorage:
def close_spider(self, spider: Spider) -> None:
pass
def retrieve_response(self, spider: Spider, request: Request) -> Optional[Response]:
def retrieve_response(self, spider: Spider, request: Request) -> Response | None:
"""Return response if present in cache, or None otherwise."""
metadata = self._read_meta(spider, request)
if metadata is None:
@ -387,7 +383,7 @@ class FilesystemCacheStorage:
key = self._fingerprinter.fingerprint(request).hex()
return str(Path(self.cachedir, spider.name, key[0:2], key))
def _read_meta(self, spider: Spider, request: Request) -> Optional[dict[str, Any]]:
def _read_meta(self, spider: Spider, request: Request) -> dict[str, Any] | None:
rpath = Path(self._get_request_path(spider, request))
metapath = rpath / "pickled_meta"
if not metapath.exists():
@ -399,7 +395,7 @@ class FilesystemCacheStorage:
return cast(dict[str, Any], pickle.load(f)) # nosec
def parse_cachecontrol(header: bytes) -> dict[bytes, Optional[bytes]]:
def parse_cachecontrol(header: bytes) -> dict[bytes, bytes | None]:
"""Parse Cache-Control header
https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9
@ -419,7 +415,7 @@ def parse_cachecontrol(header: bytes) -> dict[bytes, Optional[bytes]]:
return directives
def rfc1123_to_epoch(date_str: Union[str, bytes, None]) -> Optional[int]:
def rfc1123_to_epoch(date_str: str | bytes | None) -> int | None:
try:
date_str = to_unicode(date_str, encoding="ascii") # type: ignore[arg-type]
return mktime_tz(parsedate_tz(date_str)) # type: ignore[arg-type]
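``parse_cachecontrol`` above returns a bytes-keyed mapping in which valueless directives map to ``None``. A small sketch of what that looks like; the header value is illustrative:

    from scrapy.extensions.httpcache import parse_cachecontrol

    directives = parse_cachecontrol(b"public, max-age=3600")
    # {b"public": None, b"max-age": b"3600"}
    assert directives[b"public"] is None
    assert directives[b"max-age"] == b"3600"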

View File

@ -1,7 +1,7 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Optional, Union
from typing import TYPE_CHECKING
from twisted.internet import task
@ -29,7 +29,7 @@ class LogStats:
self.stats: StatsCollector = stats
self.interval: float = interval
self.multiplier: float = 60.0 / self.interval
self.task: Optional[task.LoopingCall] = None
self.task: task.LoopingCall | None = None
@classmethod
def from_crawler(cls, crawler: Crawler) -> Self:
@ -81,7 +81,7 @@ class LogStats:
def calculate_final_stats(
self, spider: Spider
) -> Union[tuple[None, None], tuple[float, float]]:
) -> tuple[None, None] | tuple[float, float]:
start_time = self.stats.get_value("start_time")
finished_time = self.stats.get_value("finished_time")

View File

@ -3,7 +3,7 @@ from __future__ import annotations
import logging
from datetime import datetime, timezone
from json import JSONEncoder
from typing import TYPE_CHECKING, Any, Optional, Union
from typing import TYPE_CHECKING, Any
from twisted.internet import task
@ -36,7 +36,7 @@ class PeriodicLog:
self.stats: StatsCollector = stats
self.interval: float = interval
self.multiplier: float = 60.0 / self.interval
self.task: Optional[task.LoopingCall] = None
self.task: task.LoopingCall | None = None
self.encoder: JSONEncoder = ScrapyJSONEncoder(sort_keys=True, indent=4)
self.ext_stats_enabled: bool = bool(ext_stats)
self.ext_stats_include: list[str] = ext_stats.get("include", [])
@ -52,7 +52,7 @@ class PeriodicLog:
if not interval:
raise NotConfigured
try:
ext_stats: Optional[dict[str, Any]] = crawler.settings.getdict(
ext_stats: dict[str, Any] | None = crawler.settings.getdict(
"PERIODIC_LOG_STATS"
)
except (TypeError, ValueError):
@ -62,7 +62,7 @@ class PeriodicLog:
else None
)
try:
ext_delta: Optional[dict[str, Any]] = crawler.settings.getdict(
ext_delta: dict[str, Any] | None = crawler.settings.getdict(
"PERIODIC_LOG_DELTA"
)
except (TypeError, ValueError):
@ -93,8 +93,8 @@ class PeriodicLog:
def spider_opened(self, spider: Spider) -> None:
self.time_prev: datetime = datetime.now(tz=timezone.utc)
self.delta_prev: dict[str, Union[int, float]] = {}
self.stats_prev: dict[str, Union[int, float]] = {}
self.delta_prev: dict[str, int | float] = {}
self.stats_prev: dict[str, int | float] = {}
self.task = task.LoopingCall(self.log)
self.task.start(self.interval)
@ -110,7 +110,7 @@ class PeriodicLog:
logger.info(self.encoder.encode(data))
def log_delta(self) -> dict[str, Any]:
num_stats: dict[str, Union[int, float]] = {
num_stats: dict[str, int | float] = {
k: v
for k, v in self.stats._stats.items()
if isinstance(v, (int, float))

View File

@ -2,7 +2,7 @@ from __future__ import annotations
import pickle # nosec
from pathlib import Path
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING
from scrapy import Spider, signals
from scrapy.exceptions import NotConfigured
@ -18,8 +18,8 @@ if TYPE_CHECKING:
class SpiderState:
"""Store and load spider state during a scraping job"""
def __init__(self, jobdir: Optional[str] = None):
self.jobdir: Optional[str] = jobdir
def __init__(self, jobdir: str | None = None):
self.jobdir: str | None = jobdir
@classmethod
def from_crawler(cls, crawler: Crawler) -> Self:

View File

@ -6,7 +6,7 @@ Use STATSMAILER_RCPTS setting to enable and give the recipient mail address
from __future__ import annotations
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING
from scrapy import Spider, signals
from scrapy.exceptions import NotConfigured
@ -39,7 +39,7 @@ class StatsMailer:
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
return o
def spider_closed(self, spider: Spider) -> Optional[Deferred[None]]:
def spider_closed(self, spider: Spider) -> Deferred[None] | None:
spider_stats = self.stats.get_stats(spider)
body = "Global stats\n\n"
body += "\n".join(f"{k:<50} : {v}" for k, v in self.stats.get_stats().items())

View File

@ -1,7 +1,7 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING
from scrapy import Request, Spider, signals
from scrapy.exceptions import NotConfigured
@ -90,8 +90,8 @@ class AutoThrottle:
def _get_slot(
self, request: Request, spider: Spider
) -> tuple[Optional[str], Optional[Slot]]:
key: Optional[str] = request.meta.get("download_slot")
) -> tuple[str | None, Slot | None]:
key: str | None = request.meta.get("download_slot")
if key is None:
return None, None
assert self.crawler.engine

View File

@ -5,7 +5,7 @@ import time
from http.cookiejar import Cookie
from http.cookiejar import CookieJar as _CookieJar
from http.cookiejar import CookiePolicy, DefaultCookiePolicy
from typing import TYPE_CHECKING, Any, Optional, cast
from typing import TYPE_CHECKING, Any, cast
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_unicode
@ -28,7 +28,7 @@ IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
class CookieJar:
def __init__(
self,
policy: Optional[CookiePolicy] = None,
policy: CookiePolicy | None = None,
check_expired_frequency: int = 10000,
):
self.policy: CookiePolicy = policy or DefaultCookiePolicy()
@ -83,9 +83,9 @@ class CookieJar:
def clear(
self,
domain: Optional[str] = None,
path: Optional[str] = None,
name: Optional[str] = None,
domain: str | None = None,
path: str | None = None,
name: str | None = None,
) -> None:
self.jar.clear(domain, path, name)
@ -188,7 +188,7 @@ class WrappedRequest:
def has_header(self, name: str) -> bool:
return name in self.request.headers
def get_header(self, name: str, default: Optional[str] = None) -> Optional[str]:
def get_header(self, name: str, default: str | None = None) -> str | None:
value = self.request.headers.get(name, default)
return to_unicode(value, errors="replace") if value is not None else None

View File

@ -1,7 +1,7 @@
from __future__ import annotations
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, AnyStr, Optional, Union, cast
from typing import TYPE_CHECKING, Any, AnyStr, Union, cast
from w3lib.http import headers_dict_to_raw
@ -25,14 +25,14 @@ class Headers(CaselessDict):
def __init__(
self,
seq: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
seq: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]] | None = None,
encoding: str = "utf-8",
):
self.encoding: str = encoding
super().__init__(seq)
def update( # type: ignore[override]
self, seq: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]]]
self, seq: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]]
) -> None:
seq = seq.items() if isinstance(seq, Mapping) else seq
iseq: dict[bytes, list[bytes]] = {}
@ -44,7 +44,7 @@ class Headers(CaselessDict):
"""Normalize key to bytes"""
return self._tobytes(key.title())
def normvalue(self, value: Union[_RawValueT, Iterable[_RawValueT]]) -> list[bytes]:
def normvalue(self, value: _RawValueT | Iterable[_RawValueT]) -> list[bytes]:
"""Normalize values to bytes"""
_value: Iterable[_RawValueT]
if value is None:
@ -67,13 +67,13 @@ class Headers(CaselessDict):
return str(x).encode(self.encoding)
raise TypeError(f"Unsupported value type: {type(x)}")
def __getitem__(self, key: AnyStr) -> Optional[bytes]:
def __getitem__(self, key: AnyStr) -> bytes | None:
try:
return cast(list[bytes], super().__getitem__(key))[-1]
except IndexError:
return None
def get(self, key: AnyStr, def_val: Any = None) -> Optional[bytes]:
def get(self, key: AnyStr, def_val: Any = None) -> bytes | None:
try:
return cast(list[bytes], super().get(key, def_val))[-1]
except IndexError:
@ -103,7 +103,7 @@ class Headers(CaselessDict):
def items(self) -> Iterable[tuple[bytes, list[bytes]]]: # type: ignore[override]
return ((k, self.getlist(k)) for k in self.keys())
def values(self) -> list[Optional[bytes]]: # type: ignore[override]
def values(self) -> list[bytes | None]: # type: ignore[override]
return [
self[k] for k in self.keys() # pylint: disable=consider-using-dict-items
]
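The ``Headers`` mapping above stores every header as a list of byte strings and hands back the last value from ``__getitem__``/``get``. A brief hedged sketch:

    from scrapy.http.headers import Headers

    h = Headers({"Content-Type": "text/html"}, encoding="utf-8")

    assert h["Content-Type"] == b"text/html"            # values are normalized to bytes
    assert h.getlist("Content-Type") == [b"text/html"]  # stored internally as list[bytes]
    assert h.get("Missing") is None                     # absent keys fall back to None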

View File

@ -13,7 +13,6 @@ from typing import (
Any,
AnyStr,
NoReturn,
Optional,
TypedDict,
TypeVar,
Union,
@ -112,18 +111,18 @@ class Request(object_ref):
def __init__(
self,
url: str,
callback: Optional[CallbackT] = None,
callback: CallbackT | None = None,
method: str = "GET",
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
body: Optional[Union[bytes, str]] = None,
cookies: Optional[CookiesT] = None,
meta: Optional[dict[str, Any]] = None,
headers: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]] | None = None,
body: bytes | str | None = None,
cookies: CookiesT | None = None,
meta: dict[str, Any] | None = None,
encoding: str = "utf-8",
priority: int = 0,
dont_filter: bool = False,
errback: Optional[Callable[[Failure], Any]] = None,
flags: Optional[list[str]] = None,
cb_kwargs: Optional[dict[str, Any]] = None,
errback: Callable[[Failure], Any] | None = None,
flags: list[str] | None = None,
cb_kwargs: dict[str, Any] | None = None,
) -> None:
self._encoding: str = encoding # this one has to be set first
self.method: str = str(method).upper()
@ -139,17 +138,15 @@ class Request(object_ref):
)
if not (callable(errback) or errback is None):
raise TypeError(f"errback must be a callable, got {type(errback).__name__}")
self.callback: Optional[CallbackT] = callback
self.errback: Optional[Callable[[Failure], Any]] = errback
self.callback: CallbackT | None = callback
self.errback: Callable[[Failure], Any] | None = errback
self.cookies: CookiesT = cookies or {}
self.headers: Headers = Headers(headers or {}, encoding=encoding)
self.dont_filter: bool = dont_filter
self._meta: Optional[dict[str, Any]] = dict(meta) if meta else None
self._cb_kwargs: Optional[dict[str, Any]] = (
dict(cb_kwargs) if cb_kwargs else None
)
self._meta: dict[str, Any] | None = dict(meta) if meta else None
self._cb_kwargs: dict[str, Any] | None = dict(cb_kwargs) if cb_kwargs else None
self.flags: list[str] = [] if flags is None else list(flags)
@property
@ -186,7 +183,7 @@ class Request(object_ref):
def body(self) -> bytes:
return self._body
def _set_body(self, body: Optional[Union[str, bytes]]) -> None:
def _set_body(self, body: str | bytes | None) -> None:
self._body = b"" if body is None else to_bytes(body, self.encoding)
@property
@ -208,7 +205,7 @@ class Request(object_ref):
def replace(self, *args: Any, cls: None = None, **kwargs: Any) -> Self: ...
def replace(
self, *args: Any, cls: Optional[type[Request]] = None, **kwargs: Any
self, *args: Any, cls: type[Request] | None = None, **kwargs: Any
) -> Request:
"""Create a new Request with the same attributes except for those given new values"""
for x in self.attributes:
@ -255,7 +252,7 @@ class Request(object_ref):
request_kwargs.update(kwargs)
return cls(**request_kwargs)
def to_dict(self, *, spider: Optional[scrapy.Spider] = None) -> dict[str, Any]:
def to_dict(self, *, spider: scrapy.Spider | None = None) -> dict[str, Any]:
"""Return a dictionary containing the Request's data.
Use :func:`~scrapy.utils.request.request_from_dict` to convert back into a :class:`~scrapy.Request` object.
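``to_dict`` pairs with the ``scrapy.utils.request.request_from_dict`` helper mentioned in the docstring above. A hedged round-trip sketch with an illustrative URL:

    from scrapy import Request
    from scrapy.utils.request import request_from_dict

    r = Request("https://example.com", meta={"depth": 1}, priority=10)
    d = r.to_dict()            # a plain, serializable dict
    r2 = request_from_dict(d)  # pass spider=... when callback/errback are spider methods

    assert (r2.url, r2.meta, r2.priority) == (r.url, r.meta, r.priority)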

View File

@ -62,14 +62,14 @@ class FormRequest(Request):
def from_response(
cls,
response: TextResponse,
formname: Optional[str] = None,
formid: Optional[str] = None,
formname: str | None = None,
formid: str | None = None,
formnumber: int = 0,
formdata: FormdataType = None,
clickdata: Optional[dict[str, Union[str, int]]] = None,
clickdata: dict[str, str | int] | None = None,
dont_click: bool = False,
formxpath: Optional[str] = None,
formcss: Optional[str] = None,
formxpath: str | None = None,
formcss: str | None = None,
**kwargs: Any,
) -> Self:
kwargs.setdefault("encoding", response.encoding)
@ -92,7 +92,7 @@ class FormRequest(Request):
return cls(url=url, method=method, formdata=formdata, **kwargs)
def _get_form_url(form: FormElement, url: Optional[str]) -> str:
def _get_form_url(form: FormElement, url: str | None) -> str:
assert form.base_url is not None # typing
if url is None:
action = form.get("action")
@ -113,10 +113,10 @@ def _urlencode(seq: Iterable[FormdataKVType], enc: str) -> str:
def _get_form(
response: TextResponse,
formname: Optional[str],
formid: Optional[str],
formname: str | None,
formid: str | None,
formnumber: int,
formxpath: Optional[str],
formxpath: str | None,
) -> FormElement:
"""Find the wanted form element within the given response."""
root = response.selector.root
@ -160,7 +160,7 @@ def _get_inputs(
form: FormElement,
formdata: FormdataType,
dont_click: bool,
clickdata: Optional[dict[str, Union[str, int]]],
clickdata: dict[str, str | int] | None,
) -> list[FormdataKVType]:
"""Return a list of key-value pairs for the inputs found in the given form."""
try:
@ -196,8 +196,8 @@ def _get_inputs(
def _value(
ele: Union[InputElement, SelectElement, TextareaElement]
) -> tuple[Optional[str], Union[None, str, MultipleSelectOptions]]:
ele: InputElement | SelectElement | TextareaElement,
) -> tuple[str | None, None | str | MultipleSelectOptions]:
n = ele.name
v = ele.value
if ele.tag == "select":
@ -206,8 +206,8 @@ def _value(
def _select_value(
ele: SelectElement, n: Optional[str], v: Union[None, str, MultipleSelectOptions]
) -> tuple[Optional[str], Union[None, str, MultipleSelectOptions]]:
ele: SelectElement, n: str | None, v: None | str | MultipleSelectOptions
) -> tuple[str | None, None | str | MultipleSelectOptions]:
multiple = ele.multiple
if v is None and not multiple:
# Match browser behaviour on simple select tag without options selected
@ -218,8 +218,8 @@ def _select_value(
def _get_clickable(
clickdata: Optional[dict[str, Union[str, int]]], form: FormElement
) -> Optional[tuple[str, str]]:
clickdata: dict[str, str | int] | None, form: FormElement
) -> tuple[str, str] | None:
"""
Returns the clickable element specified in clickdata,
if the latter is given. If not, it returns the first
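A hedged sketch of ``from_response`` as used from a spider callback; the login URL and form field names are hypothetical:

    import scrapy

    class LoginSpider(scrapy.Spider):
        name = "login"
        start_urls = ["https://example.com/login"]

        def parse(self, response):
            # Pre-fills the fields found in the page's <form>; formdata overrides them.
            yield scrapy.FormRequest.from_response(
                response,
                formdata={"username": "john", "password": "secret"},
                callback=self.after_login,
            )

        def after_login(self, response):
            self.logger.info("logged in: %s", response.url)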

View File

@ -10,7 +10,7 @@ from __future__ import annotations
import copy
import json
import warnings
from typing import TYPE_CHECKING, Any, Optional, overload
from typing import TYPE_CHECKING, Any, overload
from scrapy.http.request import Request, RequestTypeVar
@ -23,7 +23,7 @@ class JsonRequest(Request):
attributes: tuple[str, ...] = Request.attributes + ("dumps_kwargs",)
def __init__(
self, *args: Any, dumps_kwargs: Optional[dict[str, Any]] = None, **kwargs: Any
self, *args: Any, dumps_kwargs: dict[str, Any] | None = None, **kwargs: Any
) -> None:
dumps_kwargs = copy.deepcopy(dumps_kwargs) if dumps_kwargs is not None else {}
dumps_kwargs.setdefault("sort_keys", True)
@ -59,7 +59,7 @@ class JsonRequest(Request):
def replace(self, *args: Any, cls: None = None, **kwargs: Any) -> Self: ...
def replace(
self, *args: Any, cls: Optional[type[Request]] = None, **kwargs: Any
self, *args: Any, cls: type[Request] | None = None, **kwargs: Any
) -> Request:
body_passed = kwargs.get("body", None) is not None
data: Any = kwargs.pop("data", None)

View File

@ -5,8 +5,10 @@ This module implements the XmlRpcRequest class which is a more convenient class
See documentation in docs/topics/request-response.rst
"""
from __future__ import annotations
import xmlrpc.client as xmlrpclib
from typing import Any, Optional
from typing import Any
import defusedxml.xmlrpc
@ -19,7 +21,7 @@ DUMPS_ARGS = get_func_args(xmlrpclib.dumps)
class XmlRpcRequest(Request):
def __init__(self, *args: Any, encoding: Optional[str] = None, **kwargs: Any):
def __init__(self, *args: Any, encoding: str | None = None, **kwargs: Any):
if "body" not in kwargs and "params" in kwargs:
kw = {k: kwargs.pop(k) for k in DUMPS_ARGS if k in kwargs}
kwargs["body"] = xmlrpclib.dumps(**kw)

View File

@ -7,7 +7,7 @@ See documentation in docs/topics/request-response.rst
from __future__ import annotations
from typing import TYPE_CHECKING, Any, AnyStr, Optional, TypeVar, Union, overload
from typing import TYPE_CHECKING, Any, AnyStr, TypeVar, overload
from urllib.parse import urljoin
from scrapy.exceptions import NotSupported
@ -60,23 +60,23 @@ class Response(object_ref):
self,
url: str,
status: int = 200,
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
headers: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]] | None = None,
body: bytes = b"",
flags: Optional[list[str]] = None,
request: Optional[Request] = None,
certificate: Optional[Certificate] = None,
ip_address: Union[IPv4Address, IPv6Address, None] = None,
protocol: Optional[str] = None,
flags: list[str] | None = None,
request: Request | None = None,
certificate: Certificate | None = None,
ip_address: IPv4Address | IPv6Address | None = None,
protocol: str | None = None,
):
self.headers: Headers = Headers(headers or {})
self.status: int = int(status)
self._set_body(body)
self._set_url(url)
self.request: Optional[Request] = request
self.request: Request | None = request
self.flags: list[str] = [] if flags is None else list(flags)
self.certificate: Optional[Certificate] = certificate
self.ip_address: Union[IPv4Address, IPv6Address, None] = ip_address
self.protocol: Optional[str] = protocol
self.certificate: Certificate | None = certificate
self.ip_address: IPv4Address | IPv6Address | None = ip_address
self.protocol: str | None = protocol
@property
def cb_kwargs(self) -> dict[str, Any]:
@ -114,7 +114,7 @@ class Response(object_ref):
def body(self) -> bytes:
return self._body
def _set_body(self, body: Optional[bytes]) -> None:
def _set_body(self, body: bytes | None) -> None:
if body is None:
self._body = b""
elif not isinstance(body, bytes):
@ -142,7 +142,7 @@ class Response(object_ref):
def replace(self, *args: Any, cls: None = None, **kwargs: Any) -> Self: ...
def replace(
self, *args: Any, cls: Optional[type[Response]] = None, **kwargs: Any
self, *args: Any, cls: type[Response] | None = None, **kwargs: Any
) -> Response:
"""Create a new Response with the same attributes except for those given new values"""
for x in self.attributes:
@ -183,19 +183,19 @@ class Response(object_ref):
def follow(
self,
url: Union[str, Link],
callback: Optional[CallbackT] = None,
url: str | Link,
callback: CallbackT | None = None,
method: str = "GET",
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
body: Optional[Union[bytes, str]] = None,
cookies: Optional[CookiesT] = None,
meta: Optional[dict[str, Any]] = None,
encoding: Optional[str] = "utf-8",
headers: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]] | None = None,
body: bytes | str | None = None,
cookies: CookiesT | None = None,
meta: dict[str, Any] | None = None,
encoding: str | None = "utf-8",
priority: int = 0,
dont_filter: bool = False,
errback: Optional[Callable[[Failure], Any]] = None,
cb_kwargs: Optional[dict[str, Any]] = None,
flags: Optional[list[str]] = None,
errback: Callable[[Failure], Any] | None = None,
cb_kwargs: dict[str, Any] | None = None,
flags: list[str] | None = None,
) -> Request:
"""
Return a :class:`~.Request` instance to follow a link ``url``.
@ -236,19 +236,19 @@ class Response(object_ref):
def follow_all(
self,
urls: Iterable[Union[str, Link]],
callback: Optional[CallbackT] = None,
urls: Iterable[str | Link],
callback: CallbackT | None = None,
method: str = "GET",
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
body: Optional[Union[bytes, str]] = None,
cookies: Optional[CookiesT] = None,
meta: Optional[dict[str, Any]] = None,
encoding: Optional[str] = "utf-8",
headers: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]] | None = None,
body: bytes | str | None = None,
cookies: CookiesT | None = None,
meta: dict[str, Any] | None = None,
encoding: str | None = "utf-8",
priority: int = 0,
dont_filter: bool = False,
errback: Optional[Callable[[Failure], Any]] = None,
cb_kwargs: Optional[dict[str, Any]] = None,
flags: Optional[list[str]] = None,
errback: Callable[[Failure], Any] | None = None,
cb_kwargs: dict[str, Any] | None = None,
flags: list[str] | None = None,
) -> Iterable[Request]:
"""
.. versionadded:: 2.0
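A hedged sketch of ``follow``/``follow_all`` from a callback; the URLs and CSS selector are illustrative:

    import scrapy

    class PagerSpider(scrapy.Spider):
        name = "pager"  # hypothetical

        def parse(self, response):
            # follow() accepts absolute or relative URL strings (and Link objects).
            yield response.follow("page/2", callback=self.parse)
            # On a TextResponse, follow_all() also takes css=/xpath= shortcuts
            # (see the TextResponse.follow_all signature further below).
            yield from response.follow_all(css="ul.pager a", callback=self.parse_item)

        def parse_item(self, response):
            yield {"url": response.url}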

View File

@ -8,9 +8,8 @@ See documentation in docs/topics/request-response.rst
from __future__ import annotations
import json
from collections.abc import Iterable
from contextlib import suppress
from typing import TYPE_CHECKING, Any, AnyStr, Optional, Union, cast
from typing import TYPE_CHECKING, Any, AnyStr, cast
from urllib.parse import urljoin
import parsel
@ -24,16 +23,16 @@ from w3lib.encoding import (
from w3lib.html import strip_html5_whitespace
from scrapy.http.response import Response
from scrapy.link import Link
from scrapy.utils.python import memoizemethod_noargs, to_unicode
from scrapy.utils.response import get_base_url
if TYPE_CHECKING:
from collections.abc import Callable, Mapping
from collections.abc import Callable, Iterable, Mapping
from twisted.python.failure import Failure
from scrapy.http.request import CallbackT, CookiesT, Request
from scrapy.link import Link
from scrapy.selector import Selector, SelectorList
@ -47,13 +46,13 @@ class TextResponse(Response):
attributes: tuple[str, ...] = Response.attributes + ("encoding",)
def __init__(self, *args: Any, **kwargs: Any):
self._encoding: Optional[str] = kwargs.pop("encoding", None)
self._cached_benc: Optional[str] = None
self._cached_ubody: Optional[str] = None
self._cached_selector: Optional[Selector] = None
self._encoding: str | None = kwargs.pop("encoding", None)
self._cached_benc: str | None = None
self._cached_ubody: str | None = None
self._cached_selector: Selector | None = None
super().__init__(*args, **kwargs)
def _set_body(self, body: Union[str, bytes, None]) -> None:
def _set_body(self, body: str | bytes | None) -> None:
self._body: bytes = b"" # used by encoding detection
if isinstance(body, str):
if self._encoding is None:
@ -69,7 +68,7 @@ class TextResponse(Response):
def encoding(self) -> str:
return self._declared_encoding() or self._body_inferred_encoding()
def _declared_encoding(self) -> Optional[str]:
def _declared_encoding(self) -> str | None:
return (
self._encoding
or self._bom_encoding()
@ -104,7 +103,7 @@ class TextResponse(Response):
return urljoin(get_base_url(self), url)
@memoizemethod_noargs
def _headers_encoding(self) -> Optional[str]:
def _headers_encoding(self) -> str | None:
content_type = cast(bytes, self.headers.get(b"Content-Type", b""))
return http_content_type_encoding(to_unicode(content_type, encoding="latin-1"))
@ -123,7 +122,7 @@ class TextResponse(Response):
self._cached_ubody = ubody
return self._cached_benc
def _auto_detect_fun(self, text: bytes) -> Optional[str]:
def _auto_detect_fun(self, text: bytes) -> str | None:
for enc in (self._DEFAULT_ENCODING, "utf-8", "cp1252"):
try:
text.decode(enc)
@ -133,11 +132,11 @@ class TextResponse(Response):
return None
@memoizemethod_noargs
def _body_declared_encoding(self) -> Optional[str]:
def _body_declared_encoding(self) -> str | None:
return html_body_declared_encoding(self.body)
@memoizemethod_noargs
def _bom_encoding(self) -> Optional[str]:
def _bom_encoding(self) -> str | None:
return read_bom(self.body)[0]
@property
@ -170,19 +169,19 @@ class TextResponse(Response):
def follow(
self,
url: Union[str, Link, parsel.Selector],
callback: Optional[CallbackT] = None,
url: str | Link | parsel.Selector,
callback: CallbackT | None = None,
method: str = "GET",
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
body: Optional[Union[bytes, str]] = None,
cookies: Optional[CookiesT] = None,
meta: Optional[dict[str, Any]] = None,
encoding: Optional[str] = None,
headers: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]] | None = None,
body: bytes | str | None = None,
cookies: CookiesT | None = None,
meta: dict[str, Any] | None = None,
encoding: str | None = None,
priority: int = 0,
dont_filter: bool = False,
errback: Optional[Callable[[Failure], Any]] = None,
cb_kwargs: Optional[dict[str, Any]] = None,
flags: Optional[list[str]] = None,
errback: Callable[[Failure], Any] | None = None,
cb_kwargs: dict[str, Any] | None = None,
flags: list[str] | None = None,
) -> Request:
"""
Return a :class:`~.Request` instance to follow a link ``url``.
@ -223,21 +222,21 @@ class TextResponse(Response):
def follow_all(
self,
urls: Union[Iterable[Union[str, Link]], parsel.SelectorList, None] = None,
callback: Optional[CallbackT] = None,
urls: Iterable[str | Link] | parsel.SelectorList | None = None,
callback: CallbackT | None = None,
method: str = "GET",
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
body: Optional[Union[bytes, str]] = None,
cookies: Optional[CookiesT] = None,
meta: Optional[dict[str, Any]] = None,
encoding: Optional[str] = None,
headers: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]] | None = None,
body: bytes | str | None = None,
cookies: CookiesT | None = None,
meta: dict[str, Any] | None = None,
encoding: str | None = None,
priority: int = 0,
dont_filter: bool = False,
errback: Optional[Callable[[Failure], Any]] = None,
cb_kwargs: Optional[dict[str, Any]] = None,
flags: Optional[list[str]] = None,
css: Optional[str] = None,
xpath: Optional[str] = None,
errback: Callable[[Failure], Any] | None = None,
cb_kwargs: dict[str, Any] | None = None,
flags: list[str] | None = None,
css: str | None = None,
xpath: str | None = None,
) -> Iterable[Request]:
"""
A generator that produces :class:`~.Request` instances to follow all
@ -279,7 +278,7 @@ class TextResponse(Response):
with suppress(_InvalidSelector):
urls.append(_url_from_selector(sel))
return super().follow_all(
urls=cast(Iterable[Union[str, Link]], urls),
urls=cast("Iterable[str | Link]", urls),
callback=callback,
method=method,
headers=headers,

View File

@ -9,7 +9,7 @@ import operator
import re
from collections.abc import Callable, Iterable
from functools import partial
from typing import TYPE_CHECKING, Any, Optional, Union, cast
from typing import TYPE_CHECKING, Any, Union, cast
from urllib.parse import urljoin, urlparse
from lxml import etree # nosec
@ -58,9 +58,9 @@ def _canonicalize_link_url(link: Link) -> str:
class LxmlParserLinkExtractor:
def __init__(
self,
tag: Union[str, Callable[[str], bool]] = "a",
attr: Union[str, Callable[[str], bool]] = "href",
process: Optional[Callable[[Any], Any]] = None,
tag: str | Callable[[str], bool] = "a",
attr: str | Callable[[str], bool] = "href",
process: Callable[[Any], Any] | None = None,
unique: bool = False,
strip: bool = True,
canonicalized: bool = False,
@ -166,18 +166,18 @@ class LxmlLinkExtractor:
self,
allow: _RegexOrSeveralT = (),
deny: _RegexOrSeveralT = (),
allow_domains: Union[str, Iterable[str]] = (),
deny_domains: Union[str, Iterable[str]] = (),
restrict_xpaths: Union[str, Iterable[str]] = (),
tags: Union[str, Iterable[str]] = ("a", "area"),
attrs: Union[str, Iterable[str]] = ("href",),
allow_domains: str | Iterable[str] = (),
deny_domains: str | Iterable[str] = (),
restrict_xpaths: str | Iterable[str] = (),
tags: str | Iterable[str] = ("a", "area"),
attrs: str | Iterable[str] = ("href",),
canonicalize: bool = False,
unique: bool = True,
process_value: Optional[Callable[[Any], Any]] = None,
deny_extensions: Union[str, Iterable[str], None] = None,
restrict_css: Union[str, Iterable[str]] = (),
process_value: Callable[[Any], Any] | None = None,
deny_extensions: str | Iterable[str] | None = None,
restrict_css: str | Iterable[str] = (),
strip: bool = True,
restrict_text: Optional[_RegexOrSeveralT] = None,
restrict_text: _RegexOrSeveralT | None = None,
):
tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
self.link_extractor = LxmlParserLinkExtractor(
@ -206,7 +206,7 @@ class LxmlLinkExtractor:
self.restrict_text: list[re.Pattern[str]] = self._compile_regexes(restrict_text)
@staticmethod
def _compile_regexes(value: Optional[_RegexOrSeveralT]) -> list[re.Pattern[str]]:
def _compile_regexes(value: _RegexOrSeveralT | None) -> list[re.Pattern[str]]:
return [
x if isinstance(x, re.Pattern) else re.compile(x)
for x in arg_to_iter(value)
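The typing import in this file keeps Union even though the annotations switched to the | spelling, presumably because a module-level alias such as _RegexOrSeveralT is evaluated at import time, where | between types needs 3.10+. A simplified sketch (the alias body below is invented for illustration, not copied from Scrapy):

from __future__ import annotations

import re
from typing import Union

# Evaluated when the module is imported, so it cannot use | while 3.9 is supported.
RegexOrStr = Union[re.Pattern[str], str]  # simplified stand-in for _RegexOrSeveralT


def compile_pattern(value: RegexOrStr | None) -> re.Pattern[str] | None:
    # The parameter and return annotations are postponed strings,
    # so their | is never executed on 3.9.
    if value is None:
        return None
    return value if isinstance(value, re.Pattern) else re.compile(value)


pattern = compile_pattern("foo.+")
print(pattern.pattern if pattern else None, compile_pattern(None))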

View File

@ -6,7 +6,7 @@ See documentation in docs/topics/loaders.rst
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any
import itemloaders
@ -92,9 +92,9 @@ class ItemLoader(itemloaders.ItemLoader):
def __init__(
self,
item: Any = None,
selector: Optional[Selector] = None,
response: Optional[TextResponse] = None,
parent: Optional[itemloaders.ItemLoader] = None,
selector: Selector | None = None,
response: TextResponse | None = None,
parent: itemloaders.ItemLoader | None = None,
**context: Any,
):
if selector is None and response is not None:

View File

@ -2,7 +2,7 @@ from __future__ import annotations
import logging
import os
from typing import TYPE_CHECKING, Any, Optional, TypedDict, Union
from typing import TYPE_CHECKING, Any, TypedDict
from twisted.python.failure import Failure
@ -31,7 +31,7 @@ DOWNLOADERRORMSG_LONG = "Error downloading %(request)s: %(errmsg)s"
class LogFormatterResult(TypedDict):
level: int
msg: str
args: Union[dict[str, Any], tuple[Any, ...]]
args: dict[str, Any] | tuple[Any, ...]
class LogFormatter:
@ -93,7 +93,7 @@ class LogFormatter:
}
def scraped(
self, item: Any, response: Union[Response, Failure, None], spider: Spider
self, item: Any, response: Response | Failure | None, spider: Spider
) -> LogFormatterResult:
"""Logs a message when an item is scraped by a spider."""
src: Any
@ -116,7 +116,7 @@ class LogFormatter:
self,
item: Any,
exception: BaseException,
response: Optional[Response],
response: Response | None,
spider: Spider,
) -> LogFormatterResult:
"""Logs a message when an item is dropped while it is passing through the item pipeline."""
@ -133,7 +133,7 @@ class LogFormatter:
self,
item: Any,
exception: BaseException,
response: Optional[Response],
response: Response | None,
spider: Spider,
) -> LogFormatterResult:
"""Logs a message when an item causes an error while it is passing
@ -153,7 +153,7 @@ class LogFormatter:
self,
failure: Failure,
request: Request,
response: Union[Response, Failure],
response: Response | Failure,
spider: Spider,
) -> LogFormatterResult:
"""Logs an error message from a spider.
@ -174,7 +174,7 @@ class LogFormatter:
failure: Failure,
request: Request,
spider: Spider,
errmsg: Optional[str] = None,
errmsg: str | None = None,
) -> LogFormatterResult:
"""Logs a download error message from a spider (typically coming from
the engine).
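One more consequence of dropping Optional/Union here: the LogFormatterResult TypedDict near the top of this file now declares args as dict[str, Any] | tuple[Any, ...]. Field annotations in a TypedDict body are not executed when the class is created (they stay postponed), so this still imports on 3.9; only code that re-evaluates the hints, for example via typing.get_type_hints, would notice the new syntax there. A small sketch with an invented stand-in class:

from __future__ import annotations

from typing import Any, TypedDict


class LogEntry(TypedDict):  # illustrative stand-in for LogFormatterResult
    level: int
    msg: str
    args: dict[str, Any] | tuple[Any, ...]


entry: LogEntry = {
    "level": 20,
    "msg": "Crawled %(url)s",
    "args": {"url": "https://example.com"},
}
print(entry["msg"] % entry["args"])  # Crawled https://example.com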

View File

@ -14,7 +14,7 @@ from email.mime.nonmultipart import MIMENonMultipart
from email.mime.text import MIMEText
from email.utils import formatdate
from io import BytesIO
from typing import IO, TYPE_CHECKING, Any, Optional, Union
from typing import IO, TYPE_CHECKING, Any
from twisted import version as twisted_version
from twisted.internet import ssl
@ -45,7 +45,7 @@ logger = logging.getLogger(__name__)
COMMASPACE = ", "
def _to_bytes_or_none(text: Union[str, bytes, None]) -> Optional[bytes]:
def _to_bytes_or_none(text: str | bytes | None) -> bytes | None:
if text is None:
return None
return to_bytes(text)
@ -56,8 +56,8 @@ class MailSender:
self,
smtphost: str = "localhost",
mailfrom: str = "scrapy@localhost",
smtpuser: Optional[str] = None,
smtppass: Optional[str] = None,
smtpuser: str | None = None,
smtppass: str | None = None,
smtpport: int = 25,
smtptls: bool = False,
smtpssl: bool = False,
@ -65,8 +65,8 @@ class MailSender:
):
self.smtphost: str = smtphost
self.smtpport: int = smtpport
self.smtpuser: Optional[bytes] = _to_bytes_or_none(smtpuser)
self.smtppass: Optional[bytes] = _to_bytes_or_none(smtppass)
self.smtpuser: bytes | None = _to_bytes_or_none(smtpuser)
self.smtppass: bytes | None = _to_bytes_or_none(smtppass)
self.smtptls: bool = smtptls
self.smtpssl: bool = smtpssl
self.mailfrom: str = mailfrom
@ -86,15 +86,15 @@ class MailSender:
def send(
self,
to: Union[str, list[str]],
to: str | list[str],
subject: str,
body: str,
cc: Union[str, list[str], None] = None,
cc: str | list[str] | None = None,
attachs: Sequence[tuple[str, str, IO[Any]]] = (),
mimetype: str = "text/plain",
charset: Optional[str] = None,
_callback: Optional[Callable[..., None]] = None,
) -> Optional[Deferred[None]]:
charset: str | None = None,
_callback: Callable[..., None] | None = None,
) -> Deferred[None] | None:
from twisted.internet import reactor
msg: MIMEBase

View File

@ -3,7 +3,7 @@ from __future__ import annotations
import logging
import pprint
from collections import defaultdict, deque
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
from typing import TYPE_CHECKING, Any, TypeVar, cast
from scrapy.exceptions import NotConfigured
from scrapy.utils.defer import process_chain, process_parallel
@ -40,9 +40,9 @@ class MiddlewareManager:
self.middlewares = middlewares
# Only process_spider_output and process_spider_exception can be None.
# Only process_spider_output can be a tuple, and only until _async compatibility methods are removed.
self.methods: dict[
str, deque[Union[None, Callable, tuple[Callable, Callable]]]
] = defaultdict(deque)
self.methods: dict[str, deque[None | Callable | tuple[Callable, Callable]]] = (
defaultdict(deque)
)
for mw in middlewares:
self._add_middleware(mw)
@ -51,9 +51,7 @@ class MiddlewareManager:
raise NotImplementedError
@classmethod
def from_settings(
cls, settings: Settings, crawler: Optional[Crawler] = None
) -> Self:
def from_settings(cls, settings: Settings, crawler: Crawler | None = None) -> Self:
mwlist = cls._get_mwlist_from_settings(settings)
middlewares = []
enabled = []

View File

@ -17,17 +17,7 @@ from contextlib import suppress
from ftplib import FTP
from io import BytesIO
from pathlib import Path
from typing import (
IO,
TYPE_CHECKING,
Any,
NoReturn,
Optional,
Protocol,
TypedDict,
Union,
cast,
)
from typing import IO, TYPE_CHECKING, Any, NoReturn, Protocol, TypedDict, cast
from urllib.parse import urlparse
from itemadapter import ItemAdapter
@ -61,7 +51,7 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
def _to_string(path: Union[str, PathLike[str]]) -> str:
def _to_string(path: str | PathLike[str]) -> str:
return str(path) # convert a Path object to string
@ -99,17 +89,17 @@ class FilesStoreProtocol(Protocol):
path: str,
buf: BytesIO,
info: MediaPipeline.SpiderInfo,
meta: Optional[dict[str, Any]] = None,
headers: Optional[dict[str, str]] = None,
) -> Optional[Deferred[Any]]: ...
meta: dict[str, Any] | None = None,
headers: dict[str, str] | None = None,
) -> Deferred[Any] | None: ...
def stat_file(
self, path: str, info: MediaPipeline.SpiderInfo
) -> Union[StatInfo, Deferred[StatInfo]]: ...
) -> StatInfo | Deferred[StatInfo]: ...
class FSFilesStore:
def __init__(self, basedir: Union[str, PathLike[str]]):
def __init__(self, basedir: str | PathLike[str]):
basedir = _to_string(basedir)
if "://" in basedir:
basedir = basedir.split("://", 1)[1]
@ -121,18 +111,18 @@ class FSFilesStore:
def persist_file(
self,
path: Union[str, PathLike[str]],
path: str | PathLike[str],
buf: BytesIO,
info: MediaPipeline.SpiderInfo,
meta: Optional[dict[str, Any]] = None,
headers: Optional[dict[str, str]] = None,
meta: dict[str, Any] | None = None,
headers: dict[str, str] | None = None,
) -> None:
absolute_path = self._get_filesystem_path(path)
self._mkdir(absolute_path.parent, info)
absolute_path.write_bytes(buf.getvalue())
def stat_file(
self, path: Union[str, PathLike[str]], info: MediaPipeline.SpiderInfo
self, path: str | PathLike[str], info: MediaPipeline.SpiderInfo
) -> StatInfo:
absolute_path = self._get_filesystem_path(path)
try:
@ -145,12 +135,12 @@ class FSFilesStore:
return {"last_modified": last_modified, "checksum": checksum}
def _get_filesystem_path(self, path: Union[str, PathLike[str]]) -> Path:
def _get_filesystem_path(self, path: str | PathLike[str]) -> Path:
path_comps = _to_string(path).split("/")
return Path(self.basedir, *path_comps)
def _mkdir(
self, dirname: Path, domain: Optional[MediaPipeline.SpiderInfo] = None
self, dirname: Path, domain: MediaPipeline.SpiderInfo | None = None
) -> None:
seen: set[str] = self.created_directories[domain] if domain else set()
if str(dirname) not in seen:
@ -218,8 +208,8 @@ class S3FilesStore:
path: str,
buf: BytesIO,
info: MediaPipeline.SpiderInfo,
meta: Optional[dict[str, Any]] = None,
headers: Optional[dict[str, str]] = None,
meta: dict[str, Any] | None = None,
headers: dict[str, str] | None = None,
) -> Deferred[Any]:
"""Upload file to S3 storage"""
key_name = f"{self.prefix}{path}"
@ -327,7 +317,7 @@ class GCSFilesStore:
deferToThread(self.bucket.get_blob, blob_path).addCallback(_onsuccess),
)
def _get_content_type(self, headers: Optional[dict[str, str]]) -> str:
def _get_content_type(self, headers: dict[str, str] | None) -> str:
if headers and "Content-Type" in headers:
return headers["Content-Type"]
return "application/octet-stream"
@ -340,8 +330,8 @@ class GCSFilesStore:
path: str,
buf: BytesIO,
info: MediaPipeline.SpiderInfo,
meta: Optional[dict[str, Any]] = None,
headers: Optional[dict[str, str]] = None,
meta: dict[str, Any] | None = None,
headers: dict[str, str] | None = None,
) -> Deferred[Any]:
blob_path = self._get_blob_path(path)
blob = self.bucket.blob(blob_path)
@ -356,9 +346,9 @@ class GCSFilesStore:
class FTPFilesStore:
FTP_USERNAME: Optional[str] = None
FTP_PASSWORD: Optional[str] = None
USE_ACTIVE_MODE: Optional[bool] = None
FTP_USERNAME: str | None = None
FTP_PASSWORD: str | None = None
USE_ACTIVE_MODE: bool | None = None
def __init__(self, uri: str):
if not uri.startswith("ftp://"):
@ -380,8 +370,8 @@ class FTPFilesStore:
path: str,
buf: BytesIO,
info: MediaPipeline.SpiderInfo,
meta: Optional[dict[str, Any]] = None,
headers: Optional[dict[str, str]] = None,
meta: dict[str, Any] | None = None,
headers: dict[str, str] | None = None,
) -> Deferred[Any]:
path = f"{self.basedir}/{path}"
return deferToThread(
@ -450,9 +440,9 @@ class FilesPipeline(MediaPipeline):
def __init__(
self,
store_uri: Union[str, PathLike[str]],
download_func: Optional[Callable[[Request, Spider], Response]] = None,
settings: Union[Settings, dict[str, Any], None] = None,
store_uri: str | PathLike[str],
download_func: Callable[[Request, Spider], Response] | None = None,
settings: Settings | dict[str, Any] | None = None,
):
store_uri = _to_string(store_uri)
if not store_uri:
@ -517,8 +507,8 @@ class FilesPipeline(MediaPipeline):
def media_to_download(
self, request: Request, info: MediaPipeline.SpiderInfo, *, item: Any = None
) -> Deferred[Optional[FileInfo]]:
def _onsuccess(result: StatInfo) -> Optional[FileInfo]:
) -> Deferred[FileInfo | None]:
def _onsuccess(result: StatInfo) -> FileInfo | None:
if not result:
return None # returning None force download
@ -551,7 +541,7 @@ class FilesPipeline(MediaPipeline):
path = self.file_path(request, info=info, item=item)
# maybeDeferred() overloads don't seem to support a Union[_T, Deferred[_T]] return type
dfd: Deferred[StatInfo] = maybeDeferred(self.store.stat_file, path, info) # type: ignore[call-overload]
dfd2: Deferred[Optional[FileInfo]] = dfd.addCallback(_onsuccess)
dfd2: Deferred[FileInfo | None] = dfd.addCallback(_onsuccess)
dfd2.addErrback(lambda _: None)
dfd2.addErrback(
lambda f: logger.error(
@ -684,8 +674,8 @@ class FilesPipeline(MediaPipeline):
def file_path(
self,
request: Request,
response: Optional[Response] = None,
info: Optional[MediaPipeline.SpiderInfo] = None,
response: Response | None = None,
info: MediaPipeline.SpiderInfo | None = None,
*,
item: Any = None,
) -> str:

View File

@ -11,7 +11,7 @@ import hashlib
import warnings
from contextlib import suppress
from io import BytesIO
from typing import TYPE_CHECKING, Any, Optional, Union, cast
from typing import TYPE_CHECKING, Any, cast
from itemadapter import ItemAdapter
@ -74,9 +74,9 @@ class ImagesPipeline(FilesPipeline):
def __init__(
self,
store_uri: Union[str, PathLike[str]],
download_func: Optional[Callable[[Request, Spider], Response]] = None,
settings: Union[Settings, dict[str, Any], None] = None,
store_uri: str | PathLike[str],
download_func: Callable[[Request, Spider], Response] | None = None,
settings: Settings | dict[str, Any] | None = None,
):
try:
from PIL import Image
@ -120,7 +120,7 @@ class ImagesPipeline(FilesPipeline):
resolve("IMAGES_THUMBS"), self.THUMBS
)
self._deprecated_convert_image: Optional[bool] = None
self._deprecated_convert_image: bool | None = None
@classmethod
def from_settings(cls, settings: Settings) -> Self:
@ -168,7 +168,7 @@ class ImagesPipeline(FilesPipeline):
*,
item: Any = None,
) -> str:
checksum: Optional[str] = None
checksum: str | None = None
for path, image, buf in self.get_images(response, request, info, item=item):
if checksum is None:
buf.seek(0)
@ -235,8 +235,8 @@ class ImagesPipeline(FilesPipeline):
def convert_image(
self,
image: Image.Image,
size: Optional[tuple[int, int]] = None,
response_body: Optional[BytesIO] = None,
size: tuple[int, int] | None = None,
response_body: BytesIO | None = None,
) -> tuple[Image.Image, BytesIO]:
if response_body is None:
warnings.warn(
@ -291,8 +291,8 @@ class ImagesPipeline(FilesPipeline):
def file_path(
self,
request: Request,
response: Optional[Response] = None,
info: Optional[MediaPipeline.SpiderInfo] = None,
response: Response | None = None,
info: MediaPipeline.SpiderInfo | None = None,
*,
item: Any = None,
) -> str:
@ -303,8 +303,8 @@ class ImagesPipeline(FilesPipeline):
self,
request: Request,
thumb_id: str,
response: Optional[Response] = None,
info: Optional[MediaPipeline.SpiderInfo] = None,
response: Response | None = None,
info: MediaPipeline.SpiderInfo | None = None,
*,
item: Any = None,
) -> str:

View File

@ -9,7 +9,6 @@ from typing import (
Any,
Literal,
NoReturn,
Optional,
TypedDict,
TypeVar,
Union,
@ -44,7 +43,7 @@ _T = TypeVar("_T")
class FileInfo(TypedDict):
url: str
path: str
checksum: Optional[str]
checksum: str | None
status: str
@ -64,15 +63,15 @@ class MediaPipeline(ABC):
def __init__(self, spider: Spider):
self.spider: Spider = spider
self.downloading: set[bytes] = set()
self.downloaded: dict[bytes, Union[FileInfo, Failure]] = {}
self.downloaded: dict[bytes, FileInfo | Failure] = {}
self.waiting: defaultdict[bytes, list[Deferred[FileInfo]]] = defaultdict(
list
)
def __init__(
self,
download_func: Optional[Callable[[Request, Spider], Response]] = None,
settings: Union[Settings, dict[str, Any], None] = None,
download_func: Callable[[Request, Spider], Response] | None = None,
settings: Settings | dict[str, Any] | None = None,
):
self.download_func = download_func
@ -94,8 +93,8 @@ class MediaPipeline(ABC):
def _key_for_pipe(
self,
key: str,
base_class_name: Optional[str] = None,
settings: Optional[Settings] = None,
base_class_name: str | None = None,
settings: Settings | None = None,
) -> str:
class_name = self.__class__.__name__
formatted_key = f"{class_name.upper()}_{key}"
@ -161,7 +160,7 @@ class MediaPipeline(ABC):
# Download request checking media_to_download hook output first
info.downloading.add(fp)
dfd: Deferred[Optional[FileInfo]] = mustbe_deferred(
dfd: Deferred[FileInfo | None] = mustbe_deferred(
self.media_to_download, request, info, item=item
)
dfd2: Deferred[FileInfo] = dfd.addCallback(
@ -182,8 +181,8 @@ class MediaPipeline(ABC):
request.meta["handle_httpstatus_all"] = True
def _check_media_to_download(
self, result: Optional[FileInfo], request: Request, info: SpiderInfo, item: Any
) -> Union[FileInfo, Deferred[FileInfo]]:
self, result: FileInfo | None, request: Request, info: SpiderInfo, item: Any
) -> FileInfo | Deferred[FileInfo]:
if result is not None:
return result
dfd: Deferred[Response]
@ -201,7 +200,7 @@ class MediaPipeline(ABC):
return dfd2
def _cache_result_and_execute_waiters(
self, result: Union[FileInfo, Failure], fp: bytes, info: SpiderInfo
self, result: FileInfo | Failure, fp: bytes, info: SpiderInfo
) -> None:
if isinstance(result, Failure):
# minimize cached information for failure
@ -243,7 +242,7 @@ class MediaPipeline(ABC):
@abstractmethod
def media_to_download(
self, request: Request, info: SpiderInfo, *, item: Any = None
) -> Deferred[Optional[FileInfo]]:
) -> Deferred[FileInfo | None]:
"""Check request before starting download"""
raise NotImplementedError()
@ -291,8 +290,8 @@ class MediaPipeline(ABC):
def file_path(
self,
request: Request,
response: Optional[Response] = None,
info: Optional[SpiderInfo] = None,
response: Response | None = None,
info: SpiderInfo | None = None,
*,
item: Any = None,
) -> str:

View File

@ -2,7 +2,7 @@ from __future__ import annotations
import hashlib
import logging
from typing import TYPE_CHECKING, Optional, Protocol, cast
from typing import TYPE_CHECKING, Protocol, cast
from scrapy import Request
from scrapy.core.downloader import Downloader
@ -42,7 +42,7 @@ class QueueProtocol(Protocol):
def push(self, request: Request) -> None: ...
def pop(self) -> Optional[Request]: ...
def pop(self) -> Request | None: ...
def close(self) -> None: ...
@ -96,7 +96,7 @@ class ScrapyPriorityQueue:
self.downstream_queue_cls: type[QueueProtocol] = downstream_queue_cls
self.key: str = key
self.queues: dict[int, QueueProtocol] = {}
self.curprio: Optional[int] = None
self.curprio: int | None = None
self.init_prios(startprios)
def init_prios(self, startprios: Iterable[int]) -> None:
@ -127,7 +127,7 @@ class ScrapyPriorityQueue:
if self.curprio is None or priority < self.curprio:
self.curprio = priority
def pop(self) -> Optional[Request]:
def pop(self) -> Request | None:
if self.curprio is None:
return None
q = self.queues[self.curprio]
@ -139,7 +139,7 @@ class ScrapyPriorityQueue:
self.curprio = min(prios) if prios else None
return m
def peek(self) -> Optional[Request]:
def peek(self) -> Request | None:
"""Returns the next object to be returned by :meth:`pop`,
but without removing it from the queue.
@ -193,7 +193,7 @@ class DownloaderAwarePriorityQueue:
crawler: Crawler,
downstream_queue_cls: type[QueueProtocol],
key: str,
startprios: Optional[dict[str, Iterable[int]]] = None,
startprios: dict[str, Iterable[int]] | None = None,
) -> Self:
return cls(crawler, downstream_queue_cls, key, startprios)
@ -202,7 +202,7 @@ class DownloaderAwarePriorityQueue:
crawler: Crawler,
downstream_queue_cls: type[QueueProtocol],
key: str,
slot_startprios: Optional[dict[str, Iterable[int]]] = None,
slot_startprios: dict[str, Iterable[int]] | None = None,
):
if crawler.settings.getint("CONCURRENT_REQUESTS_PER_IP") != 0:
raise ValueError(
@ -239,7 +239,7 @@ class DownloaderAwarePriorityQueue:
startprios,
)
def pop(self) -> Optional[Request]:
def pop(self) -> Request | None:
stats = self._downloader_interface.stats(self.pqueues)
if not stats:
@ -259,7 +259,7 @@ class DownloaderAwarePriorityQueue:
queue = self.pqueues[slot]
queue.push(request)
def peek(self) -> Optional[Request]:
def peek(self) -> Request | None:
"""Returns the next object to be returned by :meth:`pop`,
but without removing it from the queue.
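QueueProtocol above is structural: anything whose push, pop and close methods line up satisfies it, and the Request | None return type is just a postponed annotation. A toy illustration with simplified names and str items instead of requests:

from __future__ import annotations

from typing import Protocol, runtime_checkable


@runtime_checkable
class QueueLike(Protocol):  # simplified stand-in for QueueProtocol
    def push(self, item: str) -> None: ...
    def pop(self) -> str | None: ...


class MemoryQueue:
    def __init__(self) -> None:
        self._items: list[str] = []

    def push(self, item: str) -> None:
        self._items.append(item)

    def pop(self) -> str | None:
        return self._items.pop(0) if self._items else None


queue = MemoryQueue()
print(isinstance(queue, QueueLike))  # True: the runtime check only looks at method names
queue.push("https://example.com")
print(queue.pop(), queue.pop())  # https://example.com None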

View File

@ -1,6 +1,6 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any
from twisted.internet import defer
from twisted.internet.base import ReactorBase, ThreadedResolver
@ -128,7 +128,7 @@ class CachingHostnameResolver:
resolutionReceiver: IResolutionReceiver,
hostName: str,
portNumber: int = 0,
addressTypes: Optional[Sequence[type[IAddress]]] = None,
addressTypes: Sequence[type[IAddress]] | None = None,
transportSemantics: str = "TCP",
) -> IHostResolution:
try:

View File

@ -8,7 +8,7 @@ from __future__ import annotations
from io import StringIO
from mimetypes import MimeTypes
from pkgutil import get_data
from typing import TYPE_CHECKING, Optional, Union
from typing import TYPE_CHECKING
from scrapy.http import Response
from scrapy.utils.misc import load_object
@ -58,7 +58,7 @@ class ResponseTypes:
return self.classes.get(basetype, Response)
def from_content_type(
self, content_type: Union[str, bytes], content_encoding: Optional[bytes] = None
self, content_type: str | bytes, content_encoding: bytes | None = None
) -> type[Response]:
"""Return the most appropriate Response class from an HTTP Content-Type
header"""
@ -70,7 +70,7 @@ class ResponseTypes:
return self.from_mimetype(mimetype)
def from_content_disposition(
self, content_disposition: Union[str, bytes]
self, content_disposition: str | bytes
) -> type[Response]:
try:
filename = (
@ -123,10 +123,10 @@ class ResponseTypes:
def from_args(
self,
headers: Optional[Mapping[bytes, bytes]] = None,
url: Optional[str] = None,
filename: Optional[str] = None,
body: Optional[bytes] = None,
headers: Mapping[bytes, bytes] | None = None,
url: str | None = None,
filename: str | None = None,
body: bytes | None = None,
) -> type[Response]:
"""Guess the most appropriate Response class based on
the given arguments."""

View File

@ -3,7 +3,7 @@ from __future__ import annotations
import logging
import sys
from abc import ABCMeta, abstractmethod
from typing import TYPE_CHECKING, Optional, Union
from typing import TYPE_CHECKING
from warnings import warn
from scrapy.exceptions import ScrapyDeprecationWarning
@ -21,7 +21,7 @@ logger = logging.getLogger(__name__)
def decode_robotstxt(
robotstxt_body: bytes, spider: Optional[Spider], to_native_str_type: bool = False
robotstxt_body: bytes, spider: Spider | None, to_native_str_type: bool = False
) -> str:
try:
if to_native_str_type:
@ -57,7 +57,7 @@ class RobotParser(metaclass=ABCMeta):
pass
@abstractmethod
def allowed(self, url: Union[str, bytes], user_agent: Union[str, bytes]) -> bool:
def allowed(self, url: str | bytes, user_agent: str | bytes) -> bool:
"""Return ``True`` if ``user_agent`` is allowed to crawl ``url``, otherwise return ``False``.
:param url: Absolute URL
@ -70,10 +70,10 @@ class RobotParser(metaclass=ABCMeta):
class PythonRobotParser(RobotParser):
def __init__(self, robotstxt_body: bytes, spider: Optional[Spider]):
def __init__(self, robotstxt_body: bytes, spider: Spider | None):
from urllib.robotparser import RobotFileParser
self.spider: Optional[Spider] = spider
self.spider: Spider | None = spider
body_decoded = decode_robotstxt(robotstxt_body, spider, to_native_str_type=True)
self.rp: RobotFileParser = RobotFileParser()
self.rp.parse(body_decoded.splitlines())
@ -84,18 +84,18 @@ class PythonRobotParser(RobotParser):
o = cls(robotstxt_body, spider)
return o
def allowed(self, url: Union[str, bytes], user_agent: Union[str, bytes]) -> bool:
def allowed(self, url: str | bytes, user_agent: str | bytes) -> bool:
user_agent = to_unicode(user_agent)
url = to_unicode(url)
return self.rp.can_fetch(user_agent, url)
class ReppyRobotParser(RobotParser):
def __init__(self, robotstxt_body: bytes, spider: Optional[Spider]):
def __init__(self, robotstxt_body: bytes, spider: Spider | None):
warn("ReppyRobotParser is deprecated.", ScrapyDeprecationWarning, stacklevel=2)
from reppy.robots import Robots
self.spider: Optional[Spider] = spider
self.spider: Spider | None = spider
self.rp = Robots.parse("", robotstxt_body)
@classmethod
@ -104,15 +104,15 @@ class ReppyRobotParser(RobotParser):
o = cls(robotstxt_body, spider)
return o
def allowed(self, url: Union[str, bytes], user_agent: Union[str, bytes]) -> bool:
def allowed(self, url: str | bytes, user_agent: str | bytes) -> bool:
return self.rp.allowed(url, user_agent)
class RerpRobotParser(RobotParser):
def __init__(self, robotstxt_body: bytes, spider: Optional[Spider]):
def __init__(self, robotstxt_body: bytes, spider: Spider | None):
from robotexclusionrulesparser import RobotExclusionRulesParser
self.spider: Optional[Spider] = spider
self.spider: Spider | None = spider
self.rp: RobotExclusionRulesParser = RobotExclusionRulesParser()
body_decoded = decode_robotstxt(robotstxt_body, spider)
self.rp.parse(body_decoded)
@ -123,17 +123,17 @@ class RerpRobotParser(RobotParser):
o = cls(robotstxt_body, spider)
return o
def allowed(self, url: Union[str, bytes], user_agent: Union[str, bytes]) -> bool:
def allowed(self, url: str | bytes, user_agent: str | bytes) -> bool:
user_agent = to_unicode(user_agent)
url = to_unicode(url)
return self.rp.is_allowed(user_agent, url)
class ProtegoRobotParser(RobotParser):
def __init__(self, robotstxt_body: bytes, spider: Optional[Spider]):
def __init__(self, robotstxt_body: bytes, spider: Spider | None):
from protego import Protego
self.spider: Optional[Spider] = spider
self.spider: Spider | None = spider
body_decoded = decode_robotstxt(robotstxt_body, spider)
self.rp = Protego.parse(body_decoded)
@ -143,7 +143,7 @@ class ProtegoRobotParser(RobotParser):
o = cls(robotstxt_body, spider)
return o
def allowed(self, url: Union[str, bytes], user_agent: Union[str, bytes]) -> bool:
def allowed(self, url: str | bytes, user_agent: str | bytes) -> bool:
user_agent = to_unicode(user_agent)
url = to_unicode(url)
return self.rp.can_fetch(url, user_agent)
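The parsers above accept url and user_agent as str | bytes and mostly normalize with to_unicode before delegating. The same boundary-normalization pattern, using only the standard library's RobotFileParser (which is what PythonRobotParser wraps):

from __future__ import annotations

from urllib.robotparser import RobotFileParser


def to_unicode(text: str | bytes, encoding: str = "utf-8") -> str:
    return text if isinstance(text, str) else text.decode(encoding)


ROBOTS_BODY = b"User-agent: *\nDisallow: /private/\n"

rp = RobotFileParser()
rp.parse(to_unicode(ROBOTS_BODY).splitlines())


def allowed(url: str | bytes, user_agent: str | bytes) -> bool:
    # Accept either type at the edge, hand str to the parser.
    return rp.can_fetch(to_unicode(user_agent), to_unicode(url))


print(allowed(b"https://example.com/private/page", "*"))  # False
print(allowed("https://example.com/public", b"*"))        # True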

View File

@ -2,7 +2,9 @@
XPath selectors based on lxml
"""
from typing import Any, Optional, Union
from __future__ import annotations
from typing import Any
from parsel import Selector as _ParselSelector
@ -16,13 +18,13 @@ __all__ = ["Selector", "SelectorList"]
_NOT_SET = object()
def _st(response: Optional[TextResponse], st: Optional[str]) -> str:
def _st(response: TextResponse | None, st: str | None) -> str:
if st is None:
return "xml" if isinstance(response, XmlResponse) else "html"
return st
def _response_from_text(text: Union[str, bytes], st: Optional[str]) -> TextResponse:
def _response_from_text(text: str | bytes, st: str | None) -> TextResponse:
rt: type[TextResponse] = XmlResponse if st == "xml" else HtmlResponse
return rt(url="about:blank", encoding="utf-8", body=to_bytes(text, "utf-8"))
@ -71,10 +73,10 @@ class Selector(_ParselSelector, object_ref):
def __init__(
self,
response: Optional[TextResponse] = None,
text: Optional[str] = None,
type: Optional[str] = None,
root: Optional[Any] = _NOT_SET,
response: TextResponse | None = None,
text: str | None = None,
type: str | None = None,
root: Any | None = _NOT_SET,
**kwargs: Any,
):
if response is not None and text is not None:

View File

@ -5,7 +5,7 @@ import json
from collections.abc import Iterable, Iterator, Mapping, MutableMapping
from importlib import import_module
from pprint import pformat
from typing import TYPE_CHECKING, Any, Optional, Union, cast
from typing import TYPE_CHECKING, Any, Union, cast
from scrapy.settings import default_settings
@ -35,7 +35,7 @@ SETTINGS_PRIORITIES: dict[str, int] = {
}
def get_settings_priority(priority: Union[int, str]) -> int:
def get_settings_priority(priority: int | str) -> int:
"""
Small helper function that looks up a given string priority in the
:attr:`~scrapy.settings.SETTINGS_PRIORITIES` dictionary and returns its
@ -97,9 +97,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
__default = object()
def __init__(
self, values: _SettingsInputT = None, priority: Union[int, str] = "project"
):
def __init__(self, values: _SettingsInputT = None, priority: int | str = "project"):
self.frozen: bool = False
self.attributes: dict[_SettingsKeyT, SettingsAttribute] = {}
if values:
@ -180,7 +178,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
return float(self.get(name, default))
def getlist(
self, name: _SettingsKeyT, default: Optional[list[Any]] = None
self, name: _SettingsKeyT, default: list[Any] | None = None
) -> list[Any]:
"""
Get a setting value as a list. If the setting original type is a list, a
@ -201,7 +199,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
return list(value)
def getdict(
self, name: _SettingsKeyT, default: Optional[dict[Any, Any]] = None
self, name: _SettingsKeyT, default: dict[Any, Any] | None = None
) -> dict[Any, Any]:
"""
Get a setting value as a dictionary. If the setting original type is a
@ -226,8 +224,8 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
def getdictorlist(
self,
name: _SettingsKeyT,
default: Union[dict[Any, Any], list[Any], tuple[Any], None] = None,
) -> Union[dict[Any, Any], list[Any]]:
default: dict[Any, Any] | list[Any] | tuple[Any] | None = None,
) -> dict[Any, Any] | list[Any]:
"""Get a setting value as either a :class:`dict` or a :class:`list`.
If the setting is already a dict or a list, a copy of it will be
@ -278,7 +276,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
compbs.update(self[name])
return compbs
def getpriority(self, name: _SettingsKeyT) -> Optional[int]:
def getpriority(self, name: _SettingsKeyT) -> int | None:
"""
Return the current numerical priority value of a setting, or ``None`` if
the given ``name`` does not exist.
@ -305,7 +303,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
self.set(name, value)
def set(
self, name: _SettingsKeyT, value: Any, priority: Union[int, str] = "project"
self, name: _SettingsKeyT, value: Any, priority: int | str = "project"
) -> None:
"""
Store a key/value attribute with a given priority.
@ -338,7 +336,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
self,
name: _SettingsKeyT,
default: Any = None,
priority: Union[int, str] = "project",
priority: int | str = "project",
) -> Any:
if name not in self:
self.set(name, default, priority)
@ -346,13 +344,11 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
return self.attributes[name].value
def setdict(
self, values: _SettingsInputT, priority: Union[int, str] = "project"
) -> None:
def setdict(self, values: _SettingsInputT, priority: int | str = "project") -> None:
self.update(values, priority)
def setmodule(
self, module: Union[ModuleType, str], priority: Union[int, str] = "project"
self, module: ModuleType | str, priority: int | str = "project"
) -> None:
"""
Store settings from a module with a given priority.
@ -376,7 +372,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
self.set(key, getattr(module, key), priority)
# BaseSettings.update() doesn't support all inputs that MutableMapping.update() supports
def update(self, values: _SettingsInputT, priority: Union[int, str] = "project") -> None: # type: ignore[override]
def update(self, values: _SettingsInputT, priority: int | str = "project") -> None: # type: ignore[override]
"""
Store key/value pairs with a given priority.
@ -409,9 +405,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
for name, value in values.items():
self.set(name, value, priority)
def delete(
self, name: _SettingsKeyT, priority: Union[int, str] = "project"
) -> None:
def delete(self, name: _SettingsKeyT, priority: int | str = "project") -> None:
if name not in self:
raise KeyError(name)
self._assert_mutability()
@ -525,9 +519,7 @@ class Settings(BaseSettings):
described on :ref:`topics-settings-ref` already populated.
"""
def __init__(
self, values: _SettingsInputT = None, priority: Union[int, str] = "project"
):
def __init__(self, values: _SettingsInputT = None, priority: int | str = "project"):
# Do not pass kwarg values here. We don't want to promote user-defined
# dicts, and we want to update, not replace, default dicts with the
# values given by the user
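The priority arguments in this file are now typed int | str. One caveat while 3.9 remains supported: that spelling is free inside annotations, but isinstance(x, int | str) only works on 3.10+, so any runtime check still uses the tuple form. A sketch with a made-up priority map, not the real SETTINGS_PRIORITIES:

from __future__ import annotations

PRIORITIES = {"default": 0, "project": 20}  # invented for illustration


def settings_priority(priority: int | str) -> int:
    # Tuple form keeps the runtime check valid on every supported Python version.
    if isinstance(priority, (int, str)):
        return priority if isinstance(priority, int) else PRIORITIES[priority]
    raise TypeError(f"unsupported priority: {priority!r}")


print(settings_priority("project"), settings_priority(5))  # 20 5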

View File

@ -8,7 +8,7 @@ from __future__ import annotations
import os
import signal
from typing import TYPE_CHECKING, Any, Optional, Union
from typing import TYPE_CHECKING, Any
from itemadapter import is_item
from twisted.internet import defer, threads
@ -37,25 +37,25 @@ class Shell:
def __init__(
self,
crawler: Crawler,
update_vars: Optional[Callable[[dict[str, Any]], None]] = None,
code: Optional[str] = None,
update_vars: Callable[[dict[str, Any]], None] | None = None,
code: str | None = None,
):
self.crawler: Crawler = crawler
self.update_vars: Callable[[dict[str, Any]], None] = update_vars or (
lambda x: None
)
self.item_class: type = load_object(crawler.settings["DEFAULT_ITEM_CLASS"])
self.spider: Optional[Spider] = None
self.spider: Spider | None = None
self.inthread: bool = not threadable.isInIOThread()
self.code: Optional[str] = code
self.code: str | None = code
self.vars: dict[str, Any] = {}
def start(
self,
url: Optional[str] = None,
request: Optional[Request] = None,
response: Optional[Response] = None,
spider: Optional[Spider] = None,
url: str | None = None,
request: Request | None = None,
response: Response | None = None,
spider: Spider | None = None,
redirect: bool = True,
) -> None:
# disable accidental Ctrl-C key press from shutting down the engine
@ -97,9 +97,7 @@ class Shell:
self.vars, shells=shells, banner=self.vars.pop("banner", "")
)
def _schedule(
self, request: Request, spider: Optional[Spider]
) -> defer.Deferred[Any]:
def _schedule(self, request: Request, spider: Spider | None) -> defer.Deferred[Any]:
if is_asyncio_reactor_installed():
# set the asyncio event loop for the current thread
event_loop_path = self.crawler.settings["ASYNCIO_EVENT_LOOP"]
@ -111,7 +109,7 @@ class Shell:
self.crawler.engine.crawl(request)
return d
def _open_spider(self, request: Request, spider: Optional[Spider]) -> Spider:
def _open_spider(self, request: Request, spider: Spider | None) -> Spider:
if self.spider:
return self.spider
@ -126,8 +124,8 @@ class Shell:
def fetch(
self,
request_or_url: Union[Request, str],
spider: Optional[Spider] = None,
request_or_url: Request | str,
spider: Spider | None = None,
redirect: bool = True,
**kwargs: Any,
) -> None:
@ -155,9 +153,9 @@ class Shell:
def populate_vars(
self,
response: Optional[Response] = None,
request: Optional[Request] = None,
spider: Optional[Spider] = None,
response: Response | None = None,
request: Request | None = None,
spider: Spider | None = None,
) -> None:
import scrapy

View File

@ -7,7 +7,7 @@ See documentation in docs/topics/spider-middleware.rst
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any
from scrapy.exceptions import IgnoreRequest
@ -65,7 +65,7 @@ class HttpErrorMiddleware:
def process_spider_exception(
self, response: Response, exception: Exception, spider: Spider
) -> Optional[Iterable[Any]]:
) -> Iterable[Any] | None:
if isinstance(exception, HttpError):
assert spider.crawler.stats
spider.crawler.stats.inc_value("httperror/response_ignored_count")

View File

@ -6,7 +6,7 @@ originated it.
from __future__ import annotations
import warnings
from typing import TYPE_CHECKING, Any, Optional, Union, cast
from typing import TYPE_CHECKING, Any, cast
from urllib.parse import urlparse
from w3lib.url import safe_url_string
@ -50,20 +50,20 @@ class ReferrerPolicy:
NOREFERRER_SCHEMES: tuple[str, ...] = LOCAL_SCHEMES
name: str
def referrer(self, response_url: str, request_url: str) -> Optional[str]:
def referrer(self, response_url: str, request_url: str) -> str | None:
raise NotImplementedError()
def stripped_referrer(self, url: str) -> Optional[str]:
def stripped_referrer(self, url: str) -> str | None:
if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
return self.strip_url(url)
return None
def origin_referrer(self, url: str) -> Optional[str]:
def origin_referrer(self, url: str) -> str | None:
if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
return self.origin(url)
return None
def strip_url(self, url: str, origin_only: bool = False) -> Optional[str]:
def strip_url(self, url: str, origin_only: bool = False) -> str | None:
"""
https://www.w3.org/TR/referrer-policy/#strip-url
@ -87,7 +87,7 @@ class ReferrerPolicy:
origin_only=origin_only,
)
def origin(self, url: str) -> Optional[str]:
def origin(self, url: str) -> str | None:
"""Return serialized origin (scheme, host, path) for a request or response URL."""
return self.strip_url(url, origin_only=True)
@ -113,7 +113,7 @@ class NoReferrerPolicy(ReferrerPolicy):
name: str = POLICY_NO_REFERRER
def referrer(self, response_url: str, request_url: str) -> Optional[str]:
def referrer(self, response_url: str, request_url: str) -> str | None:
return None
@ -134,7 +134,7 @@ class NoReferrerWhenDowngradePolicy(ReferrerPolicy):
name: str = POLICY_NO_REFERRER_WHEN_DOWNGRADE
def referrer(self, response_url: str, request_url: str) -> Optional[str]:
def referrer(self, response_url: str, request_url: str) -> str | None:
if not self.tls_protected(response_url) or self.tls_protected(request_url):
return self.stripped_referrer(response_url)
return None
@ -153,7 +153,7 @@ class SameOriginPolicy(ReferrerPolicy):
name: str = POLICY_SAME_ORIGIN
def referrer(self, response_url: str, request_url: str) -> Optional[str]:
def referrer(self, response_url: str, request_url: str) -> str | None:
if self.origin(response_url) == self.origin(request_url):
return self.stripped_referrer(response_url)
return None
@ -171,7 +171,7 @@ class OriginPolicy(ReferrerPolicy):
name: str = POLICY_ORIGIN
def referrer(self, response_url: str, request_url: str) -> Optional[str]:
def referrer(self, response_url: str, request_url: str) -> str | None:
return self.origin_referrer(response_url)
@ -191,7 +191,7 @@ class StrictOriginPolicy(ReferrerPolicy):
name: str = POLICY_STRICT_ORIGIN
def referrer(self, response_url: str, request_url: str) -> Optional[str]:
def referrer(self, response_url: str, request_url: str) -> str | None:
if (
self.tls_protected(response_url)
and self.potentially_trustworthy(request_url)
@ -215,7 +215,7 @@ class OriginWhenCrossOriginPolicy(ReferrerPolicy):
name: str = POLICY_ORIGIN_WHEN_CROSS_ORIGIN
def referrer(self, response_url: str, request_url: str) -> Optional[str]:
def referrer(self, response_url: str, request_url: str) -> str | None:
origin = self.origin(response_url)
if origin == self.origin(request_url):
return self.stripped_referrer(response_url)
@ -242,7 +242,7 @@ class StrictOriginWhenCrossOriginPolicy(ReferrerPolicy):
name: str = POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN
def referrer(self, response_url: str, request_url: str) -> Optional[str]:
def referrer(self, response_url: str, request_url: str) -> str | None:
origin = self.origin(response_url)
if origin == self.origin(request_url):
return self.stripped_referrer(response_url)
@ -271,7 +271,7 @@ class UnsafeUrlPolicy(ReferrerPolicy):
name: str = POLICY_UNSAFE_URL
def referrer(self, response_url: str, request_url: str) -> Optional[str]:
def referrer(self, response_url: str, request_url: str) -> str | None:
return self.stripped_referrer(response_url)
@ -307,7 +307,7 @@ _policy_classes[""] = NoReferrerWhenDowngradePolicy
def _load_policy_class(
policy: str, warning_only: bool = False
) -> Optional[type[ReferrerPolicy]]:
) -> type[ReferrerPolicy] | None:
"""
Expect a string for the path to the policy class,
otherwise try to interpret the string as a standard value
@ -331,7 +331,7 @@ def _load_policy_class(
class RefererMiddleware:
def __init__(self, settings: Optional[BaseSettings] = None):
def __init__(self, settings: BaseSettings | None = None):
self.default_policy: type[ReferrerPolicy] = DefaultReferrerPolicy
if settings is not None:
settings_policy = _load_policy_class(settings.get("REFERRER_POLICY"))
@ -349,9 +349,7 @@ class RefererMiddleware:
return mw
def policy(
self, resp_or_url: Union[Response, str], request: Request
) -> ReferrerPolicy:
def policy(self, resp_or_url: Response | str, request: Request) -> ReferrerPolicy:
"""
Determine Referrer-Policy to use from a parent Response (or URL),
and a Request to be sent.

View File

@ -7,7 +7,7 @@ See documentation in docs/topics/spiders.rst
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Any, Optional, cast
from typing import TYPE_CHECKING, Any, cast
from scrapy import signals
from scrapy.http import Request, Response
@ -34,9 +34,9 @@ class Spider(object_ref):
"""
name: str
custom_settings: Optional[dict[_SettingsKeyT, Any]] = None
custom_settings: dict[_SettingsKeyT, Any] | None = None
def __init__(self, name: Optional[str] = None, **kwargs: Any):
def __init__(self, name: str | None = None, **kwargs: Any):
if name is not None:
self.name: str = name
elif not getattr(self, "name", None):
@ -103,10 +103,10 @@ class Spider(object_ref):
return url_is_from_spider(request.url, cls)
@staticmethod
def close(spider: Spider, reason: str) -> Optional[Deferred[None]]:
def close(spider: Spider, reason: str) -> Deferred[None] | None:
closed = getattr(spider, "closed", None)
if callable(closed):
return cast("Optional[Deferred[None]]", closed(reason))
return cast("Deferred[None] | None", closed(reason))
return None
def __repr__(self) -> str:

View File

@ -9,7 +9,7 @@ from __future__ import annotations
import copy
from collections.abc import AsyncIterable, Awaitable, Callable
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
from typing import TYPE_CHECKING, Any, Optional, TypeVar, cast
from twisted.python.failure import Failure
@ -39,15 +39,11 @@ def _identity(x: _T) -> _T:
return x
def _identity_process_request(
request: Request, response: Response
) -> Optional[Request]:
def _identity_process_request(request: Request, response: Response) -> Request | None:
return request
def _get_method(
method: Union[Callable, str, None], spider: Spider
) -> Optional[Callable]:
def _get_method(method: Callable | str | None, spider: Spider) -> Callable | None:
if callable(method):
return method
if isinstance(method, str):
@ -61,20 +57,20 @@ _default_link_extractor = LinkExtractor()
class Rule:
def __init__(
self,
link_extractor: Optional[LinkExtractor] = None,
callback: Union[CallbackT, str, None] = None,
cb_kwargs: Optional[dict[str, Any]] = None,
follow: Optional[bool] = None,
process_links: Union[ProcessLinksT, str, None] = None,
process_request: Union[ProcessRequestT, str, None] = None,
errback: Union[Callable[[Failure], Any], str, None] = None,
link_extractor: LinkExtractor | None = None,
callback: CallbackT | str | None = None,
cb_kwargs: dict[str, Any] | None = None,
follow: bool | None = None,
process_links: ProcessLinksT | str | None = None,
process_request: ProcessRequestT | str | None = None,
errback: Callable[[Failure], Any] | str | None = None,
):
self.link_extractor: LinkExtractor = link_extractor or _default_link_extractor
self.callback: Union[CallbackT, str, None] = callback
self.errback: Union[Callable[[Failure], Any], str, None] = errback
self.callback: CallbackT | str | None = callback
self.errback: Callable[[Failure], Any] | str | None = errback
self.cb_kwargs: dict[str, Any] = cb_kwargs or {}
self.process_links: Union[ProcessLinksT, str] = process_links or _identity
self.process_request: Union[ProcessRequestT, str] = (
self.process_links: ProcessLinksT | str = process_links or _identity
self.process_request: ProcessRequestT | str = (
process_request or _identity_process_request
)
self.follow: bool = follow if follow is not None else not callback
@ -124,7 +120,7 @@ class CrawlSpider(Spider):
meta={"rule": rule_index, "link_text": link.text},
)
def _requests_to_follow(self, response: Response) -> Iterable[Optional[Request]]:
def _requests_to_follow(self, response: Response) -> Iterable[Request | None]:
if not isinstance(response, HtmlResponse):
return
seen: set[Link] = set()
@ -157,7 +153,7 @@ class CrawlSpider(Spider):
async def _parse_response(
self,
response: Response,
callback: Optional[CallbackT],
callback: CallbackT | None,
cb_kwargs: dict[str, Any],
follow: bool = True,
) -> AsyncIterable[Any]:
@ -176,7 +172,7 @@ class CrawlSpider(Spider):
yield request_or_item
def _handle_failure(
self, failure: Failure, errback: Optional[Callable[[Failure], Any]]
self, failure: Failure, errback: Callable[[Failure], Any] | None
) -> Iterable[Any]:
if errback:
results = errback(failure) or ()
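The _get_method helper above now reads Callable | str | None: a Rule may reference its callback either directly or by the name of a spider method. A compact illustration of that resolution logic (names invented, no Scrapy imports):

from __future__ import annotations

from collections.abc import Callable


class MiniSpider:  # invented spider-like object
    def parse_item(self, response: str) -> str:
        return f"parsed {response}"


def get_method(method: Callable | str | None, spider: MiniSpider) -> Callable | None:
    if callable(method):
        return method
    if isinstance(method, str):
        return getattr(spider, method, None)
    return None


spider = MiniSpider()
for ref in (spider.parse_item, "parse_item", None):
    resolved = get_method(ref, spider)
    print(resolved("page-1") if resolved else "no callback")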

View File

@ -7,7 +7,7 @@ See documentation in docs/topics/spiders.rst
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any
from scrapy.exceptions import NotConfigured, NotSupported
from scrapy.http import Response, TextResponse
@ -117,13 +117,13 @@ class CSVFeedSpider(Spider):
and the file's headers.
"""
delimiter: Optional[str] = (
delimiter: str | None = (
None # When this is None, python's csv module's default delimiter is used
)
quotechar: Optional[str] = (
quotechar: str | None = (
None # When this is None, python's csv module's default quotechar is used
)
headers: Optional[list[str]] = None
headers: list[str] | None = None
def process_results(
self, response: Response, results: Iterable[Any]

View File

@ -1,7 +1,7 @@
from __future__ import annotations
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any, Optional, cast
from typing import TYPE_CHECKING, Any, cast
from scrapy import Request
from scrapy.spiders import Spider
@ -18,7 +18,7 @@ class InitSpider(Spider):
self._postinit_reqs: Iterable[Request] = super().start_requests()
return cast(Iterable[Request], iterate_spider_output(self.init_request()))
def initialized(self, response: Optional[Response] = None) -> Any:
def initialized(self, response: Response | None = None) -> Any:
"""This method must be set as the callback of your last initialization
request. See self.init_request() docstring for more info.
"""

View File

@ -2,7 +2,7 @@ from __future__ import annotations
import logging
import re
from typing import TYPE_CHECKING, Any, Optional, Union, cast
from typing import TYPE_CHECKING, Any, cast
from scrapy.http import Request, Response, XmlResponse
from scrapy.spiders import Spider
@ -24,10 +24,10 @@ logger = logging.getLogger(__name__)
class SitemapSpider(Spider):
sitemap_urls: Sequence[str] = ()
sitemap_rules: Sequence[
tuple[Union[re.Pattern[str], str], Union[str, CallbackT]]
] = [("", "parse")]
sitemap_follow: Sequence[Union[re.Pattern[str], str]] = [""]
sitemap_rules: Sequence[tuple[re.Pattern[str] | str, str | CallbackT]] = [
("", "parse")
]
sitemap_follow: Sequence[re.Pattern[str] | str] = [""]
sitemap_alternate_links: bool = False
_max_size: int
_warn_size: int
@ -93,7 +93,7 @@ class SitemapSpider(Spider):
yield Request(loc, callback=c)
break
def _get_sitemap_body(self, response: Response) -> Optional[bytes]:
def _get_sitemap_body(self, response: Response) -> bytes | None:
"""Return the sitemap body contained in the given response,
or None if the response is not a sitemap.
"""
@ -127,7 +127,7 @@ class SitemapSpider(Spider):
return None
def regex(x: Union[re.Pattern[str], str]) -> re.Pattern[str]:
def regex(x: re.Pattern[str] | str) -> re.Pattern[str]:
if isinstance(x, str):
return re.compile(x)
return x

View File

@ -7,7 +7,7 @@ from __future__ import annotations
import marshal
import pickle # nosec
from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional, Union
from typing import TYPE_CHECKING, Any
from queuelib import queue
@ -26,7 +26,7 @@ if TYPE_CHECKING:
def _with_mkdir(queue_class: type[queue.BaseQueue]) -> type[queue.BaseQueue]:
class DirectoriesCreated(queue_class): # type: ignore[valid-type,misc]
def __init__(self, path: Union[str, PathLike], *args: Any, **kwargs: Any):
def __init__(self, path: str | PathLike, *args: Any, **kwargs: Any):
dirname = Path(path).parent
if not dirname.exists():
dirname.mkdir(parents=True, exist_ok=True)
@ -45,13 +45,13 @@ def _serializable_queue(
s = serialize(obj)
super().push(s)
def pop(self) -> Optional[Any]:
def pop(self) -> Any | None:
s = super().pop()
if s:
return deserialize(s)
return None
def peek(self) -> Optional[Any]:
def peek(self) -> Any | None:
"""Returns the next object to be returned by :meth:`pop`,
but without removing it from the queue.
@ -89,13 +89,13 @@ def _scrapy_serialization_queue(
request_dict = request.to_dict(spider=self.spider)
super().push(request_dict)
def pop(self) -> Optional[Request]:
def pop(self) -> Request | None:
request = super().pop()
if not request:
return None
return request_from_dict(request, spider=self.spider)
def peek(self) -> Optional[Request]:
def peek(self) -> Request | None:
"""Returns the next object to be returned by :meth:`pop`,
but without removing it from the queue.
@ -118,7 +118,7 @@ def _scrapy_non_serialization_queue(
def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any) -> Self:
return cls()
def peek(self) -> Optional[Any]:
def peek(self) -> Any | None:
"""Returns the next object to be returned by :meth:`pop`,
but without removing it from the queue.

View File

@ -6,7 +6,7 @@ from __future__ import annotations
import logging
import pprint
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
from scrapy import Spider
@ -25,32 +25,32 @@ class StatsCollector:
self._stats: StatsT = {}
def get_value(
self, key: str, default: Any = None, spider: Optional[Spider] = None
self, key: str, default: Any = None, spider: Spider | None = None
) -> Any:
return self._stats.get(key, default)
def get_stats(self, spider: Optional[Spider] = None) -> StatsT:
def get_stats(self, spider: Spider | None = None) -> StatsT:
return self._stats
def set_value(self, key: str, value: Any, spider: Optional[Spider] = None) -> None:
def set_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
self._stats[key] = value
def set_stats(self, stats: StatsT, spider: Optional[Spider] = None) -> None:
def set_stats(self, stats: StatsT, spider: Spider | None = None) -> None:
self._stats = stats
def inc_value(
self, key: str, count: int = 1, start: int = 0, spider: Optional[Spider] = None
self, key: str, count: int = 1, start: int = 0, spider: Spider | None = None
) -> None:
d = self._stats
d[key] = d.setdefault(key, start) + count
def max_value(self, key: str, value: Any, spider: Optional[Spider] = None) -> None:
def max_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
self._stats[key] = max(self._stats.setdefault(key, value), value)
def min_value(self, key: str, value: Any, spider: Optional[Spider] = None) -> None:
def min_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
self._stats[key] = min(self._stats.setdefault(key, value), value)
def clear_stats(self, spider: Optional[Spider] = None) -> None:
def clear_stats(self, spider: Spider | None = None) -> None:
self._stats.clear()
def open_spider(self, spider: Spider) -> None:
@ -79,23 +79,23 @@ class MemoryStatsCollector(StatsCollector):
class DummyStatsCollector(StatsCollector):
def get_value(
self, key: str, default: Any = None, spider: Optional[Spider] = None
self, key: str, default: Any = None, spider: Spider | None = None
) -> Any:
return default
def set_value(self, key: str, value: Any, spider: Optional[Spider] = None) -> None:
def set_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
pass
def set_stats(self, stats: StatsT, spider: Optional[Spider] = None) -> None:
def set_stats(self, stats: StatsT, spider: Spider | None = None) -> None:
pass
def inc_value(
self, key: str, count: int = 1, start: int = 0, spider: Optional[Spider] = None
self, key: str, count: int = 1, start: int = 0, spider: Spider | None = None
) -> None:
pass
def max_value(self, key: str, value: Any, spider: Optional[Spider] = None) -> None:
def max_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
pass
def min_value(self, key: str, value: Any, spider: Optional[Spider] = None) -> None:
def min_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
pass

View File

@ -1,5 +1,7 @@
from __future__ import annotations
from collections.abc import AsyncGenerator, AsyncIterable, Iterable
from typing import TypeVar, Union
from typing import TypeVar
_T = TypeVar("_T")
@ -12,8 +14,8 @@ async def collect_asyncgen(result: AsyncIterable[_T]) -> list[_T]:
async def as_async_generator(
it: Union[Iterable[_T], AsyncIterable[_T]]
) -> AsyncGenerator[_T, None]:
it: Iterable[_T] | AsyncIterable[_T],
) -> AsyncGenerator[_T]:
"""Wraps an iterable (sync or async) into an async generator."""
if isinstance(it, AsyncIterable):
async for r in it:
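The return annotation here also drops the explicit send type: in current typing specs the second parameter of AsyncGenerator defaults to None, so AsyncGenerator[_T] and AsyncGenerator[_T, None] describe the same thing, and with the postponed-evaluation import nothing is executed on 3.9 either way. A tiny sketch assuming that default:

from __future__ import annotations

import asyncio
from collections.abc import AsyncGenerator


async def countdown(n: int) -> AsyncGenerator[int]:  # same as AsyncGenerator[int, None]
    while n:
        yield n
        n -= 1


async def main() -> None:
    print([value async for value in countdown(3)])  # [3, 2, 1]


asyncio.run(main())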

View File

@ -8,7 +8,7 @@ from collections.abc import Iterable
from configparser import ConfigParser
from operator import itemgetter
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Optional, Union, cast
from typing import TYPE_CHECKING, Any, Callable, cast
from scrapy.exceptions import ScrapyDeprecationWarning, UsageError
from scrapy.settings import BaseSettings
@ -33,7 +33,7 @@ def build_component_list(
"please update your settings"
)
def _map_keys(compdict: Mapping[Any, Any]) -> Union[BaseSettings, dict[Any, Any]]:
def _map_keys(compdict: Mapping[Any, Any]) -> BaseSettings | dict[Any, Any]:
if isinstance(compdict, BaseSettings):
compbs = BaseSettings()
for k, v in compdict.items():
@ -86,8 +86,8 @@ def arglist_to_dict(arglist: list[str]) -> dict[str, str]:
def closest_scrapy_cfg(
path: Union[str, os.PathLike] = ".",
prevpath: Optional[Union[str, os.PathLike]] = None,
path: str | os.PathLike = ".",
prevpath: str | os.PathLike | None = None,
) -> str:
"""Return the path to the closest scrapy.cfg file by traversing the current
directory and its parents
@ -159,8 +159,8 @@ def feed_complete_default_values_from_settings(
def feed_process_params_from_cli(
settings: BaseSettings,
output: list[str],
output_format: Optional[str] = None,
overwrite_output: Optional[list[str]] = None,
output_format: str | None = None,
overwrite_output: list[str] | None = None,
) -> dict[str, dict[str, Any]]:
"""
Receives feed export params (from the 'crawl' or 'runspider' commands),

View File

@ -2,7 +2,7 @@ from __future__ import annotations
from collections.abc import Callable
from functools import wraps
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
from collections.abc import Iterable
@ -100,7 +100,7 @@ DEFAULT_PYTHON_SHELLS: KnownShellsT = {
def get_shell_embed_func(
shells: Optional[Iterable[str]] = None, known_shells: Optional[KnownShellsT] = None
shells: Iterable[str] | None = None, known_shells: KnownShellsT | None = None
) -> Any:
"""Return the first acceptable shell-embed function
from a given list of shell names.
@ -120,9 +120,9 @@ def get_shell_embed_func(
def start_python_console(
namespace: Optional[dict[str, Any]] = None,
namespace: dict[str, Any] | None = None,
banner: str = "",
shells: Optional[Iterable[str]] = None,
shells: Iterable[str] | None = None,
) -> None:
"""Start Python console bound to the given namespace.
Readline support and tab completion will be used on Unix, if available.

View File

@ -4,7 +4,7 @@ import argparse
import warnings
from http.cookies import SimpleCookie
from shlex import split
from typing import TYPE_CHECKING, Any, NoReturn, Optional, Union
from typing import TYPE_CHECKING, Any, NoReturn
from urllib.parse import urlparse
from w3lib.http import basic_auth_header
@ -18,8 +18,8 @@ class DataAction(argparse.Action):
self,
parser: argparse.ArgumentParser,
namespace: argparse.Namespace,
values: Union[str, Sequence[Any], None],
option_string: Optional[str] = None,
values: str | Sequence[Any] | None,
option_string: str | None = None,
) -> None:
value = str(values)
if value.startswith("$"):

View File

@ -12,7 +12,7 @@ import warnings
import weakref
from collections import OrderedDict
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, AnyStr, Optional, TypeVar, Union
from typing import TYPE_CHECKING, Any, AnyStr, TypeVar
from scrapy.exceptions import ScrapyDeprecationWarning
@ -44,7 +44,7 @@ class CaselessDict(dict):
def __init__(
self,
seq: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
seq: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]] | None = None,
):
super().__init__()
if seq:
@ -84,7 +84,7 @@ class CaselessDict(dict):
return dict.setdefault(self, self.normkey(key), self.normvalue(def_val)) # type: ignore[arg-type]
# doesn't fully implement MutableMapping.update()
def update(self, seq: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]]]) -> None: # type: ignore[override]
def update(self, seq: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]]) -> None: # type: ignore[override]
seq = seq.items() if isinstance(seq, Mapping) else seq
iseq = ((self.normkey(k), self.normvalue(v)) for k, v in seq)
super().update(iseq)
@ -145,9 +145,9 @@ class LocalCache(OrderedDict[_KT, _VT]):
Older items expire first.
"""
def __init__(self, limit: Optional[int] = None):
def __init__(self, limit: int | None = None):
super().__init__()
self.limit: Optional[int] = limit
self.limit: int | None = limit
def __setitem__(self, key: _KT, value: _VT) -> None:
if self.limit:
@ -168,7 +168,7 @@ class LocalWeakReferencedCache(weakref.WeakKeyDictionary):
it cannot be instantiated with an initial dictionary.
"""
def __init__(self, limit: Optional[int] = None):
def __init__(self, limit: int | None = None):
super().__init__()
self.data: LocalCache = LocalCache(limit=limit)
@ -178,7 +178,7 @@ class LocalWeakReferencedCache(weakref.WeakKeyDictionary):
except TypeError:
pass # key is not weak-referenceable, skip caching
def __getitem__(self, key: _KT) -> Optional[_VT]: # type: ignore[override]
def __getitem__(self, key: _KT) -> _VT | None: # type: ignore[override]
try:
return super().__getitem__(key)
except (TypeError, KeyError):

View File
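A quick illustration of the bounded LocalCache shown above, assuming the documented behaviour that the oldest entries are evicted once the limit is reached:

from scrapy.utils.datatypes import LocalCache

cache = LocalCache(limit=2)
cache["a"] = 1
cache["b"] = 2
cache["c"] = 3  # exceeds the limit, so the oldest key ("a") is dropped

print(list(cache))     # -> ['b', 'c']
print(cache.get("a"))  # -> None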

@ -11,7 +11,7 @@ from asyncio import Future
from collections.abc import Awaitable, Coroutine, Iterable, Iterator
from functools import wraps
from types import CoroutineType
from typing import TYPE_CHECKING, Any, Generic, Optional, TypeVar, Union, cast, overload
from typing import TYPE_CHECKING, Any, Generic, TypeVar, Union, cast, overload
from twisted.internet import defer
from twisted.internet.defer import Deferred, DeferredList, ensureDeferred
@ -93,7 +93,7 @@ def mustbe_deferred(
def mustbe_deferred(
f: Callable[_P, Union[Deferred[_T], Coroutine[Deferred[Any], Any, _T], _T]],
f: Callable[_P, Deferred[_T] | Coroutine[Deferred[Any], Any, _T] | _T],
*args: _P.args,
**kw: _P.kwargs,
) -> Deferred[_T]:
@ -179,17 +179,17 @@ class _AsyncCooperatorAdapter(Iterator, Generic[_T]):
def __init__(
self,
aiterable: AsyncIterable[_T],
callable: Callable[Concatenate[_T, _P], Optional[Deferred[Any]]],
callable: Callable[Concatenate[_T, _P], Deferred[Any] | None],
*callable_args: _P.args,
**callable_kwargs: _P.kwargs,
):
self.aiterator: AsyncIterator[_T] = aiterable.__aiter__()
self.callable: Callable[Concatenate[_T, _P], Optional[Deferred[Any]]] = callable
self.callable: Callable[Concatenate[_T, _P], Deferred[Any] | None] = callable
self.callable_args: tuple[Any, ...] = callable_args
self.callable_kwargs: dict[str, Any] = callable_kwargs
self.finished: bool = False
self.waiting_deferreds: list[Deferred[Any]] = []
self.anext_deferred: Optional[Deferred[_T]] = None
self.anext_deferred: Deferred[_T] | None = None
def _callback(self, result: _T) -> None:
# This gets called when the result from aiterator.__anext__() is available.
@ -237,7 +237,7 @@ class _AsyncCooperatorAdapter(Iterator, Generic[_T]):
def parallel_async(
async_iterable: AsyncIterable[_T],
count: int,
callable: Callable[Concatenate[_T, _P], Optional[Deferred[Any]]],
callable: Callable[Concatenate[_T, _P], Deferred[Any] | None],
*args: _P.args,
**named: _P.kwargs,
) -> Deferred[list[tuple[bool, Iterator[Deferred[Any]]]]]:
@ -362,7 +362,7 @@ def deferred_from_coro(o: _CT) -> Deferred: ...
def deferred_from_coro(o: _T) -> _T: ...
def deferred_from_coro(o: _T) -> Union[Deferred, _T]:
def deferred_from_coro(o: _T) -> Deferred | _T:
"""Converts a coroutine into a Deferred, or returns the object as is if it isn't a coroutine"""
if isinstance(o, Deferred):
return o
@ -433,7 +433,7 @@ def deferred_to_future(d: Deferred[_T]) -> Future[_T]:
return d.asFuture(_get_asyncio_event_loop())
def maybe_deferred_to_future(d: Deferred[_T]) -> Union[Deferred[_T], Future[_T]]:
def maybe_deferred_to_future(d: Deferred[_T]) -> Deferred[_T] | Future[_T]:
"""
.. versionadded:: 2.6.0
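A hedged usage sketch for maybe_deferred_to_future inside a coroutine callback, loosely based on the documented pattern for awaiting extra requests; the engine.download() call and the URLs are illustrative assumptions:

from scrapy import Request, Spider
from scrapy.utils.defer import maybe_deferred_to_future


class ExampleSpider(Spider):
    name = "example"
    start_urls = ["https://example.com"]

    async def parse(self, response):
        # Awaiting works both with the default reactor (plain Deferred) and
        # with the asyncio reactor (Deferred wrapped into an asyncio Future).
        extra_request = Request("https://example.com/other")
        extra_response = await maybe_deferred_to_future(
            self.crawler.engine.download(extra_request)
        )
        yield {"status": extra_response.status}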

View File

@ -1,8 +1,10 @@
"""Some helpers for deprecation messages"""
from __future__ import annotations
import inspect
import warnings
from typing import Any, Optional, overload
from typing import Any, overload
from scrapy.exceptions import ScrapyDeprecationWarning
@ -20,11 +22,11 @@ def attribute(obj: Any, oldattr: str, newattr: str, version: str = "0.12") -> No
def create_deprecated_class(
name: str,
new_class: type,
clsdict: Optional[dict[str, Any]] = None,
clsdict: dict[str, Any] | None = None,
warn_category: type[Warning] = ScrapyDeprecationWarning,
warn_once: bool = True,
old_class_path: Optional[str] = None,
new_class_path: Optional[str] = None,
old_class_path: str | None = None,
new_class_path: str | None = None,
subclass_warn_message: str = "{cls} inherits from deprecated class {old}, please inherit from {new}.",
instance_warn_message: str = "{cls} is deprecated, instantiate {new} instead.",
) -> type:
@ -55,7 +57,7 @@ def create_deprecated_class(
# https://github.com/python/mypy/issues/4177
class DeprecatedClass(new_class.__class__): # type: ignore[misc, name-defined]
deprecated_class: Optional[type] = None
deprecated_class: type | None = None
warned_on_subclass: bool = False
def __new__(
@ -128,7 +130,7 @@ def create_deprecated_class(
return deprecated_cls
def _clspath(cls: type, forced: Optional[str] = None) -> str:
def _clspath(cls: type, forced: str | None = None) -> str:
if forced is not None:
return forced
return f"{cls.__module__}.{cls.__name__}"
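A minimal example of create_deprecated_class based on the signature above: the returned class behaves like the new one but warns when instantiated or subclassed. The class names here are made up:

import warnings

from scrapy.utils.deprecate import create_deprecated_class


class NewParser:
    pass


OldParser = create_deprecated_class("OldParser", NewParser)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    OldParser()  # instantiating the deprecated alias emits a warning

print(caught[0].category.__name__)  # -> ScrapyDeprecationWarning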

View File

@ -2,7 +2,7 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Union
from typing import TYPE_CHECKING
from urllib.parse import ParseResult, urlparse
from weakref import WeakKeyDictionary
@ -10,12 +10,12 @@ if TYPE_CHECKING:
from scrapy.http import Request, Response
_urlparse_cache: WeakKeyDictionary[Union[Request, Response], ParseResult] = (
_urlparse_cache: WeakKeyDictionary[Request | Response, ParseResult] = (
WeakKeyDictionary()
)
def urlparse_cached(request_or_response: Union[Request, Response]) -> ParseResult:
def urlparse_cached(request_or_response: Request | Response) -> ParseResult:
"""Return urlparse.urlparse caching the result, where the argument can be a
Request or Response object
"""

View File
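A short illustration of the cached URL parsing above: repeated calls for the same request return the very same ParseResult, because results are stored in a weak-keyed cache:

from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached

request = Request("https://example.com/path?page=2")

first = urlparse_cached(request)
second = urlparse_cached(request)

print(first.hostname, first.path, first.query)  # -> example.com /path page=2
print(first is second)                          # -> True (served from the cache)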

@ -4,7 +4,7 @@ import csv
import logging
import re
from io import StringIO
from typing import TYPE_CHECKING, Any, Literal, Optional, Union, cast, overload
from typing import TYPE_CHECKING, Any, Literal, cast, overload
from warnings import warn
from lxml import etree # nosec
@ -20,7 +20,7 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
def xmliter(obj: Union[Response, str, bytes], nodename: str) -> Iterator[Selector]:
def xmliter(obj: Response | str | bytes, nodename: str) -> Iterator[Selector]:
"""Return a iterator of Selector's over all nodes of a XML document,
given the name of the node to iterate. Useful for parsing XML feeds.
@ -77,9 +77,9 @@ def xmliter(obj: Union[Response, str, bytes], nodename: str) -> Iterator[Selecto
def xmliter_lxml(
obj: Union[Response, str, bytes],
obj: Response | str | bytes,
nodename: str,
namespace: Optional[str] = None,
namespace: str | None = None,
prefix: str = "x",
) -> Iterator[Selector]:
reader = _StreamReader(obj)
@ -120,9 +120,9 @@ def xmliter_lxml(
class _StreamReader:
def __init__(self, obj: Union[Response, str, bytes]):
def __init__(self, obj: Response | str | bytes):
self._ptr: int = 0
self._text: Union[str, bytes]
self._text: str | bytes
if isinstance(obj, TextResponse):
self._text, self.encoding = obj.body, obj.encoding
elif isinstance(obj, Response):
@ -154,11 +154,11 @@ class _StreamReader:
def csviter(
obj: Union[Response, str, bytes],
delimiter: Optional[str] = None,
headers: Optional[list[str]] = None,
encoding: Optional[str] = None,
quotechar: Optional[str] = None,
obj: Response | str | bytes,
delimiter: str | None = None,
headers: list[str] | None = None,
encoding: str | None = None,
quotechar: str | None = None,
) -> Iterator[dict[str, str]]:
"""Returns an iterator of dictionaries from the given csv object
@ -214,22 +214,18 @@ def csviter(
@overload
def _body_or_str(obj: Union[Response, str, bytes]) -> str: ...
def _body_or_str(obj: Response | str | bytes) -> str: ...
@overload
def _body_or_str(obj: Union[Response, str, bytes], unicode: Literal[True]) -> str: ...
def _body_or_str(obj: Response | str | bytes, unicode: Literal[True]) -> str: ...
@overload
def _body_or_str(
obj: Union[Response, str, bytes], unicode: Literal[False]
) -> bytes: ...
def _body_or_str(obj: Response | str | bytes, unicode: Literal[False]) -> bytes: ...
def _body_or_str(
obj: Union[Response, str, bytes], unicode: bool = True
) -> Union[str, bytes]:
def _body_or_str(obj: Response | str | bytes, unicode: bool = True) -> str | bytes:
expected_types = (Response, str, bytes)
if not isinstance(obj, expected_types):
expected_types_str = " or ".join(t.__name__ for t in expected_types)
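Usage sketches for the iterator helpers above; both accept a Response, str or bytes per the unions in this hunk. The sample XML and CSV payloads are made up:

from scrapy.utils.iterators import csviter, xmliter_lxml

xml = b"<rss><item><title>first</title></item><item><title>second</title></item></rss>"
for node in xmliter_lxml(xml, "item"):
    print(node.xpath("./title/text()").get())  # -> first, then second

csv_data = "name,price\nfoo,10\nbar,20\n"
for row in csviter(csv_data):
    print(row)  # -> {'name': 'foo', 'price': '10'}, then {'name': 'bar', 'price': '20'}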

View File

@ -1,14 +1,14 @@
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from scrapy.settings import BaseSettings
def job_dir(settings: BaseSettings) -> Optional[str]:
path: Optional[str] = settings["JOBDIR"]
def job_dir(settings: BaseSettings) -> str | None:
path: str | None = settings["JOBDIR"]
if not path:
return None
if not Path(path).exists():

View File

@ -5,7 +5,7 @@ import sys
from collections.abc import MutableMapping
from logging.config import dictConfig
from types import TracebackType
from typing import TYPE_CHECKING, Any, Optional, Union, cast
from typing import TYPE_CHECKING, Any, Optional, cast
from twisted.python import log as twisted_log
from twisted.python.failure import Failure
@ -25,7 +25,7 @@ logger = logging.getLogger(__name__)
def failure_to_exc_info(
failure: Failure,
) -> Optional[tuple[type[BaseException], BaseException, Optional[TracebackType]]]:
) -> tuple[type[BaseException], BaseException, TracebackType | None] | None:
"""Extract exc_info from Failure instances"""
if isinstance(failure, Failure):
assert failure.type
@ -50,7 +50,7 @@ class TopLevelFormatter(logging.Filter):
``loggers`` list where it should act.
"""
def __init__(self, loggers: Optional[list[str]] = None):
def __init__(self, loggers: list[str] | None = None):
self.loggers: list[str] = loggers or []
def filter(self, record: logging.LogRecord) -> bool:
@ -80,7 +80,7 @@ DEFAULT_LOGGING = {
def configure_logging(
settings: Union[Settings, dict[_SettingsKeyT, Any], None] = None,
settings: Settings | dict[_SettingsKeyT, Any] | None = None,
install_root_handler: bool = True,
) -> None:
"""
@ -125,7 +125,7 @@ def configure_logging(
install_scrapy_root_handler(settings)
_scrapy_root_handler: Optional[logging.Handler] = None
_scrapy_root_handler: logging.Handler | None = None
def install_scrapy_root_handler(settings: Settings) -> None:
@ -141,7 +141,7 @@ def install_scrapy_root_handler(settings: Settings) -> None:
logging.root.addHandler(_scrapy_root_handler)
def get_scrapy_root_handler() -> Optional[logging.Handler]:
def get_scrapy_root_handler() -> logging.Handler | None:
return _scrapy_root_handler
@ -231,7 +231,7 @@ class LogCounterHandler(logging.Handler):
def logformatter_adapter(
logkws: LogFormatterResult,
) -> tuple[int, str, Union[dict[str, Any], tuple[Any, ...]]]:
) -> tuple[int, str, dict[str, Any] | tuple[Any, ...]]:
"""
Helper that takes the dictionary output from the methods in LogFormatter
and adapts it into a tuple of positional arguments for logger.log calls,

View File
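A usage sketch combining the two logging helpers above; the dict-based settings and the sample error are illustrative:

import logging

from twisted.python.failure import Failure

from scrapy.utils.log import configure_logging, failure_to_exc_info

# configure_logging() accepts a Settings object, a plain dict, or None.
configure_logging(settings={"LOG_LEVEL": "INFO"})

try:
    1 / 0
except ZeroDivisionError:
    failure = Failure()  # captures the active exception

# failure_to_exc_info() turns the Failure into a standard exc_info tuple.
logging.getLogger(__name__).error(
    "something failed", exc_info=failure_to_exc_info(failure)
)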

@ -14,7 +14,7 @@ from contextlib import contextmanager
from functools import partial
from importlib import import_module
from pkgutil import iter_modules
from typing import IO, TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
from typing import IO, TYPE_CHECKING, Any, TypeVar, cast
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.item import Item
@ -46,7 +46,7 @@ def arg_to_iter(arg: Any) -> Iterable[Any]:
return [arg]
def load_object(path: Union[str, Callable[..., Any]]) -> Any:
def load_object(path: str | Callable[..., Any]) -> Any:
"""Load an object given its absolute object path, and return it.
The object can be the import path of a class, function, variable or an
@ -126,7 +126,7 @@ def md5sum(file: IO[bytes]) -> str:
return m.hexdigest()
def rel_has_nofollow(rel: Optional[str]) -> bool:
def rel_has_nofollow(rel: str | None) -> bool:
"""Return True if link rel attribute has nofollow type"""
return rel is not None and "nofollow" in rel.replace(",", " ").split()
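Two quick examples for the helpers above:

from scrapy.utils.misc import load_object, rel_has_nofollow

# Resolve a dotted import path to the object it names.
OrderedDict = load_object("collections.OrderedDict")
print(OrderedDict)  # -> <class 'collections.OrderedDict'>

print(rel_has_nofollow("noopener,nofollow"))  # -> True (commas count as separators)
print(rel_has_nofollow(None))                 # -> False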

View File

@ -1,3 +1,5 @@
from __future__ import annotations
import signal
from collections.abc import Callable
from types import FrameType
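The from __future__ import annotations line added here (and across the other modules in this commit) is what makes dropping --keep-runtime-typing safe on Python 3.9: annotations become lazily evaluated strings, so the PEP 604 X | None spelling never has to be evaluated at runtime. A minimal demonstration:

from __future__ import annotations

import signal
from types import FrameType


def handler(signum: int, frame: FrameType | None = None) -> None:
    # Without the future import, evaluating FrameType | None eagerly would
    # raise TypeError on Python 3.9; as a lazy (string) annotation it is fine.
    print(f"received signal {signum}")


print(handler.__annotations__["frame"])  # -> 'FrameType | None'
signal.signal(signal.SIGINT, handler)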

View File

@ -1,8 +1,9 @@
from __future__ import annotations
import os
import warnings
from importlib import import_module
from pathlib import Path
from typing import Union
from scrapy.exceptions import NotConfigured
from scrapy.settings import Settings
@ -45,7 +46,7 @@ def project_data_dir(project: str = "default") -> str:
return str(d)
def data_path(path: Union[str, os.PathLike[str]], createdir: bool = False) -> str:
def data_path(path: str | os.PathLike[str], createdir: bool = False) -> str:
"""
Return the given path joined with the .scrapy data directory.
If given an absolute path, return it unmodified.
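A small sketch of data_path based on the docstring above; the exact prefix depends on whether the code runs inside a Scrapy project:

from scrapy.utils.project import data_path

# Relative paths are joined with the .scrapy data directory...
print(data_path("httpcache"))       # e.g. '.scrapy/httpcache' outside a project

# ...while absolute paths are returned unmodified.
print(data_path("/tmp/httpcache"))  # -> '/tmp/httpcache'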

View File

@ -12,7 +12,7 @@ import weakref
from collections.abc import AsyncIterable, Iterable, Mapping
from functools import partial, wraps
from itertools import chain
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, overload
from typing import TYPE_CHECKING, Any, TypeVar, overload
from scrapy.utils.asyncgen import as_async_generator
@ -99,7 +99,7 @@ def unique(list_: Iterable[_T], key: Callable[[_T], Any] = lambda x: x) -> list[
def to_unicode(
text: Union[str, bytes], encoding: Optional[str] = None, errors: str = "strict"
text: str | bytes, encoding: str | None = None, errors: str = "strict"
) -> str:
"""Return the unicode representation of a bytes object ``text``. If
``text`` is already a unicode object, return it as-is."""
@ -116,7 +116,7 @@ def to_unicode(
def to_bytes(
text: Union[str, bytes], encoding: Optional[str] = None, errors: str = "strict"
text: str | bytes, encoding: str | None = None, errors: str = "strict"
) -> bytes:
"""Return the binary representation of ``text``. If ``text``
is already a bytes object, return it as-is."""
@ -132,8 +132,8 @@ def to_bytes(
def re_rsearch(
pattern: Union[str, Pattern[str]], text: str, chunk_size: int = 1024
) -> Optional[tuple[int, int]]:
pattern: str | Pattern[str], text: str, chunk_size: int = 1024
) -> tuple[int, int] | None:
"""
This function does a reverse search in a text using a regular expression
given in the 'pattern' argument.
@ -269,7 +269,7 @@ def get_spec(func: Callable[..., Any]) -> tuple[list[str], dict[str, Any]]:
def equal_attributes(
obj1: Any, obj2: Any, attributes: Optional[list[Union[str, Callable[[Any], Any]]]]
obj1: Any, obj2: Any, attributes: list[str | Callable[[Any], Any]] | None
) -> bool:
"""Compare two objects attributes"""
# not attributes given return False by default
@ -297,8 +297,8 @@ def without_none_values(iterable: Iterable[_KT]) -> Iterable[_KT]: ...
def without_none_values(
iterable: Union[Mapping[_KT, _VT], Iterable[_KT]]
) -> Union[dict[_KT, _VT], Iterable[_KT]]:
iterable: Mapping[_KT, _VT] | Iterable[_KT]
) -> dict[_KT, _VT] | Iterable[_KT]:
"""Return a copy of ``iterable`` with all ``None`` entries removed.
If ``iterable`` is a mapping, return a dictionary where all pairs that have
@ -354,7 +354,7 @@ class MutableChain(Iterable[_T]):
async def _async_chain(
*iterables: Union[Iterable[_T], AsyncIterable[_T]]
*iterables: Iterable[_T] | AsyncIterable[_T],
) -> AsyncIterator[_T]:
for it in iterables:
async for o in as_async_generator(it):
@ -366,10 +366,10 @@ class MutableAsyncChain(AsyncIterable[_T]):
Similar to MutableChain but for async iterables
"""
def __init__(self, *args: Union[Iterable[_T], AsyncIterable[_T]]):
def __init__(self, *args: Iterable[_T] | AsyncIterable[_T]):
self.data: AsyncIterator[_T] = _async_chain(*args)
def extend(self, *iterables: Union[Iterable[_T], AsyncIterable[_T]]) -> None:
def extend(self, *iterables: Iterable[_T] | AsyncIterable[_T]) -> None:
self.data = _async_chain(self.data, _async_chain(*iterables))
def __aiter__(self) -> AsyncIterator[_T]:

View File

@ -3,7 +3,7 @@ from __future__ import annotations
import asyncio
import sys
from contextlib import suppress
from typing import TYPE_CHECKING, Any, Generic, Optional, TypeVar
from typing import TYPE_CHECKING, Any, Generic, TypeVar
from warnings import catch_warnings, filterwarnings, warn
from twisted.internet import asyncioreactor, error
@ -54,7 +54,7 @@ class CallLaterOnce(Generic[_T]):
self._func: Callable[_P, _T] = func
self._a: tuple[Any, ...] = a
self._kw: dict[str, Any] = kw
self._call: Optional[DelayedCall] = None
self._call: DelayedCall | None = None
def schedule(self, delay: float = 0) -> None:
from twisted.internet import reactor
@ -107,7 +107,7 @@ def _get_asyncio_event_loop_policy() -> AbstractEventLoopPolicy:
return policy
def install_reactor(reactor_path: str, event_loop_path: Optional[str] = None) -> None:
def install_reactor(reactor_path: str, event_loop_path: str | None = None) -> None:
"""Installs the :mod:`~twisted.internet.reactor` with the specified
import path. Also installs the asyncio event loop with the specified import
path if the asyncio reactor is enabled"""
@ -129,7 +129,7 @@ def _get_asyncio_event_loop() -> AbstractEventLoop:
return set_asyncio_event_loop(None)
def set_asyncio_event_loop(event_loop_path: Optional[str]) -> AbstractEventLoop:
def set_asyncio_event_loop(event_loop_path: str | None) -> AbstractEventLoop:
"""Sets and returns the event loop with specified import path."""
if event_loop_path is not None:
event_loop_class: type[AbstractEventLoop] = load_object(event_loop_path)
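A usage sketch for install_reactor; it has to run before anything imports twisted.internet.reactor, which is why Scrapy calls it early during startup. The check afterwards assumes the is_asyncio_reactor_installed() helper from the same module:

from scrapy.utils.reactor import install_reactor, is_asyncio_reactor_installed

# Install the asyncio-based Twisted reactor together with the default
# asyncio event loop.
install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
print(is_asyncio_reactor_installed())  # -> True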

View File

@ -8,7 +8,7 @@ from __future__ import annotations
import hashlib
import json
import warnings
from typing import TYPE_CHECKING, Any, Optional, Protocol, Union
from typing import TYPE_CHECKING, Any, Protocol
from urllib.parse import urlunparse
from weakref import WeakKeyDictionary
@ -38,7 +38,7 @@ def _serialize_headers(headers: Iterable[bytes], request: Request) -> Iterable[b
_fingerprint_cache: WeakKeyDictionary[
Request, dict[tuple[Optional[tuple[bytes, ...]], bool], bytes]
Request, dict[tuple[tuple[bytes, ...] | None, bool], bytes]
]
_fingerprint_cache = WeakKeyDictionary()
@ -46,7 +46,7 @@ _fingerprint_cache = WeakKeyDictionary()
def fingerprint(
request: Request,
*,
include_headers: Optional[Iterable[Union[bytes, str]]] = None,
include_headers: Iterable[bytes | str] | None = None,
keep_fragments: bool = False,
) -> bytes:
"""
@ -79,7 +79,7 @@ def fingerprint(
If you want to include them, set the keep_fragments argument to True
(for instance when handling requests with a headless browser).
"""
processed_include_headers: Optional[tuple[bytes, ...]] = None
processed_include_headers: tuple[bytes, ...] | None = None
if include_headers:
processed_include_headers = tuple(
to_bytes(h.lower()) for h in sorted(include_headers)
@ -129,7 +129,7 @@ class RequestFingerprinter:
def from_crawler(cls, crawler: Crawler) -> Self:
return cls(crawler)
def __init__(self, crawler: Optional[Crawler] = None):
def __init__(self, crawler: Crawler | None = None):
if crawler:
implementation = crawler.settings.get(
"REQUEST_FINGERPRINTER_IMPLEMENTATION"
@ -177,7 +177,7 @@ def request_httprepr(request: Request) -> bytes:
return s
def referer_str(request: Request) -> Optional[str]:
def referer_str(request: Request) -> str | None:
"""Return Referer HTTP header suitable for logging."""
referrer = request.headers.get("Referer")
if referrer is None:
@ -185,7 +185,7 @@ def referer_str(request: Request) -> Optional[str]:
return to_unicode(referrer, errors="replace")
def request_from_dict(d: dict[str, Any], *, spider: Optional[Spider] = None) -> Request:
def request_from_dict(d: dict[str, Any], *, spider: Spider | None = None) -> Request:
"""Create a :class:`~scrapy.Request` object from a dict.
If a spider is given, it will try to resolve the callbacks looking at the

View File

@ -9,7 +9,7 @@ import os
import re
import tempfile
import webbrowser
from typing import TYPE_CHECKING, Any, Union
from typing import TYPE_CHECKING, Any
from weakref import WeakKeyDictionary
from twisted.web import http
@ -35,15 +35,15 @@ def get_base_url(response: TextResponse) -> str:
return _baseurl_cache[response]
_metaref_cache: WeakKeyDictionary[
Response, Union[tuple[None, None], tuple[float, str]]
] = WeakKeyDictionary()
_metaref_cache: WeakKeyDictionary[Response, tuple[None, None] | tuple[float, str]] = (
WeakKeyDictionary()
)
def get_meta_refresh(
response: TextResponse,
ignore_tags: Iterable[str] = ("script", "noscript"),
) -> Union[tuple[None, None], tuple[float, str]]:
) -> tuple[None, None] | tuple[float, str]:
"""Parse the http-equiv refresh parameter from the given response"""
if response not in _metaref_cache:
text = response.text[0:4096]
@ -53,7 +53,7 @@ def get_meta_refresh(
return _metaref_cache[response]
def response_status_message(status: Union[bytes, float, int, str]) -> str:
def response_status_message(status: bytes | float | int | str) -> str:
"""Return status code plus status text descriptive message"""
status_int = int(status)
message = http.RESPONSES.get(status_int, "Unknown Status")
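Usage examples for the response helpers above; the sample HTML body is made up:

from scrapy.http import HtmlResponse
from scrapy.utils.response import get_meta_refresh, response_status_message

print(response_status_message(404))  # -> 404 Not Found

body = b'<html><head><meta http-equiv="refresh" content="5; url=/next"></head></html>'
response = HtmlResponse("https://example.com/", body=body)
print(get_meta_refresh(response))    # -> (5.0, 'https://example.com/next')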

Some files were not shown because too many files have changed in this diff.