mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-06 11:00:46 +00:00
Remove --keep-runtime-typing from pyupgrade.
This commit is contained in:
parent
c8e87ab21a
commit
c9095ef927
@ -30,7 +30,7 @@ repos:
|
||||
additional_dependencies:
|
||||
- black==24.4.2
|
||||
- repo: https://github.com/asottile/pyupgrade
|
||||
rev: v3.16.0
|
||||
rev: v3.18.0
|
||||
hooks:
|
||||
- id: pyupgrade
|
||||
args: [--py39-plus, --keep-runtime-typing]
|
||||
args: [--py39-plus]
|
||||
|
@ -6,7 +6,7 @@ import inspect
|
||||
import os
|
||||
import sys
|
||||
from importlib.metadata import entry_points
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import scrapy
|
||||
from scrapy.commands import BaseRunSpiderCommand, ScrapyCommand, ScrapyHelpFormatter
|
||||
@ -30,7 +30,7 @@ if TYPE_CHECKING:
|
||||
class ScrapyArgumentParser(argparse.ArgumentParser):
|
||||
def _parse_optional(
|
||||
self, arg_string: str
|
||||
) -> Optional[tuple[Optional[argparse.Action], str, Optional[str]]]:
|
||||
) -> tuple[argparse.Action | None, str, str | None] | None:
|
||||
# if starts with -: it means that is a parameter not a argument
|
||||
if arg_string[:2] == "-:":
|
||||
return None
|
||||
@ -89,7 +89,7 @@ def _get_commands_dict(
|
||||
return cmds
|
||||
|
||||
|
||||
def _pop_command_name(argv: list[str]) -> Optional[str]:
|
||||
def _pop_command_name(argv: list[str]) -> str | None:
|
||||
i = 0
|
||||
for arg in argv[1:]:
|
||||
if not arg.startswith("-"):
|
||||
@ -147,9 +147,7 @@ def _run_print_help(
|
||||
sys.exit(2)
|
||||
|
||||
|
||||
def execute(
|
||||
argv: Optional[list[str]] = None, settings: Optional[Settings] = None
|
||||
) -> None:
|
||||
def execute(argv: list[str] | None = None, settings: Settings | None = None) -> None:
|
||||
if argv is None:
|
||||
argv = sys.argv
|
||||
|
||||
|
@ -8,7 +8,7 @@ import argparse
|
||||
import builtins
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from twisted.python import failure
|
||||
|
||||
@ -23,7 +23,7 @@ if TYPE_CHECKING:
|
||||
|
||||
class ScrapyCommand:
|
||||
requires_project: bool = False
|
||||
crawler_process: Optional[CrawlerProcess] = None
|
||||
crawler_process: CrawlerProcess | None = None
|
||||
|
||||
# default settings to be used for this command instead of global defaults
|
||||
default_settings: dict[str, Any] = {}
|
||||
@ -195,7 +195,7 @@ class ScrapyHelpFormatter(argparse.HelpFormatter):
|
||||
prog: str,
|
||||
indent_increment: int = 2,
|
||||
max_help_position: int = 24,
|
||||
width: Optional[int] = None,
|
||||
width: int | None = None,
|
||||
):
|
||||
super().__init__(
|
||||
prog,
|
||||
|
@ -1,10 +1,12 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import shutil
|
||||
import string
|
||||
from importlib import import_module
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional, Union, cast
|
||||
from typing import Any, cast
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import scrapy
|
||||
@ -140,7 +142,7 @@ class Command(ScrapyCommand):
|
||||
name: str,
|
||||
url: str,
|
||||
template_name: str,
|
||||
template_file: Union[str, os.PathLike],
|
||||
template_file: str | os.PathLike,
|
||||
) -> None:
|
||||
"""Generate the spider module, based on the given template"""
|
||||
tvars = self._generate_template_variables(module, name, url, template_name)
|
||||
@ -161,7 +163,7 @@ class Command(ScrapyCommand):
|
||||
if spiders_module:
|
||||
print(f"in module:\n {spiders_module.__name__}.{module}")
|
||||
|
||||
def _find_template(self, template: str) -> Optional[Path]:
|
||||
def _find_template(self, template: str) -> Path | None:
|
||||
template_file = Path(self.templates_dir, f"{template}.tmpl")
|
||||
if template_file.exists():
|
||||
return template_file
|
||||
|
@ -5,7 +5,7 @@ import functools
|
||||
import inspect
|
||||
import json
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, overload
|
||||
from typing import TYPE_CHECKING, Any, TypeVar, overload
|
||||
|
||||
from itemadapter import ItemAdapter, is_item
|
||||
from twisted.internet.defer import Deferred, maybeDeferred
|
||||
@ -38,10 +38,10 @@ _T = TypeVar("_T")
|
||||
class Command(BaseRunSpiderCommand):
|
||||
requires_project = True
|
||||
|
||||
spider: Optional[Spider] = None
|
||||
spider: Spider | None = None
|
||||
items: dict[int, list[Any]] = {}
|
||||
requests: dict[int, list[Request]] = {}
|
||||
spidercls: Optional[type[Spider]]
|
||||
spidercls: type[Spider] | None
|
||||
|
||||
first_response = None
|
||||
|
||||
@ -137,13 +137,13 @@ class Command(BaseRunSpiderCommand):
|
||||
|
||||
@overload
|
||||
def iterate_spider_output(
|
||||
self, result: Union[AsyncGenerator[_T, None], Coroutine[Any, Any, _T]]
|
||||
self, result: AsyncGenerator[_T] | Coroutine[Any, Any, _T]
|
||||
) -> Deferred[_T]: ...
|
||||
|
||||
@overload
|
||||
def iterate_spider_output(self, result: _T) -> Iterable[Any]: ...
|
||||
|
||||
def iterate_spider_output(self, result: Any) -> Union[Iterable[Any], Deferred[Any]]:
|
||||
def iterate_spider_output(self, result: Any) -> Iterable[Any] | Deferred[Any]:
|
||||
if inspect.isasyncgen(result):
|
||||
d = deferred_from_coro(
|
||||
collect_asyncgen(aiter_errback(result, self.handle_exception))
|
||||
@ -164,7 +164,7 @@ class Command(BaseRunSpiderCommand):
|
||||
old_reqs = self.requests.get(lvl, [])
|
||||
self.requests[lvl] = old_reqs + new_reqs
|
||||
|
||||
def print_items(self, lvl: Optional[int] = None, colour: bool = True) -> None:
|
||||
def print_items(self, lvl: int | None = None, colour: bool = True) -> None:
|
||||
if lvl is None:
|
||||
items = [item for lst in self.items.values() for item in lst]
|
||||
else:
|
||||
@ -173,7 +173,7 @@ class Command(BaseRunSpiderCommand):
|
||||
print("# Scraped Items ", "-" * 60)
|
||||
display.pprint([ItemAdapter(x).asdict() for x in items], colorize=colour)
|
||||
|
||||
def print_requests(self, lvl: Optional[int] = None, colour: bool = True) -> None:
|
||||
def print_requests(self, lvl: int | None = None, colour: bool = True) -> None:
|
||||
if lvl is None:
|
||||
if self.requests:
|
||||
requests = self.requests[max(self.requests)]
|
||||
@ -222,7 +222,7 @@ class Command(BaseRunSpiderCommand):
|
||||
self,
|
||||
response: Response,
|
||||
callback: CallbackT,
|
||||
cb_kwargs: Optional[dict[str, Any]] = None,
|
||||
cb_kwargs: dict[str, Any] | None = None,
|
||||
) -> Deferred[Any]:
|
||||
cb_kwargs = cb_kwargs or {}
|
||||
d = maybeDeferred(self.iterate_spider_output, callback(response, **cb_kwargs))
|
||||
@ -230,7 +230,7 @@ class Command(BaseRunSpiderCommand):
|
||||
|
||||
def get_callback_from_rules(
|
||||
self, spider: Spider, response: Response
|
||||
) -> Union[CallbackT, str, None]:
|
||||
) -> CallbackT | str | None:
|
||||
if getattr(spider, "rules", None):
|
||||
for rule in spider.rules: # type: ignore[attr-defined]
|
||||
if rule.link_extractor.matches(response.url):
|
||||
@ -303,9 +303,9 @@ class Command(BaseRunSpiderCommand):
|
||||
*,
|
||||
spider: Spider,
|
||||
opts: argparse.Namespace,
|
||||
response: Optional[Response] = None,
|
||||
response: Response | None = None,
|
||||
) -> CallbackT:
|
||||
cb: Union[str, CallbackT, None] = None
|
||||
cb: str | CallbackT | None = None
|
||||
if response:
|
||||
cb = response.meta["_callback"]
|
||||
if not cb:
|
||||
|
@ -4,7 +4,7 @@ import argparse
|
||||
import sys
|
||||
from importlib import import_module
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Union
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from scrapy.commands import BaseRunSpiderCommand
|
||||
from scrapy.exceptions import UsageError
|
||||
@ -15,7 +15,7 @@ if TYPE_CHECKING:
|
||||
from types import ModuleType
|
||||
|
||||
|
||||
def _import_file(filepath: Union[str, PathLike[str]]) -> ModuleType:
|
||||
def _import_file(filepath: str | PathLike[str]) -> ModuleType:
|
||||
abspath = Path(filepath).resolve()
|
||||
if abspath.suffix not in (".py", ".pyw"):
|
||||
raise ValueError(f"Not a Python source file: {abspath}")
|
||||
|
@ -1,3 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
@ -6,7 +8,6 @@ from importlib.util import find_spec
|
||||
from pathlib import Path
|
||||
from shutil import copy2, copystat, ignore_patterns, move
|
||||
from stat import S_IWUSR as OWNER_WRITE_PERMISSION
|
||||
from typing import Union
|
||||
|
||||
import scrapy
|
||||
from scrapy.commands import ScrapyCommand
|
||||
@ -24,7 +25,7 @@ TEMPLATES_TO_RENDER: tuple[tuple[str, ...], ...] = (
|
||||
IGNORE = ignore_patterns("*.pyc", "__pycache__", ".svn")
|
||||
|
||||
|
||||
def _make_writable(path: Union[str, os.PathLike]) -> None:
|
||||
def _make_writable(path: str | os.PathLike) -> None:
|
||||
current_permissions = os.stat(path).st_mode
|
||||
os.chmod(path, current_permissions | OWNER_WRITE_PERMISSION)
|
||||
|
||||
|
@ -6,7 +6,7 @@ from collections.abc import AsyncGenerator, Iterable
|
||||
from functools import wraps
|
||||
from inspect import getmembers
|
||||
from types import CoroutineType
|
||||
from typing import TYPE_CHECKING, Any, Optional, cast
|
||||
from typing import TYPE_CHECKING, Any, cast
|
||||
from unittest import TestCase, TestResult
|
||||
|
||||
from scrapy.http import Request, Response
|
||||
@ -24,7 +24,7 @@ if TYPE_CHECKING:
|
||||
class Contract:
|
||||
"""Abstract class for contracts"""
|
||||
|
||||
request_cls: Optional[type[Request]] = None
|
||||
request_cls: type[Request] | None = None
|
||||
name: str
|
||||
|
||||
def __init__(self, method: Callable, *args: Any):
|
||||
@ -126,10 +126,8 @@ class ContractsManager:
|
||||
|
||||
return contracts
|
||||
|
||||
def from_spider(
|
||||
self, spider: Spider, results: TestResult
|
||||
) -> list[Optional[Request]]:
|
||||
requests: list[Optional[Request]] = []
|
||||
def from_spider(self, spider: Spider, results: TestResult) -> list[Request | None]:
|
||||
requests: list[Request | None] = []
|
||||
for method in self.tested_methods_from_spidercls(type(spider)):
|
||||
bound_method = spider.__getattribute__(method)
|
||||
try:
|
||||
@ -140,7 +138,7 @@ class ContractsManager:
|
||||
|
||||
return requests
|
||||
|
||||
def from_method(self, method: Callable, results: TestResult) -> Optional[Request]:
|
||||
def from_method(self, method: Callable, results: TestResult) -> Request | None:
|
||||
contracts = self.extract_contracts(method)
|
||||
if contracts:
|
||||
request_cls = Request
|
||||
|
@ -1,5 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any, Callable, Optional
|
||||
from typing import Any, Callable
|
||||
|
||||
from itemadapter import ItemAdapter, is_item
|
||||
|
||||
@ -63,7 +65,7 @@ class ReturnsContract(Contract):
|
||||
"""
|
||||
|
||||
name = "returns"
|
||||
object_type_verifiers: dict[Optional[str], Callable[[Any], bool]] = {
|
||||
object_type_verifiers: dict[str | None, Callable[[Any], bool]] = {
|
||||
"request": lambda x: isinstance(x, Request),
|
||||
"requests": lambda x: isinstance(x, Request),
|
||||
"item": is_item,
|
||||
|
@ -5,7 +5,7 @@ import warnings
|
||||
from collections import deque
|
||||
from datetime import datetime
|
||||
from time import time
|
||||
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
|
||||
from typing import TYPE_CHECKING, Any, TypeVar, cast
|
||||
|
||||
from twisted.internet import task
|
||||
from twisted.internet.defer import Deferred
|
||||
@ -37,7 +37,7 @@ class Slot:
|
||||
delay: float,
|
||||
randomize_delay: bool,
|
||||
*,
|
||||
throttle: Optional[bool] = None,
|
||||
throttle: bool | None = None,
|
||||
):
|
||||
self.concurrency: int = concurrency
|
||||
self.delay: float = delay
|
||||
@ -119,15 +119,13 @@ class Downloader:
|
||||
"DOWNLOAD_SLOTS", {}
|
||||
)
|
||||
|
||||
def fetch(
|
||||
self, request: Request, spider: Spider
|
||||
) -> Deferred[Union[Response, Request]]:
|
||||
def fetch(self, request: Request, spider: Spider) -> Deferred[Response | Request]:
|
||||
def _deactivate(response: _T) -> _T:
|
||||
self.active.remove(request)
|
||||
return response
|
||||
|
||||
self.active.add(request)
|
||||
dfd: Deferred[Union[Response, Request]] = self.middleware.download(
|
||||
dfd: Deferred[Response | Request] = self.middleware.download(
|
||||
self._enqueue_request, request, spider
|
||||
)
|
||||
return dfd.addBoth(_deactivate)
|
||||
@ -164,7 +162,7 @@ class Downloader:
|
||||
|
||||
return key
|
||||
|
||||
def _get_slot_key(self, request: Request, spider: Optional[Spider]) -> str:
|
||||
def _get_slot_key(self, request: Request, spider: Spider | None) -> str:
|
||||
warnings.warn(
|
||||
"Use of this protected method is deprecated. Consider using its corresponding public method get_slot_key() instead.",
|
||||
ScrapyDeprecationWarning,
|
||||
|
@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import warnings
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from OpenSSL import SSL
|
||||
from twisted.internet._sslverify import _setAcceptableProtocols
|
||||
@ -49,7 +49,7 @@ class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
|
||||
self,
|
||||
method: int = SSL.SSLv23_METHOD,
|
||||
tls_verbose_logging: bool = False,
|
||||
tls_ciphers: Optional[str] = None,
|
||||
tls_ciphers: str | None = None,
|
||||
*args: Any,
|
||||
**kwargs: Any,
|
||||
):
|
||||
@ -73,7 +73,7 @@ class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
|
||||
tls_verbose_logging: bool = settings.getbool(
|
||||
"DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING"
|
||||
)
|
||||
tls_ciphers: Optional[str] = settings["DOWNLOADER_CLIENT_TLS_CIPHERS"]
|
||||
tls_ciphers: str | None = settings["DOWNLOADER_CLIENT_TLS_CIPHERS"]
|
||||
return cls( # type: ignore[misc]
|
||||
method=method,
|
||||
tls_verbose_logging=tls_verbose_logging,
|
||||
|
@ -4,7 +4,7 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from collections.abc import Callable
|
||||
from typing import TYPE_CHECKING, Any, Optional, Protocol, Union, cast
|
||||
from typing import TYPE_CHECKING, Any, Protocol, cast
|
||||
|
||||
from twisted.internet import defer
|
||||
|
||||
@ -35,16 +35,16 @@ class DownloadHandlerProtocol(Protocol):
|
||||
class DownloadHandlers:
|
||||
def __init__(self, crawler: Crawler):
|
||||
self._crawler: Crawler = crawler
|
||||
self._schemes: dict[str, Union[str, Callable[..., Any]]] = (
|
||||
self._schemes: dict[str, str | Callable[..., Any]] = (
|
||||
{}
|
||||
) # stores acceptable schemes on instancing
|
||||
self._handlers: dict[str, DownloadHandlerProtocol] = (
|
||||
{}
|
||||
) # stores instanced handlers for schemes
|
||||
self._notconfigured: dict[str, str] = {} # remembers failed handlers
|
||||
handlers: dict[str, Union[str, Callable[..., Any]]] = without_none_values(
|
||||
handlers: dict[str, str | Callable[..., Any]] = without_none_values(
|
||||
cast(
|
||||
dict[str, Union[str, Callable[..., Any]]],
|
||||
"dict[str, str | Callable[..., Any]]",
|
||||
crawler.settings.getwithbase("DOWNLOAD_HANDLERS"),
|
||||
)
|
||||
)
|
||||
@ -54,7 +54,7 @@ class DownloadHandlers:
|
||||
|
||||
crawler.signals.connect(self._close, signals.engine_stopped)
|
||||
|
||||
def _get_handler(self, scheme: str) -> Optional[DownloadHandlerProtocol]:
|
||||
def _get_handler(self, scheme: str) -> DownloadHandlerProtocol | None:
|
||||
"""Lazy-load the downloadhandler for a scheme
|
||||
only on the first request for that scheme.
|
||||
"""
|
||||
@ -70,7 +70,7 @@ class DownloadHandlers:
|
||||
|
||||
def _load_handler(
|
||||
self, scheme: str, skip_lazy: bool = False
|
||||
) -> Optional[DownloadHandlerProtocol]:
|
||||
) -> DownloadHandlerProtocol | None:
|
||||
path = self._schemes[scheme]
|
||||
try:
|
||||
dhcls: type[DownloadHandlerProtocol] = load_object(path)
|
||||
|
@ -32,7 +32,7 @@ from __future__ import annotations
|
||||
|
||||
import re
|
||||
from io import BytesIO
|
||||
from typing import TYPE_CHECKING, Any, BinaryIO, Optional
|
||||
from typing import TYPE_CHECKING, Any, BinaryIO
|
||||
from urllib.parse import unquote
|
||||
|
||||
from twisted.internet.protocol import ClientCreator, Protocol
|
||||
@ -56,8 +56,8 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
class ReceivedDataProtocol(Protocol):
|
||||
def __init__(self, filename: Optional[str] = None):
|
||||
self.__filename: Optional[str] = filename
|
||||
def __init__(self, filename: str | None = None):
|
||||
self.__filename: str | None = filename
|
||||
self.body: BinaryIO = open(filename, "wb") if filename else BytesIO()
|
||||
self.size: int = 0
|
||||
|
||||
@ -66,7 +66,7 @@ class ReceivedDataProtocol(Protocol):
|
||||
self.size += len(data)
|
||||
|
||||
@property
|
||||
def filename(self) -> Optional[str]:
|
||||
def filename(self) -> str | None:
|
||||
return self.__filename
|
||||
|
||||
def close(self) -> None:
|
||||
|
@ -8,7 +8,7 @@ import re
|
||||
from contextlib import suppress
|
||||
from io import BytesIO
|
||||
from time import time
|
||||
from typing import TYPE_CHECKING, Any, Optional, TypedDict, TypeVar, Union
|
||||
from typing import TYPE_CHECKING, Any, TypedDict, TypeVar
|
||||
from urllib.parse import urldefrag, urlunparse
|
||||
|
||||
from twisted.internet import ssl
|
||||
@ -52,10 +52,10 @@ _T = TypeVar("_T")
|
||||
class _ResultT(TypedDict):
|
||||
txresponse: TxResponse
|
||||
body: bytes
|
||||
flags: Optional[list[str]]
|
||||
certificate: Optional[ssl.Certificate]
|
||||
ip_address: Union[ipaddress.IPv4Address, ipaddress.IPv6Address, None]
|
||||
failure: NotRequired[Optional[Failure]]
|
||||
flags: list[str] | None
|
||||
certificate: ssl.Certificate | None
|
||||
ip_address: ipaddress.IPv4Address | ipaddress.IPv6Address | None
|
||||
failure: NotRequired[Failure | None]
|
||||
|
||||
|
||||
class HTTP11DownloadHandler:
|
||||
@ -143,10 +143,10 @@ class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
|
||||
reactor: ReactorBase,
|
||||
host: str,
|
||||
port: int,
|
||||
proxyConf: tuple[str, int, Optional[bytes]],
|
||||
proxyConf: tuple[str, int, bytes | None],
|
||||
contextFactory: IPolicyForHTTPS,
|
||||
timeout: float = 30,
|
||||
bindAddress: Optional[tuple[str, int]] = None,
|
||||
bindAddress: tuple[str, int] | None = None,
|
||||
):
|
||||
proxyHost, proxyPort, self._proxyAuthHeader = proxyConf
|
||||
super().__init__(reactor, proxyHost, proxyPort, timeout, bindAddress)
|
||||
@ -220,7 +220,7 @@ class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
|
||||
|
||||
|
||||
def tunnel_request_data(
|
||||
host: str, port: int, proxy_auth_header: Optional[bytes] = None
|
||||
host: str, port: int, proxy_auth_header: bytes | None = None
|
||||
) -> bytes:
|
||||
r"""
|
||||
Return binary content of a CONNECT request.
|
||||
@ -254,14 +254,14 @@ class TunnelingAgent(Agent):
|
||||
self,
|
||||
*,
|
||||
reactor: ReactorBase,
|
||||
proxyConf: tuple[str, int, Optional[bytes]],
|
||||
proxyConf: tuple[str, int, bytes | None],
|
||||
contextFactory: IPolicyForHTTPS,
|
||||
connectTimeout: Optional[float] = None,
|
||||
bindAddress: Optional[bytes] = None,
|
||||
pool: Optional[HTTPConnectionPool] = None,
|
||||
connectTimeout: float | None = None,
|
||||
bindAddress: bytes | None = None,
|
||||
pool: HTTPConnectionPool | None = None,
|
||||
):
|
||||
super().__init__(reactor, contextFactory, connectTimeout, bindAddress, pool)
|
||||
self._proxyConf: tuple[str, int, Optional[bytes]] = proxyConf
|
||||
self._proxyConf: tuple[str, int, bytes | None] = proxyConf
|
||||
self._contextFactory: IPolicyForHTTPS = contextFactory
|
||||
|
||||
def _getEndpoint(self, uri: URI) -> TunnelingTCP4ClientEndpoint:
|
||||
@ -281,8 +281,8 @@ class TunnelingAgent(Agent):
|
||||
endpoint: TCP4ClientEndpoint,
|
||||
method: bytes,
|
||||
parsedURI: bytes,
|
||||
headers: Optional[TxHeaders],
|
||||
bodyProducer: Optional[IBodyProducer],
|
||||
headers: TxHeaders | None,
|
||||
bodyProducer: IBodyProducer | None,
|
||||
requestPath: bytes,
|
||||
) -> Deferred[TxResponse]:
|
||||
# proxy host and port are required for HTTP pool `key`
|
||||
@ -305,9 +305,9 @@ class ScrapyProxyAgent(Agent):
|
||||
self,
|
||||
reactor: ReactorBase,
|
||||
proxyURI: bytes,
|
||||
connectTimeout: Optional[float] = None,
|
||||
bindAddress: Optional[bytes] = None,
|
||||
pool: Optional[HTTPConnectionPool] = None,
|
||||
connectTimeout: float | None = None,
|
||||
bindAddress: bytes | None = None,
|
||||
pool: HTTPConnectionPool | None = None,
|
||||
):
|
||||
super().__init__(
|
||||
reactor=reactor,
|
||||
@ -321,8 +321,8 @@ class ScrapyProxyAgent(Agent):
|
||||
self,
|
||||
method: bytes,
|
||||
uri: bytes,
|
||||
headers: Optional[TxHeaders] = None,
|
||||
bodyProducer: Optional[IBodyProducer] = None,
|
||||
headers: TxHeaders | None = None,
|
||||
bodyProducer: IBodyProducer | None = None,
|
||||
) -> Deferred[TxResponse]:
|
||||
"""
|
||||
Issue a new request via the configured proxy.
|
||||
@ -350,8 +350,8 @@ class ScrapyAgent:
|
||||
*,
|
||||
contextFactory: IPolicyForHTTPS,
|
||||
connectTimeout: float = 10,
|
||||
bindAddress: Optional[bytes] = None,
|
||||
pool: Optional[HTTPConnectionPool] = None,
|
||||
bindAddress: bytes | None = None,
|
||||
pool: HTTPConnectionPool | None = None,
|
||||
maxsize: int = 0,
|
||||
warnsize: int = 0,
|
||||
fail_on_dataloss: bool = True,
|
||||
@ -359,12 +359,12 @@ class ScrapyAgent:
|
||||
):
|
||||
self._contextFactory: IPolicyForHTTPS = contextFactory
|
||||
self._connectTimeout: float = connectTimeout
|
||||
self._bindAddress: Optional[bytes] = bindAddress
|
||||
self._pool: Optional[HTTPConnectionPool] = pool
|
||||
self._bindAddress: bytes | None = bindAddress
|
||||
self._pool: HTTPConnectionPool | None = pool
|
||||
self._maxsize: int = maxsize
|
||||
self._warnsize: int = warnsize
|
||||
self._fail_on_dataloss: bool = fail_on_dataloss
|
||||
self._txresponse: Optional[TxResponse] = None
|
||||
self._txresponse: TxResponse | None = None
|
||||
self._crawler: Crawler = crawler
|
||||
|
||||
def _get_agent(self, request: Request, timeout: float) -> Agent:
|
||||
@ -462,7 +462,7 @@ class ScrapyAgent:
|
||||
|
||||
def _cb_bodyready(
|
||||
self, txresponse: TxResponse, request: Request
|
||||
) -> Union[_ResultT, Deferred[_ResultT]]:
|
||||
) -> _ResultT | Deferred[_ResultT]:
|
||||
headers_received_result = self._crawler.signals.send_catch_log(
|
||||
signal=signals.headers_received,
|
||||
headers=self._headers_from_twisted_response(txresponse),
|
||||
@ -551,7 +551,7 @@ class ScrapyAgent:
|
||||
|
||||
def _cb_bodydone(
|
||||
self, result: _ResultT, request: Request, url: str
|
||||
) -> Union[Response, Failure]:
|
||||
) -> Response | Failure:
|
||||
headers = self._headers_from_twisted_response(result["txresponse"])
|
||||
respcls = responsetypes.from_args(headers=headers, url=url, body=result["body"])
|
||||
try:
|
||||
@ -614,14 +614,12 @@ class _ResponseReader(Protocol):
|
||||
self._fail_on_dataloss_warned: bool = False
|
||||
self._reached_warnsize: bool = False
|
||||
self._bytes_received: int = 0
|
||||
self._certificate: Optional[ssl.Certificate] = None
|
||||
self._ip_address: Union[ipaddress.IPv4Address, ipaddress.IPv6Address, None] = (
|
||||
None
|
||||
)
|
||||
self._certificate: ssl.Certificate | None = None
|
||||
self._ip_address: ipaddress.IPv4Address | ipaddress.IPv6Address | None = None
|
||||
self._crawler: Crawler = crawler
|
||||
|
||||
def _finish_response(
|
||||
self, flags: Optional[list[str]] = None, failure: Optional[Failure] = None
|
||||
self, flags: list[str] | None = None, failure: Failure | None = None
|
||||
) -> None:
|
||||
self._finished.callback(
|
||||
{
|
||||
|
@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from time import time
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
from typing import TYPE_CHECKING
|
||||
from urllib.parse import urldefrag
|
||||
|
||||
from twisted.internet.error import TimeoutError
|
||||
@ -60,8 +60,8 @@ class ScrapyH2Agent:
|
||||
context_factory: IPolicyForHTTPS,
|
||||
pool: H2ConnectionPool,
|
||||
connect_timeout: int = 10,
|
||||
bind_address: Optional[bytes] = None,
|
||||
crawler: Optional[Crawler] = None,
|
||||
bind_address: bytes | None = None,
|
||||
crawler: Crawler | None = None,
|
||||
) -> None:
|
||||
self._context_factory = context_factory
|
||||
self._connect_timeout = connect_timeout
|
||||
@ -69,7 +69,7 @@ class ScrapyH2Agent:
|
||||
self._pool = pool
|
||||
self._crawler = crawler
|
||||
|
||||
def _get_agent(self, request: Request, timeout: Optional[float]) -> H2Agent:
|
||||
def _get_agent(self, request: Request, timeout: float | None) -> H2Agent:
|
||||
from twisted.internet import reactor
|
||||
|
||||
bind_address = request.meta.get("bindaddress") or self._bind_address
|
||||
|
@ -1,6 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
|
||||
from scrapy.exceptions import NotConfigured
|
||||
@ -26,9 +26,9 @@ class S3DownloadHandler:
|
||||
settings: BaseSettings,
|
||||
*,
|
||||
crawler: Crawler,
|
||||
aws_access_key_id: Optional[str] = None,
|
||||
aws_secret_access_key: Optional[str] = None,
|
||||
aws_session_token: Optional[str] = None,
|
||||
aws_access_key_id: str | None = None,
|
||||
aws_secret_access_key: str | None = None,
|
||||
aws_session_token: str | None = None,
|
||||
httpdownloadhandler: type[HTTPDownloadHandler] = HTTPDownloadHandler,
|
||||
**kw: Any,
|
||||
):
|
||||
|
@ -7,7 +7,7 @@ See documentation in docs/topics/downloader-middleware.rst
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Callable
|
||||
from typing import TYPE_CHECKING, Any, Union, cast
|
||||
from typing import TYPE_CHECKING, Any, cast
|
||||
|
||||
from twisted.internet.defer import Deferred, inlineCallbacks
|
||||
|
||||
@ -46,11 +46,11 @@ class DownloaderMiddlewareManager(MiddlewareManager):
|
||||
download_func: Callable[[Request, Spider], Deferred[Response]],
|
||||
request: Request,
|
||||
spider: Spider,
|
||||
) -> Deferred[Union[Response, Request]]:
|
||||
) -> Deferred[Response | Request]:
|
||||
@inlineCallbacks
|
||||
def process_request(
|
||||
request: Request,
|
||||
) -> Generator[Deferred[Any], Any, Union[Response, Request]]:
|
||||
) -> Generator[Deferred[Any], Any, Response | Request]:
|
||||
for method in self.methods["process_request"]:
|
||||
method = cast(Callable, method)
|
||||
response = yield deferred_from_coro(
|
||||
@ -69,8 +69,8 @@ class DownloaderMiddlewareManager(MiddlewareManager):
|
||||
|
||||
@inlineCallbacks
|
||||
def process_response(
|
||||
response: Union[Response, Request]
|
||||
) -> Generator[Deferred[Any], Any, Union[Response, Request]]:
|
||||
response: Response | Request,
|
||||
) -> Generator[Deferred[Any], Any, Response | Request]:
|
||||
if response is None:
|
||||
raise TypeError("Received None in process_response")
|
||||
elif isinstance(response, Request):
|
||||
@ -93,7 +93,7 @@ class DownloaderMiddlewareManager(MiddlewareManager):
|
||||
@inlineCallbacks
|
||||
def process_exception(
|
||||
failure: Failure,
|
||||
) -> Generator[Deferred[Any], Any, Union[Failure, Response, Request]]:
|
||||
) -> Generator[Deferred[Any], Any, Failure | Response | Request]:
|
||||
exception = failure.value
|
||||
for method in self.methods["process_exception"]:
|
||||
method = cast(Callable, method)
|
||||
@ -111,7 +111,7 @@ class DownloaderMiddlewareManager(MiddlewareManager):
|
||||
return response
|
||||
return failure
|
||||
|
||||
deferred: Deferred[Union[Response, Request]] = mustbe_deferred(
|
||||
deferred: Deferred[Response | Request] = mustbe_deferred(
|
||||
process_request, request
|
||||
)
|
||||
deferred.addErrback(process_exception)
|
||||
|
@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import re
|
||||
from time import time
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
from typing import TYPE_CHECKING
|
||||
from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse
|
||||
|
||||
from twisted.internet import defer
|
||||
@ -144,9 +144,9 @@ class ScrapyHTTPClientFactory(ClientFactory):
|
||||
# converting to bytes to comply to Twisted interface
|
||||
self.url: bytes = to_bytes(self._url, encoding="ascii")
|
||||
self.method: bytes = to_bytes(request.method, encoding="ascii")
|
||||
self.body: Optional[bytes] = request.body or None
|
||||
self.body: bytes | None = request.body or None
|
||||
self.headers: Headers = Headers(request.headers)
|
||||
self.response_headers: Optional[Headers] = None
|
||||
self.response_headers: Headers | None = None
|
||||
self.timeout: float = request.meta.get("download_timeout") or timeout
|
||||
self.start_time: float = time()
|
||||
self.deferred: defer.Deferred[Response] = defer.Deferred().addCallback(
|
||||
|
@ -9,7 +9,7 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from time import time
|
||||
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
|
||||
from typing import TYPE_CHECKING, Any, TypeVar, cast
|
||||
|
||||
from itemadapter import is_item
|
||||
from twisted.internet.defer import Deferred, inlineCallbacks, succeed
|
||||
@ -18,7 +18,7 @@ from twisted.python.failure import Failure
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.core.downloader import Downloader
|
||||
from scrapy.core.scraper import Scraper
|
||||
from scrapy.core.scraper import Scraper, _HandleOutputDeferred
|
||||
from scrapy.exceptions import CloseSpider, DontCloseSpider, IgnoreRequest
|
||||
from scrapy.http import Request, Response
|
||||
from scrapy.logformatter import LogFormatter
|
||||
@ -32,7 +32,6 @@ if TYPE_CHECKING:
|
||||
from collections.abc import Callable, Generator, Iterable, Iterator
|
||||
|
||||
from scrapy.core.scheduler import BaseScheduler
|
||||
from scrapy.core.scraper import _HandleOutputDeferred
|
||||
from scrapy.crawler import Crawler
|
||||
from scrapy.settings import BaseSettings
|
||||
from scrapy.spiders import Spider
|
||||
@ -51,9 +50,9 @@ class Slot:
|
||||
nextcall: CallLaterOnce[None],
|
||||
scheduler: BaseScheduler,
|
||||
) -> None:
|
||||
self.closing: Optional[Deferred[None]] = None
|
||||
self.closing: Deferred[None] | None = None
|
||||
self.inprogress: set[Request] = set()
|
||||
self.start_requests: Optional[Iterator[Request]] = iter(start_requests)
|
||||
self.start_requests: Iterator[Request] | None = iter(start_requests)
|
||||
self.close_if_idle: bool = close_if_idle
|
||||
self.nextcall: CallLaterOnce[None] = nextcall
|
||||
self.scheduler: BaseScheduler = scheduler
|
||||
@ -84,15 +83,15 @@ class ExecutionEngine:
|
||||
def __init__(
|
||||
self,
|
||||
crawler: Crawler,
|
||||
spider_closed_callback: Callable[[Spider], Optional[Deferred[None]]],
|
||||
spider_closed_callback: Callable[[Spider], Deferred[None] | None],
|
||||
) -> None:
|
||||
self.crawler: Crawler = crawler
|
||||
self.settings: Settings = crawler.settings
|
||||
self.signals: SignalManager = crawler.signals
|
||||
assert crawler.logformatter
|
||||
self.logformatter: LogFormatter = crawler.logformatter
|
||||
self.slot: Optional[Slot] = None
|
||||
self.spider: Optional[Spider] = None
|
||||
self.slot: Slot | None = None
|
||||
self.spider: Spider | None = None
|
||||
self.running: bool = False
|
||||
self.paused: bool = False
|
||||
self.scheduler_cls: type[BaseScheduler] = self._get_scheduler_class(
|
||||
@ -101,10 +100,10 @@ class ExecutionEngine:
|
||||
downloader_cls: type[Downloader] = load_object(self.settings["DOWNLOADER"])
|
||||
self.downloader: Downloader = downloader_cls(crawler)
|
||||
self.scraper: Scraper = Scraper(crawler)
|
||||
self._spider_closed_callback: Callable[[Spider], Optional[Deferred[None]]] = (
|
||||
self._spider_closed_callback: Callable[[Spider], Deferred[None] | None] = (
|
||||
spider_closed_callback
|
||||
)
|
||||
self.start_time: Optional[float] = None
|
||||
self.start_time: float | None = None
|
||||
|
||||
def _get_scheduler_class(self, settings: BaseSettings) -> type[BaseScheduler]:
|
||||
from scrapy.core.scheduler import BaseScheduler
|
||||
@ -218,7 +217,7 @@ class ExecutionEngine:
|
||||
or self.scraper.slot.needs_backout()
|
||||
)
|
||||
|
||||
def _next_request_from_scheduler(self) -> Optional[Deferred[None]]:
|
||||
def _next_request_from_scheduler(self) -> Deferred[None] | None:
|
||||
assert self.slot is not None # typing
|
||||
assert self.spider is not None # typing
|
||||
|
||||
@ -226,7 +225,7 @@ class ExecutionEngine:
|
||||
if request is None:
|
||||
return None
|
||||
|
||||
d: Deferred[Union[Response, Request]] = self._download(request)
|
||||
d: Deferred[Response | Request] = self._download(request)
|
||||
d.addBoth(self._handle_downloader_output, request)
|
||||
d.addErrback(
|
||||
lambda f: logger.info(
|
||||
@ -260,8 +259,8 @@ class ExecutionEngine:
|
||||
return d2
|
||||
|
||||
def _handle_downloader_output(
|
||||
self, result: Union[Request, Response, Failure], request: Request
|
||||
) -> Optional[_HandleOutputDeferred]:
|
||||
self, result: Request | Response | Failure, request: Request
|
||||
) -> _HandleOutputDeferred | None:
|
||||
assert self.spider is not None # typing
|
||||
|
||||
if not isinstance(result, (Request, Response, Failure)):
|
||||
@ -323,24 +322,24 @@ class ExecutionEngine:
|
||||
"""Return a Deferred which fires with a Response as result, only downloader middlewares are applied"""
|
||||
if self.spider is None:
|
||||
raise RuntimeError(f"No open spider to crawl: {request}")
|
||||
d: Deferred[Union[Response, Request]] = self._download(request)
|
||||
d: Deferred[Response | Request] = self._download(request)
|
||||
# Deferred.addBoth() overloads don't seem to support a Union[_T, Deferred[_T]] return type
|
||||
d2: Deferred[Response] = d.addBoth(self._downloaded, request) # type: ignore[call-overload]
|
||||
return d2
|
||||
|
||||
def _downloaded(
|
||||
self, result: Union[Response, Request, Failure], request: Request
|
||||
) -> Union[Deferred[Response], Response, Failure]:
|
||||
self, result: Response | Request | Failure, request: Request
|
||||
) -> Deferred[Response] | Response | Failure:
|
||||
assert self.slot is not None # typing
|
||||
self.slot.remove_request(request)
|
||||
return self.download(result) if isinstance(result, Request) else result
|
||||
|
||||
def _download(self, request: Request) -> Deferred[Union[Response, Request]]:
|
||||
def _download(self, request: Request) -> Deferred[Response | Request]:
|
||||
assert self.slot is not None # typing
|
||||
|
||||
self.slot.add_request(request)
|
||||
|
||||
def _on_success(result: Union[Response, Request]) -> Union[Response, Request]:
|
||||
def _on_success(result: Response | Request) -> Response | Request:
|
||||
if not isinstance(result, (Response, Request)):
|
||||
raise TypeError(
|
||||
f"Incorrect type: expected Response or Request, got {type(result)}: {result!r}"
|
||||
@ -368,9 +367,7 @@ class ExecutionEngine:
|
||||
return _
|
||||
|
||||
assert self.spider is not None
|
||||
dwld: Deferred[Union[Response, Request]] = self.downloader.fetch(
|
||||
request, self.spider
|
||||
)
|
||||
dwld: Deferred[Response | Request] = self.downloader.fetch(request, self.spider)
|
||||
dwld.addCallback(_on_success)
|
||||
dwld.addBoth(_on_complete)
|
||||
return dwld
|
||||
|
@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import deque
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from twisted.internet import defer
|
||||
from twisted.internet.defer import Deferred
|
||||
@ -121,8 +121,8 @@ class H2Agent:
|
||||
reactor: ReactorBase,
|
||||
pool: H2ConnectionPool,
|
||||
context_factory: BrowserLikePolicyForHTTPS = BrowserLikePolicyForHTTPS(),
|
||||
connect_timeout: Optional[float] = None,
|
||||
bind_address: Optional[bytes] = None,
|
||||
connect_timeout: float | None = None,
|
||||
bind_address: bytes | None = None,
|
||||
) -> None:
|
||||
self._reactor = reactor
|
||||
self._pool = pool
|
||||
@ -165,8 +165,8 @@ class ScrapyProxyH2Agent(H2Agent):
|
||||
proxy_uri: URI,
|
||||
pool: H2ConnectionPool,
|
||||
context_factory: BrowserLikePolicyForHTTPS = BrowserLikePolicyForHTTPS(),
|
||||
connect_timeout: Optional[float] = None,
|
||||
bind_address: Optional[bytes] = None,
|
||||
connect_timeout: float | None = None,
|
||||
bind_address: bytes | None = None,
|
||||
) -> None:
|
||||
super().__init__(
|
||||
reactor=reactor,
|
||||
|
@ -4,7 +4,7 @@ import ipaddress
|
||||
import itertools
|
||||
import logging
|
||||
from collections import deque
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from h2.config import H2Configuration
|
||||
from h2.connection import H2Connection
|
||||
@ -63,7 +63,7 @@ class InvalidNegotiatedProtocol(H2Error):
|
||||
class RemoteTerminatedConnection(H2Error):
|
||||
def __init__(
|
||||
self,
|
||||
remote_ip_address: Optional[Union[IPv4Address, IPv6Address]],
|
||||
remote_ip_address: IPv4Address | IPv6Address | None,
|
||||
event: ConnectionTerminated,
|
||||
) -> None:
|
||||
self.remote_ip_address = remote_ip_address
|
||||
@ -74,9 +74,7 @@ class RemoteTerminatedConnection(H2Error):
|
||||
|
||||
|
||||
class MethodNotAllowed405(H2Error):
|
||||
def __init__(
|
||||
self, remote_ip_address: Optional[Union[IPv4Address, IPv6Address]]
|
||||
) -> None:
|
||||
def __init__(self, remote_ip_address: IPv4Address | IPv6Address | None) -> None:
|
||||
self.remote_ip_address = remote_ip_address
|
||||
|
||||
def __str__(self) -> str:
|
||||
|
@ -3,7 +3,7 @@ from __future__ import annotations
|
||||
import logging
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from h2.errors import ErrorCodes
|
||||
from h2.exceptions import H2Error, ProtocolError, StreamClosedError
|
||||
@ -382,7 +382,7 @@ class Stream:
|
||||
def close(
|
||||
self,
|
||||
reason: StreamCloseReason,
|
||||
errors: Optional[list[BaseException]] = None,
|
||||
errors: list[BaseException] | None = None,
|
||||
from_protocol: bool = False,
|
||||
) -> None:
|
||||
"""Based on the reason sent we will handle each case."""
|
||||
|
@ -4,7 +4,7 @@ import json
|
||||
import logging
|
||||
from abc import abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Optional, cast
|
||||
from typing import TYPE_CHECKING, Any, cast
|
||||
|
||||
# working around https://github.com/sphinx-doc/sphinx/issues/10400
|
||||
from twisted.internet.defer import Deferred # noqa: TC002
|
||||
@ -73,7 +73,7 @@ class BaseScheduler(metaclass=BaseSchedulerMeta):
|
||||
"""
|
||||
return cls()
|
||||
|
||||
def open(self, spider: Spider) -> Optional[Deferred[None]]:
|
||||
def open(self, spider: Spider) -> Deferred[None] | None:
|
||||
"""
|
||||
Called when the spider is opened by the engine. It receives the spider
|
||||
instance as argument and it's useful to execute initialization code.
|
||||
@ -83,7 +83,7 @@ class BaseScheduler(metaclass=BaseSchedulerMeta):
|
||||
"""
|
||||
pass
|
||||
|
||||
def close(self, reason: str) -> Optional[Deferred[None]]:
|
||||
def close(self, reason: str) -> Deferred[None] | None:
|
||||
"""
|
||||
Called when the spider is closed by the engine. It receives the reason why the crawl
|
||||
finished as argument and it's useful to execute cleaning code.
|
||||
@ -115,7 +115,7 @@ class BaseScheduler(metaclass=BaseSchedulerMeta):
|
||||
raise NotImplementedError()
|
||||
|
||||
@abstractmethod
|
||||
def next_request(self) -> Optional[Request]:
|
||||
def next_request(self) -> Request | None:
|
||||
"""
|
||||
Return the next :class:`~scrapy.http.Request` to be processed, or ``None``
|
||||
to indicate that there are no requests to be considered ready at the moment.
|
||||
@ -181,22 +181,22 @@ class Scheduler(BaseScheduler):
|
||||
def __init__(
|
||||
self,
|
||||
dupefilter: BaseDupeFilter,
|
||||
jobdir: Optional[str] = None,
|
||||
dqclass: Optional[type[BaseQueue]] = None,
|
||||
mqclass: Optional[type[BaseQueue]] = None,
|
||||
jobdir: str | None = None,
|
||||
dqclass: type[BaseQueue] | None = None,
|
||||
mqclass: type[BaseQueue] | None = None,
|
||||
logunser: bool = False,
|
||||
stats: Optional[StatsCollector] = None,
|
||||
pqclass: Optional[type[ScrapyPriorityQueue]] = None,
|
||||
crawler: Optional[Crawler] = None,
|
||||
stats: StatsCollector | None = None,
|
||||
pqclass: type[ScrapyPriorityQueue] | None = None,
|
||||
crawler: Crawler | None = None,
|
||||
):
|
||||
self.df: BaseDupeFilter = dupefilter
|
||||
self.dqdir: Optional[str] = self._dqdir(jobdir)
|
||||
self.pqclass: Optional[type[ScrapyPriorityQueue]] = pqclass
|
||||
self.dqclass: Optional[type[BaseQueue]] = dqclass
|
||||
self.mqclass: Optional[type[BaseQueue]] = mqclass
|
||||
self.dqdir: str | None = self._dqdir(jobdir)
|
||||
self.pqclass: type[ScrapyPriorityQueue] | None = pqclass
|
||||
self.dqclass: type[BaseQueue] | None = dqclass
|
||||
self.mqclass: type[BaseQueue] | None = mqclass
|
||||
self.logunser: bool = logunser
|
||||
self.stats: Optional[StatsCollector] = stats
|
||||
self.crawler: Optional[Crawler] = crawler
|
||||
self.stats: StatsCollector | None = stats
|
||||
self.crawler: Crawler | None = crawler
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler: Crawler) -> Self:
|
||||
@ -218,7 +218,7 @@ class Scheduler(BaseScheduler):
|
||||
def has_pending_requests(self) -> bool:
|
||||
return len(self) > 0
|
||||
|
||||
def open(self, spider: Spider) -> Optional[Deferred[None]]:
|
||||
def open(self, spider: Spider) -> Deferred[None] | None:
|
||||
"""
|
||||
(1) initialize the memory queue
|
||||
(2) initialize the disk queue if the ``jobdir`` attribute is a valid directory
|
||||
@ -226,10 +226,10 @@ class Scheduler(BaseScheduler):
|
||||
"""
|
||||
self.spider: Spider = spider
|
||||
self.mqs: ScrapyPriorityQueue = self._mq()
|
||||
self.dqs: Optional[ScrapyPriorityQueue] = self._dq() if self.dqdir else None
|
||||
self.dqs: ScrapyPriorityQueue | None = self._dq() if self.dqdir else None
|
||||
return self.df.open()
|
||||
|
||||
def close(self, reason: str) -> Optional[Deferred[None]]:
|
||||
def close(self, reason: str) -> Deferred[None] | None:
|
||||
"""
|
||||
(1) dump pending requests to disk if there is a disk queue
|
||||
(2) return the result of the dupefilter's ``close`` method
|
||||
@ -263,7 +263,7 @@ class Scheduler(BaseScheduler):
|
||||
self.stats.inc_value("scheduler/enqueued", spider=self.spider)
|
||||
return True
|
||||
|
||||
def next_request(self) -> Optional[Request]:
|
||||
def next_request(self) -> Request | None:
|
||||
"""
|
||||
Return a :class:`~scrapy.http.Request` object from the memory queue,
|
||||
falling back to the disk queue if the memory queue is empty.
|
||||
@ -272,7 +272,7 @@ class Scheduler(BaseScheduler):
|
||||
Increment the appropriate stats, such as: ``scheduler/dequeued``,
|
||||
``scheduler/dequeued/disk``, ``scheduler/dequeued/memory``.
|
||||
"""
|
||||
request: Optional[Request] = self.mqs.pop()
|
||||
request: Request | None = self.mqs.pop()
|
||||
assert self.stats is not None
|
||||
if request is not None:
|
||||
self.stats.inc_value("scheduler/dequeued/memory", spider=self.spider)
|
||||
@ -318,7 +318,7 @@ class Scheduler(BaseScheduler):
|
||||
def _mqpush(self, request: Request) -> None:
|
||||
self.mqs.push(request)
|
||||
|
||||
def _dqpop(self) -> Optional[Request]:
|
||||
def _dqpop(self) -> Request | None:
|
||||
if self.dqs is not None:
|
||||
return self.dqs.pop()
|
||||
return None
|
||||
@ -355,7 +355,7 @@ class Scheduler(BaseScheduler):
|
||||
)
|
||||
return q
|
||||
|
||||
def _dqdir(self, jobdir: Optional[str]) -> Optional[str]:
|
||||
def _dqdir(self, jobdir: str | None) -> str | None:
|
||||
"""Return a folder name to keep disk queue state at"""
|
||||
if jobdir:
|
||||
dqdir = Path(jobdir, "requests.queue")
|
||||
|
@ -6,7 +6,7 @@ from __future__ import annotations
|
||||
import logging
|
||||
from collections import deque
|
||||
from collections.abc import AsyncIterable, Iterator
|
||||
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
|
||||
from typing import TYPE_CHECKING, Any, TypeVar, Union, cast
|
||||
|
||||
from itemadapter import is_item
|
||||
from twisted.internet.defer import Deferred, inlineCallbacks
|
||||
@ -42,11 +42,8 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
_T = TypeVar("_T")
|
||||
_ParallelResult = list[tuple[bool, Iterator[Any]]]
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# parameterized Deferreds require Twisted 21.7.0
|
||||
_HandleOutputDeferred = Deferred[Union[_ParallelResult, None]]
|
||||
QueueTuple = tuple[Union[Response, Failure], Request, _HandleOutputDeferred]
|
||||
_HandleOutputDeferred = Deferred[Union[_ParallelResult, None]]
|
||||
QueueTuple = tuple[Union[Response, Failure], Request, _HandleOutputDeferred]
|
||||
|
||||
|
||||
class Slot:
|
||||
@ -60,10 +57,10 @@ class Slot:
|
||||
self.active: set[Request] = set()
|
||||
self.active_size: int = 0
|
||||
self.itemproc_size: int = 0
|
||||
self.closing: Optional[Deferred[Spider]] = None
|
||||
self.closing: Deferred[Spider] | None = None
|
||||
|
||||
def add_response_request(
|
||||
self, result: Union[Response, Failure], request: Request
|
||||
self, result: Response | Failure, request: Request
|
||||
) -> _HandleOutputDeferred:
|
||||
deferred: _HandleOutputDeferred = Deferred()
|
||||
self.queue.append((result, request, deferred))
|
||||
@ -78,9 +75,7 @@ class Slot:
|
||||
self.active.add(request)
|
||||
return response, request, deferred
|
||||
|
||||
def finish_response(
|
||||
self, result: Union[Response, Failure], request: Request
|
||||
) -> None:
|
||||
def finish_response(self, result: Response | Failure, request: Request) -> None:
|
||||
self.active.remove(request)
|
||||
if isinstance(result, Response):
|
||||
self.active_size -= max(len(result.body), self.MIN_RESPONSE_SIZE)
|
||||
@ -96,7 +91,7 @@ class Slot:
|
||||
|
||||
class Scraper:
|
||||
def __init__(self, crawler: Crawler) -> None:
|
||||
self.slot: Optional[Slot] = None
|
||||
self.slot: Slot | None = None
|
||||
self.spidermw: SpiderMiddlewareManager = SpiderMiddlewareManager.from_crawler(
|
||||
crawler
|
||||
)
|
||||
@ -135,7 +130,7 @@ class Scraper:
|
||||
self.slot.closing.callback(spider)
|
||||
|
||||
def enqueue_scrape(
|
||||
self, result: Union[Response, Failure], request: Request, spider: Spider
|
||||
self, result: Response | Failure, request: Request, spider: Spider
|
||||
) -> _HandleOutputDeferred:
|
||||
if self.slot is None:
|
||||
raise RuntimeError("Scraper slot not assigned")
|
||||
@ -167,7 +162,7 @@ class Scraper:
|
||||
self._scrape(response, request, spider).chainDeferred(deferred)
|
||||
|
||||
def _scrape(
|
||||
self, result: Union[Response, Failure], request: Request, spider: Spider
|
||||
self, result: Response | Failure, request: Request, spider: Spider
|
||||
) -> _HandleOutputDeferred:
|
||||
"""
|
||||
Handle the downloaded response or failure through the spider callback/errback
|
||||
@ -176,7 +171,7 @@ class Scraper:
|
||||
raise TypeError(
|
||||
f"Incorrect type: expected Response or Failure, got {type(result)}: {result!r}"
|
||||
)
|
||||
dfd: Deferred[Union[Iterable[Any], AsyncIterable[Any]]] = self._scrape2(
|
||||
dfd: Deferred[Iterable[Any] | AsyncIterable[Any]] = self._scrape2(
|
||||
result, request, spider
|
||||
) # returns spider's processed output
|
||||
dfd.addErrback(self.handle_spider_error, request, result, spider)
|
||||
@ -186,8 +181,8 @@ class Scraper:
|
||||
return dfd2
|
||||
|
||||
def _scrape2(
|
||||
self, result: Union[Response, Failure], request: Request, spider: Spider
|
||||
) -> Deferred[Union[Iterable[Any], AsyncIterable[Any]]]:
|
||||
self, result: Response | Failure, request: Request, spider: Spider
|
||||
) -> Deferred[Iterable[Any] | AsyncIterable[Any]]:
|
||||
"""
|
||||
Handle the different cases of request's result been a Response or a Failure
|
||||
"""
|
||||
@ -202,8 +197,8 @@ class Scraper:
|
||||
return dfd
|
||||
|
||||
def call_spider(
|
||||
self, result: Union[Response, Failure], request: Request, spider: Spider
|
||||
) -> Deferred[Union[Iterable[Any], AsyncIterable[Any]]]:
|
||||
self, result: Response | Failure, request: Request, spider: Spider
|
||||
) -> Deferred[Iterable[Any] | AsyncIterable[Any]]:
|
||||
dfd: Deferred[Any]
|
||||
if isinstance(result, Response):
|
||||
if getattr(result, "request", None) is None:
|
||||
@ -222,7 +217,7 @@ class Scraper:
|
||||
if request.errback:
|
||||
warn_on_generator_with_return_value(spider, request.errback)
|
||||
dfd.addErrback(request.errback)
|
||||
dfd2: Deferred[Union[Iterable[Any], AsyncIterable[Any]]] = dfd.addCallback(
|
||||
dfd2: Deferred[Iterable[Any] | AsyncIterable[Any]] = dfd.addCallback(
|
||||
iterate_spider_output
|
||||
)
|
||||
return dfd2
|
||||
@ -231,7 +226,7 @@ class Scraper:
|
||||
self,
|
||||
_failure: Failure,
|
||||
request: Request,
|
||||
response: Union[Response, Failure],
|
||||
response: Response | Failure,
|
||||
spider: Spider,
|
||||
) -> None:
|
||||
exc = _failure.value
|
||||
@ -258,14 +253,14 @@ class Scraper:
|
||||
|
||||
def handle_spider_output(
|
||||
self,
|
||||
result: Union[Iterable[_T], AsyncIterable[_T]],
|
||||
result: Iterable[_T] | AsyncIterable[_T],
|
||||
request: Request,
|
||||
response: Response,
|
||||
spider: Spider,
|
||||
) -> _HandleOutputDeferred:
|
||||
if not result:
|
||||
return defer_succeed(None)
|
||||
it: Union[Iterable[_T], AsyncIterable[_T]]
|
||||
it: Iterable[_T] | AsyncIterable[_T]
|
||||
dfd: Deferred[_ParallelResult]
|
||||
if isinstance(result, AsyncIterable):
|
||||
it = aiter_errback(
|
||||
@ -296,7 +291,7 @@ class Scraper:
|
||||
|
||||
def _process_spidermw_output(
|
||||
self, output: Any, request: Request, response: Response, spider: Spider
|
||||
) -> Optional[Deferred[Any]]:
|
||||
) -> Deferred[Any] | None:
|
||||
"""Process each Request/Item (given in the output parameter) returned
|
||||
from the given spider
|
||||
"""
|
||||
@ -316,9 +311,7 @@ class Scraper:
|
||||
)
|
||||
return None
|
||||
|
||||
def start_itemproc(
|
||||
self, item: Any, *, response: Optional[Response]
|
||||
) -> Deferred[Any]:
|
||||
def start_itemproc(self, item: Any, *, response: Response | None) -> Deferred[Any]:
|
||||
"""Send *item* to the item pipelines for processing.
|
||||
|
||||
*response* is the source of the item data. If the item does not come
|
||||
@ -337,7 +330,7 @@ class Scraper:
|
||||
download_failure: Failure,
|
||||
request: Request,
|
||||
spider: Spider,
|
||||
) -> Union[Failure, None]:
|
||||
) -> Failure | None:
|
||||
"""Log and silence errors that come from the engine (typically download
|
||||
errors that got propagated thru here).
|
||||
|
||||
@ -371,7 +364,7 @@ class Scraper:
|
||||
return None
|
||||
|
||||
def _itemproc_finished(
|
||||
self, output: Any, item: Any, response: Optional[Response], spider: Spider
|
||||
self, output: Any, item: Any, response: Response | None, spider: Spider
|
||||
) -> Deferred[Any]:
|
||||
"""ItemProcessor finished for the given ``item`` and returned ``output``"""
|
||||
assert self.slot is not None # typing
|
||||
|
@ -10,7 +10,7 @@ import logging
|
||||
from collections.abc import AsyncIterable, Callable, Iterable
|
||||
from inspect import isasyncgenfunction, iscoroutine
|
||||
from itertools import islice
|
||||
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
|
||||
from typing import TYPE_CHECKING, Any, TypeVar, Union, cast
|
||||
|
||||
from twisted.internet.defer import Deferred, inlineCallbacks
|
||||
from twisted.python.failure import Failure
|
||||
@ -76,7 +76,7 @@ class SpiderMiddlewareManager(MiddlewareManager):
|
||||
response: Response,
|
||||
request: Request,
|
||||
spider: Spider,
|
||||
) -> Union[Iterable[_T], AsyncIterable[_T]]:
|
||||
) -> Iterable[_T] | AsyncIterable[_T]:
|
||||
for method in self.methods["process_spider_input"]:
|
||||
method = cast(Callable, method)
|
||||
try:
|
||||
@ -97,10 +97,10 @@ class SpiderMiddlewareManager(MiddlewareManager):
|
||||
self,
|
||||
response: Response,
|
||||
spider: Spider,
|
||||
iterable: Union[Iterable[_T], AsyncIterable[_T]],
|
||||
iterable: Iterable[_T] | AsyncIterable[_T],
|
||||
exception_processor_index: int,
|
||||
recover_to: Union[MutableChain[_T], MutableAsyncChain[_T]],
|
||||
) -> Union[Iterable[_T], AsyncIterable[_T]]:
|
||||
recover_to: MutableChain[_T] | MutableAsyncChain[_T],
|
||||
) -> Iterable[_T] | AsyncIterable[_T]:
|
||||
def process_sync(iterable: Iterable[_T]) -> Iterable[_T]:
|
||||
try:
|
||||
yield from iterable
|
||||
@ -142,7 +142,7 @@ class SpiderMiddlewareManager(MiddlewareManager):
|
||||
spider: Spider,
|
||||
_failure: Failure,
|
||||
start_index: int = 0,
|
||||
) -> Union[Failure, MutableChain[_T], MutableAsyncChain[_T]]:
|
||||
) -> Failure | MutableChain[_T] | MutableAsyncChain[_T]:
|
||||
exception = _failure.value
|
||||
# don't handle _InvalidOutput exception
|
||||
if isinstance(exception, _InvalidOutput):
|
||||
@ -158,7 +158,7 @@ class SpiderMiddlewareManager(MiddlewareManager):
|
||||
if _isiterable(result):
|
||||
# stop exception handling by handing control over to the
|
||||
# process_spider_output chain if an iterable has been returned
|
||||
dfd: Deferred[Union[MutableChain[_T], MutableAsyncChain[_T]]] = (
|
||||
dfd: Deferred[MutableChain[_T] | MutableAsyncChain[_T]] = (
|
||||
self._process_spider_output(
|
||||
response, spider, result, method_index + 1
|
||||
)
|
||||
@ -192,12 +192,12 @@ class SpiderMiddlewareManager(MiddlewareManager):
|
||||
self,
|
||||
response: Response,
|
||||
spider: Spider,
|
||||
result: Union[Iterable[_T], AsyncIterable[_T]],
|
||||
result: Iterable[_T] | AsyncIterable[_T],
|
||||
start_index: int = 0,
|
||||
) -> Generator[Deferred[Any], Any, Union[MutableChain[_T], MutableAsyncChain[_T]]]:
|
||||
) -> Generator[Deferred[Any], Any, MutableChain[_T] | MutableAsyncChain[_T]]:
|
||||
# items in this iterable do not need to go through the process_spider_output
|
||||
# chain, they went through it already from the process_spider_exception method
|
||||
recovered: Union[MutableChain[_T], MutableAsyncChain[_T]]
|
||||
recovered: MutableChain[_T] | MutableAsyncChain[_T]
|
||||
last_result_is_async = isinstance(result, AsyncIterable)
|
||||
if last_result_is_async:
|
||||
recovered = MutableAsyncChain()
|
||||
@ -248,10 +248,10 @@ class SpiderMiddlewareManager(MiddlewareManager):
|
||||
# might fail directly if the output value is not a generator
|
||||
result = method(response=response, result=result, spider=spider)
|
||||
except Exception as ex:
|
||||
exception_result: Union[
|
||||
Failure, MutableChain[_T], MutableAsyncChain[_T]
|
||||
] = self._process_spider_exception(
|
||||
response, spider, Failure(ex), method_index + 1
|
||||
exception_result: Failure | MutableChain[_T] | MutableAsyncChain[_T] = (
|
||||
self._process_spider_exception(
|
||||
response, spider, Failure(ex), method_index + 1
|
||||
)
|
||||
)
|
||||
if isinstance(exception_result, Failure):
|
||||
raise
|
||||
@ -283,9 +283,9 @@ class SpiderMiddlewareManager(MiddlewareManager):
|
||||
self,
|
||||
response: Response,
|
||||
spider: Spider,
|
||||
result: Union[Iterable[_T], AsyncIterable[_T]],
|
||||
) -> Union[MutableChain[_T], MutableAsyncChain[_T]]:
|
||||
recovered: Union[MutableChain[_T], MutableAsyncChain[_T]]
|
||||
result: Iterable[_T] | AsyncIterable[_T],
|
||||
) -> MutableChain[_T] | MutableAsyncChain[_T]:
|
||||
recovered: MutableChain[_T] | MutableAsyncChain[_T]
|
||||
if isinstance(result, AsyncIterable):
|
||||
recovered = MutableAsyncChain()
|
||||
else:
|
||||
@ -293,7 +293,7 @@ class SpiderMiddlewareManager(MiddlewareManager):
|
||||
result = self._evaluate_iterable(response, spider, result, 0, recovered)
|
||||
result = await maybe_deferred_to_future(
|
||||
cast(
|
||||
"Deferred[Union[Iterable[_T], AsyncIterable[_T]]]",
|
||||
"Deferred[Iterable[_T] | AsyncIterable[_T]]",
|
||||
self._process_spider_output(response, spider, result),
|
||||
)
|
||||
)
|
||||
@ -310,22 +310,22 @@ class SpiderMiddlewareManager(MiddlewareManager):
|
||||
response: Response,
|
||||
request: Request,
|
||||
spider: Spider,
|
||||
) -> Deferred[Union[MutableChain[_T], MutableAsyncChain[_T]]]:
|
||||
) -> Deferred[MutableChain[_T] | MutableAsyncChain[_T]]:
|
||||
async def process_callback_output(
|
||||
result: Union[Iterable[_T], AsyncIterable[_T]]
|
||||
) -> Union[MutableChain[_T], MutableAsyncChain[_T]]:
|
||||
result: Iterable[_T] | AsyncIterable[_T],
|
||||
) -> MutableChain[_T] | MutableAsyncChain[_T]:
|
||||
return await self._process_callback_output(response, spider, result)
|
||||
|
||||
def process_spider_exception(
|
||||
_failure: Failure,
|
||||
) -> Union[Failure, MutableChain[_T], MutableAsyncChain[_T]]:
|
||||
) -> Failure | MutableChain[_T] | MutableAsyncChain[_T]:
|
||||
return self._process_spider_exception(response, spider, _failure)
|
||||
|
||||
dfd: Deferred[Union[Iterable[_T], AsyncIterable[_T]]] = mustbe_deferred(
|
||||
dfd: Deferred[Iterable[_T] | AsyncIterable[_T]] = mustbe_deferred(
|
||||
self._process_spider_input, scrape_func, response, request, spider
|
||||
)
|
||||
dfd2: Deferred[Union[MutableChain[_T], MutableAsyncChain[_T]]] = (
|
||||
dfd.addCallback(deferred_f_from_coro_f(process_callback_output))
|
||||
dfd2: Deferred[MutableChain[_T] | MutableAsyncChain[_T]] = dfd.addCallback(
|
||||
deferred_f_from_coro_f(process_callback_output)
|
||||
)
|
||||
dfd2.addErrback(process_spider_exception)
|
||||
return dfd2
|
||||
@ -339,10 +339,10 @@ class SpiderMiddlewareManager(MiddlewareManager):
|
||||
@staticmethod
|
||||
def _get_async_method_pair(
|
||||
mw: Any, methodname: str
|
||||
) -> Union[None, Callable, tuple[Callable, Callable]]:
|
||||
normal_method: Optional[Callable] = getattr(mw, methodname, None)
|
||||
) -> None | Callable | tuple[Callable, Callable]:
|
||||
normal_method: Callable | None = getattr(mw, methodname, None)
|
||||
methodname_async = methodname + "_async"
|
||||
async_method: Optional[Callable] = getattr(mw, methodname_async, None)
|
||||
async_method: Callable | None = getattr(mw, methodname_async, None)
|
||||
if not async_method:
|
||||
return normal_method
|
||||
if not normal_method:
|
||||
|
@ -4,7 +4,7 @@ import logging
|
||||
import pprint
|
||||
import signal
|
||||
import warnings
|
||||
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
|
||||
from typing import TYPE_CHECKING, Any, TypeVar, cast
|
||||
|
||||
from twisted.internet.defer import (
|
||||
Deferred,
|
||||
@ -57,7 +57,7 @@ class Crawler:
|
||||
def __init__(
|
||||
self,
|
||||
spidercls: type[Spider],
|
||||
settings: Union[None, dict[str, Any], Settings] = None,
|
||||
settings: None | dict[str, Any] | Settings = None,
|
||||
init_reactor: bool = False,
|
||||
):
|
||||
if isinstance(spidercls, Spider):
|
||||
@ -78,12 +78,12 @@ class Crawler:
|
||||
self.crawling: bool = False
|
||||
self._started: bool = False
|
||||
|
||||
self.extensions: Optional[ExtensionManager] = None
|
||||
self.stats: Optional[StatsCollector] = None
|
||||
self.logformatter: Optional[LogFormatter] = None
|
||||
self.request_fingerprinter: Optional[RequestFingerprinter] = None
|
||||
self.spider: Optional[Spider] = None
|
||||
self.engine: Optional[ExecutionEngine] = None
|
||||
self.extensions: ExtensionManager | None = None
|
||||
self.stats: StatsCollector | None = None
|
||||
self.logformatter: LogFormatter | None = None
|
||||
self.request_fingerprinter: RequestFingerprinter | None = None
|
||||
self.spider: Spider | None = None
|
||||
self.engine: ExecutionEngine | None = None
|
||||
|
||||
def _update_root_log_handler(self) -> None:
|
||||
if get_scrapy_root_handler() is not None:
|
||||
@ -181,16 +181,16 @@ class Crawler:
|
||||
@staticmethod
|
||||
def _get_component(
|
||||
component_class: type[_T], components: Iterable[Any]
|
||||
) -> Optional[_T]:
|
||||
) -> _T | None:
|
||||
for component in components:
|
||||
if isinstance(component, component_class):
|
||||
return component
|
||||
return None
|
||||
|
||||
def get_addon(self, cls: type[_T]) -> Optional[_T]:
|
||||
def get_addon(self, cls: type[_T]) -> _T | None:
|
||||
return self._get_component(cls, self.addons.addons)
|
||||
|
||||
def get_downloader_middleware(self, cls: type[_T]) -> Optional[_T]:
|
||||
def get_downloader_middleware(self, cls: type[_T]) -> _T | None:
|
||||
if not self.engine:
|
||||
raise RuntimeError(
|
||||
"Crawler.get_downloader_middleware() can only be called after "
|
||||
@ -198,7 +198,7 @@ class Crawler:
|
||||
)
|
||||
return self._get_component(cls, self.engine.downloader.middleware.middlewares)
|
||||
|
||||
def get_extension(self, cls: type[_T]) -> Optional[_T]:
|
||||
def get_extension(self, cls: type[_T]) -> _T | None:
|
||||
if not self.extensions:
|
||||
raise RuntimeError(
|
||||
"Crawler.get_extension() can only be called after the "
|
||||
@ -206,7 +206,7 @@ class Crawler:
|
||||
)
|
||||
return self._get_component(cls, self.extensions.middlewares)
|
||||
|
||||
def get_item_pipeline(self, cls: type[_T]) -> Optional[_T]:
|
||||
def get_item_pipeline(self, cls: type[_T]) -> _T | None:
|
||||
if not self.engine:
|
||||
raise RuntimeError(
|
||||
"Crawler.get_item_pipeline() can only be called after the "
|
||||
@ -214,7 +214,7 @@ class Crawler:
|
||||
)
|
||||
return self._get_component(cls, self.engine.scraper.itemproc.middlewares)
|
||||
|
||||
def get_spider_middleware(self, cls: type[_T]) -> Optional[_T]:
|
||||
def get_spider_middleware(self, cls: type[_T]) -> _T | None:
|
||||
if not self.engine:
|
||||
raise RuntimeError(
|
||||
"Crawler.get_spider_middleware() can only be called after the "
|
||||
@ -250,7 +250,7 @@ class CrawlerRunner:
|
||||
verifyClass(ISpiderLoader, loader_cls)
|
||||
return cast("SpiderLoader", loader_cls.from_settings(settings.frozencopy()))
|
||||
|
||||
def __init__(self, settings: Union[dict[str, Any], Settings, None] = None):
|
||||
def __init__(self, settings: dict[str, Any] | Settings | None = None):
|
||||
if isinstance(settings, dict) or settings is None:
|
||||
settings = Settings(settings)
|
||||
self.settings: Settings = settings
|
||||
@ -261,7 +261,7 @@ class CrawlerRunner:
|
||||
|
||||
def crawl(
|
||||
self,
|
||||
crawler_or_spidercls: Union[type[Spider], str, Crawler],
|
||||
crawler_or_spidercls: type[Spider] | str | Crawler,
|
||||
*args: Any,
|
||||
**kwargs: Any,
|
||||
) -> Deferred[None]:
|
||||
@ -308,7 +308,7 @@ class CrawlerRunner:
|
||||
return d.addBoth(_done)
|
||||
|
||||
def create_crawler(
|
||||
self, crawler_or_spidercls: Union[type[Spider], str, Crawler]
|
||||
self, crawler_or_spidercls: type[Spider] | str | Crawler
|
||||
) -> Crawler:
|
||||
"""
|
||||
Return a :class:`~scrapy.crawler.Crawler` object.
|
||||
@ -329,7 +329,7 @@ class CrawlerRunner:
|
||||
return crawler_or_spidercls
|
||||
return self._create_crawler(crawler_or_spidercls)
|
||||
|
||||
def _create_crawler(self, spidercls: Union[str, type[Spider]]) -> Crawler:
|
||||
def _create_crawler(self, spidercls: str | type[Spider]) -> Crawler:
|
||||
if isinstance(spidercls, str):
|
||||
spidercls = self.spider_loader.load(spidercls)
|
||||
return Crawler(spidercls, self.settings)
|
||||
@ -380,7 +380,7 @@ class CrawlerProcess(CrawlerRunner):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
settings: Union[dict[str, Any], Settings, None] = None,
|
||||
settings: dict[str, Any] | Settings | None = None,
|
||||
install_root_handler: bool = True,
|
||||
):
|
||||
super().__init__(settings)
|
||||
@ -409,7 +409,7 @@ class CrawlerProcess(CrawlerRunner):
|
||||
)
|
||||
reactor.callFromThread(self._stop_reactor)
|
||||
|
||||
def _create_crawler(self, spidercls: Union[type[Spider], str]) -> Crawler:
|
||||
def _create_crawler(self, spidercls: type[Spider] | str) -> Crawler:
|
||||
if isinstance(spidercls, str):
|
||||
spidercls = self.spider_loader.load(spidercls)
|
||||
init_reactor = not self._initialized_reactor
|
||||
|
@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import TYPE_CHECKING, Union
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from w3lib import html
|
||||
|
||||
@ -43,7 +43,7 @@ class AjaxCrawlMiddleware:
|
||||
|
||||
def process_response(
|
||||
self, request: Request, response: Response, spider: Spider
|
||||
) -> Union[Request, Response]:
|
||||
) -> Request | Response:
|
||||
if not isinstance(response, HtmlResponse) or response.status != 200:
|
||||
return response
|
||||
|
||||
|
@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from tldextract import TLDExtract
|
||||
|
||||
@ -70,7 +70,7 @@ class CookiesMiddleware:
|
||||
|
||||
def process_request(
|
||||
self, request: Request, spider: Spider
|
||||
) -> Union[Request, Response, None]:
|
||||
) -> Request | Response | None:
|
||||
if request.meta.get("dont_merge_cookies", False):
|
||||
return None
|
||||
|
||||
@ -87,7 +87,7 @@ class CookiesMiddleware:
|
||||
|
||||
def process_response(
|
||||
self, request: Request, response: Response, spider: Spider
|
||||
) -> Union[Request, Response]:
|
||||
) -> Request | Response:
|
||||
if request.meta.get("dont_merge_cookies", False):
|
||||
return response
|
||||
|
||||
@ -123,7 +123,7 @@ class CookiesMiddleware:
|
||||
msg = f"Received cookies from: {response}\n{cookies}"
|
||||
logger.debug(msg, extra={"spider": spider})
|
||||
|
||||
def _format_cookie(self, cookie: VerboseCookie, request: Request) -> Optional[str]:
|
||||
def _format_cookie(self, cookie: VerboseCookie, request: Request) -> str | None:
|
||||
"""
|
||||
Given a dict consisting of cookie components, return its string representation.
|
||||
Decode from bytes if necessary.
|
||||
|
@ -6,7 +6,7 @@ See documentation in docs/topics/downloader-middleware.rst
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Union
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from scrapy.utils.python import without_none_values
|
||||
|
||||
@ -32,7 +32,7 @@ class DefaultHeadersMiddleware:
|
||||
|
||||
def process_request(
|
||||
self, request: Request, spider: Spider
|
||||
) -> Union[Request, Response, None]:
|
||||
) -> Request | Response | None:
|
||||
for k, v in self._headers:
|
||||
request.headers.setdefault(k, v)
|
||||
return None
|
||||
|
@ -6,7 +6,7 @@ See documentation in docs/topics/downloader-middleware.rst
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Union
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from scrapy import Request, Spider, signals
|
||||
|
||||
@ -33,7 +33,7 @@ class DownloadTimeoutMiddleware:
|
||||
|
||||
def process_request(
|
||||
self, request: Request, spider: Spider
|
||||
) -> Union[Request, Response, None]:
|
||||
) -> Request | Response | None:
|
||||
if self._timeout:
|
||||
request.meta.setdefault("download_timeout", self._timeout)
|
||||
return None
|
||||
|
@ -6,7 +6,7 @@ See documentation in docs/topics/downloader-middleware.rst
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Union
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from w3lib.http import basic_auth_header
|
||||
|
||||
@ -40,7 +40,7 @@ class HttpAuthMiddleware:
|
||||
|
||||
def process_request(
|
||||
self, request: Request, spider: Spider
|
||||
) -> Union[Request, Response, None]:
|
||||
) -> Request | Response | None:
|
||||
auth = getattr(self, "auth", None)
|
||||
if auth and b"Authorization" not in request.headers:
|
||||
if not self.domain or url_is_from_any_domain(request.url, [self.domain]):
|
||||
|
@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from email.utils import formatdate
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from twisted.internet import defer
|
||||
from twisted.internet.error import (
|
||||
@ -69,7 +69,7 @@ class HttpCacheMiddleware:
|
||||
|
||||
def process_request(
|
||||
self, request: Request, spider: Spider
|
||||
) -> Union[Request, Response, None]:
|
||||
) -> Request | Response | None:
|
||||
if request.meta.get("dont_cache", False):
|
||||
return None
|
||||
|
||||
@ -79,7 +79,7 @@ class HttpCacheMiddleware:
|
||||
return None
|
||||
|
||||
# Look for cached response and check if expired
|
||||
cachedresponse: Optional[Response] = self.storage.retrieve_response(
|
||||
cachedresponse: Response | None = self.storage.retrieve_response(
|
||||
spider, request
|
||||
)
|
||||
if cachedresponse is None:
|
||||
@ -103,7 +103,7 @@ class HttpCacheMiddleware:
|
||||
|
||||
def process_response(
|
||||
self, request: Request, response: Response, spider: Spider
|
||||
) -> Union[Request, Response]:
|
||||
) -> Request | Response:
|
||||
if request.meta.get("dont_cache", False):
|
||||
return response
|
||||
|
||||
@ -118,7 +118,7 @@ class HttpCacheMiddleware:
|
||||
response.headers["Date"] = formatdate(usegmt=True)
|
||||
|
||||
# Do not validate first-hand responses
|
||||
cachedresponse: Optional[Response] = request.meta.pop("cached_response", None)
|
||||
cachedresponse: Response | None = request.meta.pop("cached_response", None)
|
||||
if cachedresponse is None:
|
||||
self.stats.inc_value("httpcache/firsthand", spider=spider)
|
||||
self._cache_response(spider, response, request, cachedresponse)
|
||||
@ -134,8 +134,8 @@ class HttpCacheMiddleware:
|
||||
|
||||
def process_exception(
|
||||
self, request: Request, exception: Exception, spider: Spider
|
||||
) -> Union[Request, Response, None]:
|
||||
cachedresponse: Optional[Response] = request.meta.pop("cached_response", None)
|
||||
) -> Request | Response | None:
|
||||
cachedresponse: Response | None = request.meta.pop("cached_response", None)
|
||||
if cachedresponse is not None and isinstance(
|
||||
exception, self.DOWNLOAD_EXCEPTIONS
|
||||
):
|
||||
@ -148,7 +148,7 @@ class HttpCacheMiddleware:
|
||||
spider: Spider,
|
||||
response: Response,
|
||||
request: Request,
|
||||
cachedresponse: Optional[Response],
|
||||
cachedresponse: Response | None,
|
||||
) -> None:
|
||||
if self.policy.should_cache_response(response, request):
|
||||
self.stats.inc_value("httpcache/store", spider=spider)
|
||||
|
@ -3,7 +3,7 @@ from __future__ import annotations
|
||||
import warnings
|
||||
from itertools import chain
|
||||
from logging import getLogger
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from scrapy import Request, Spider, signals
|
||||
from scrapy.exceptions import IgnoreRequest, NotConfigured
|
||||
@ -54,9 +54,9 @@ class HttpCompressionMiddleware:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
stats: Optional[StatsCollector] = None,
|
||||
stats: StatsCollector | None = None,
|
||||
*,
|
||||
crawler: Optional[Crawler] = None,
|
||||
crawler: Crawler | None = None,
|
||||
):
|
||||
if not crawler:
|
||||
self.stats = stats
|
||||
@ -96,13 +96,13 @@ class HttpCompressionMiddleware:
|
||||
|
||||
def process_request(
|
||||
self, request: Request, spider: Spider
|
||||
) -> Union[Request, Response, None]:
|
||||
) -> Request | Response | None:
|
||||
request.headers.setdefault("Accept-Encoding", b", ".join(ACCEPTED_ENCODINGS))
|
||||
return None
|
||||
|
||||
def process_response(
|
||||
self, request: Request, response: Response, spider: Spider
|
||||
) -> Union[Request, Response]:
|
||||
) -> Request | Response:
|
||||
if request.method == "HEAD":
|
||||
return response
|
||||
if isinstance(response, Response):
|
||||
|
@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
from typing import TYPE_CHECKING
|
||||
from urllib.parse import unquote, urlunparse
|
||||
from urllib.request import ( # type: ignore[attr-defined]
|
||||
_parse_proxy,
|
||||
@ -23,9 +23,9 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
class HttpProxyMiddleware:
|
||||
def __init__(self, auth_encoding: Optional[str] = "latin-1"):
|
||||
self.auth_encoding: Optional[str] = auth_encoding
|
||||
self.proxies: dict[str, tuple[Optional[bytes], str]] = {}
|
||||
def __init__(self, auth_encoding: str | None = "latin-1"):
|
||||
self.auth_encoding: str | None = auth_encoding
|
||||
self.proxies: dict[str, tuple[bytes | None, str]] = {}
|
||||
for type_, url in getproxies().items():
|
||||
try:
|
||||
self.proxies[type_] = self._get_proxy(url, type_)
|
||||
@ -38,7 +38,7 @@ class HttpProxyMiddleware:
|
||||
def from_crawler(cls, crawler: Crawler) -> Self:
|
||||
if not crawler.settings.getbool("HTTPPROXY_ENABLED"):
|
||||
raise NotConfigured
|
||||
auth_encoding: Optional[str] = crawler.settings.get("HTTPPROXY_AUTH_ENCODING")
|
||||
auth_encoding: str | None = crawler.settings.get("HTTPPROXY_AUTH_ENCODING")
|
||||
return cls(auth_encoding)
|
||||
|
||||
def _basic_auth_header(self, username: str, password: str) -> bytes:
|
||||
@ -47,7 +47,7 @@ class HttpProxyMiddleware:
|
||||
)
|
||||
return base64.b64encode(user_pass)
|
||||
|
||||
def _get_proxy(self, url: str, orig_type: str) -> tuple[Optional[bytes], str]:
|
||||
def _get_proxy(self, url: str, orig_type: str) -> tuple[bytes | None, str]:
|
||||
proxy_type, user, password, hostport = _parse_proxy(url)
|
||||
proxy_url = urlunparse((proxy_type or orig_type, hostport, "", "", "", ""))
|
||||
|
||||
@ -60,7 +60,7 @@ class HttpProxyMiddleware:
|
||||
|
||||
def process_request(
|
||||
self, request: Request, spider: Spider
|
||||
) -> Union[Request, Response, None]:
|
||||
) -> Request | Response | None:
|
||||
creds, proxy_url, scheme = None, None, None
|
||||
if "proxy" in request.meta:
|
||||
if request.meta["proxy"] is not None:
|
||||
@ -82,9 +82,9 @@ class HttpProxyMiddleware:
|
||||
def _set_proxy_and_creds(
|
||||
self,
|
||||
request: Request,
|
||||
proxy_url: Optional[str],
|
||||
creds: Optional[bytes],
|
||||
scheme: Optional[str],
|
||||
proxy_url: str | None,
|
||||
creds: bytes | None,
|
||||
scheme: str | None,
|
||||
) -> None:
|
||||
if scheme:
|
||||
request.meta["_scheme_proxy"] = True
|
||||
|
@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Any, Union, cast
|
||||
from typing import TYPE_CHECKING, Any, cast
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from w3lib.url import safe_url_string
|
||||
@ -144,7 +144,7 @@ class RedirectMiddleware(BaseRedirectMiddleware):
|
||||
|
||||
def process_response(
|
||||
self, request: Request, response: Response, spider: Spider
|
||||
) -> Union[Request, Response]:
|
||||
) -> Request | Response:
|
||||
if (
|
||||
request.meta.get("dont_redirect", False)
|
||||
or response.status in getattr(spider, "handle_httpstatus_list", [])
|
||||
@ -185,7 +185,7 @@ class MetaRefreshMiddleware(BaseRedirectMiddleware):
|
||||
|
||||
def process_response(
|
||||
self, request: Request, response: Response, spider: Spider
|
||||
) -> Union[Request, Response]:
|
||||
) -> Request | Response:
|
||||
if (
|
||||
request.meta.get("dont_redirect", False)
|
||||
or request.method == "HEAD"
|
||||
|
@ -14,7 +14,7 @@ from __future__ import annotations
|
||||
|
||||
import warnings
|
||||
from logging import Logger, getLogger
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
|
||||
from scrapy.settings import BaseSettings, Settings
|
||||
@ -60,12 +60,12 @@ def get_retry_request(
|
||||
request: Request,
|
||||
*,
|
||||
spider: Spider,
|
||||
reason: Union[str, Exception, type[Exception]] = "unspecified",
|
||||
max_retry_times: Optional[int] = None,
|
||||
priority_adjust: Optional[int] = None,
|
||||
reason: str | Exception | type[Exception] = "unspecified",
|
||||
max_retry_times: int | None = None,
|
||||
priority_adjust: int | None = None,
|
||||
logger: Logger = retry_logger,
|
||||
stats_base_key: str = "retry",
|
||||
) -> Optional[Request]:
|
||||
) -> Request | None:
|
||||
"""
|
||||
Returns a new :class:`~scrapy.Request` object to retry the specified
|
||||
request, or ``None`` if retries of the specified request have been
|
||||
@ -167,7 +167,7 @@ class RetryMiddleware(metaclass=BackwardsCompatibilityMetaclass):
|
||||
|
||||
def process_response(
|
||||
self, request: Request, response: Response, spider: Spider
|
||||
) -> Union[Request, Response]:
|
||||
) -> Request | Response:
|
||||
if request.meta.get("dont_retry", False):
|
||||
return response
|
||||
if response.status in self.retry_http_codes:
|
||||
@ -177,7 +177,7 @@ class RetryMiddleware(metaclass=BackwardsCompatibilityMetaclass):
|
||||
|
||||
def process_exception(
|
||||
self, request: Request, exception: Exception, spider: Spider
|
||||
) -> Union[Request, Response, None]:
|
||||
) -> Request | Response | None:
|
||||
if isinstance(exception, self.exceptions_to_retry) and not request.meta.get(
|
||||
"dont_retry", False
|
||||
):
|
||||
@ -187,9 +187,9 @@ class RetryMiddleware(metaclass=BackwardsCompatibilityMetaclass):
|
||||
def _retry(
|
||||
self,
|
||||
request: Request,
|
||||
reason: Union[str, Exception, type[Exception]],
|
||||
reason: str | Exception | type[Exception],
|
||||
spider: Spider,
|
||||
) -> Optional[Request]:
|
||||
) -> Request | None:
|
||||
max_retry_times = request.meta.get("max_retry_times", self.max_retry_times)
|
||||
priority_adjust = request.meta.get("priority_adjust", self.priority_adjust)
|
||||
return get_retry_request(
|
||||
|
@ -7,7 +7,7 @@ enable this middleware and enable the ROBOTSTXT_OBEY setting.
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Optional, TypeVar, Union
|
||||
from typing import TYPE_CHECKING, TypeVar
|
||||
|
||||
from twisted.internet.defer import Deferred, maybeDeferred
|
||||
|
||||
@ -41,13 +41,11 @@ class RobotsTxtMiddleware:
|
||||
if not crawler.settings.getbool("ROBOTSTXT_OBEY"):
|
||||
raise NotConfigured
|
||||
self._default_useragent: str = crawler.settings.get("USER_AGENT", "Scrapy")
|
||||
self._robotstxt_useragent: Optional[str] = crawler.settings.get(
|
||||
self._robotstxt_useragent: str | None = crawler.settings.get(
|
||||
"ROBOTSTXT_USER_AGENT", None
|
||||
)
|
||||
self.crawler: Crawler = crawler
|
||||
self._parsers: dict[
|
||||
str, Union[RobotParser, Deferred[Optional[RobotParser]], None]
|
||||
] = {}
|
||||
self._parsers: dict[str, RobotParser | Deferred[RobotParser | None] | None] = {}
|
||||
self._parserimpl: RobotParser = load_object(
|
||||
crawler.settings.get("ROBOTSTXT_PARSER")
|
||||
)
|
||||
@ -61,24 +59,24 @@ class RobotsTxtMiddleware:
|
||||
|
||||
def process_request(
|
||||
self, request: Request, spider: Spider
|
||||
) -> Optional[Deferred[None]]:
|
||||
) -> Deferred[None] | None:
|
||||
if request.meta.get("dont_obey_robotstxt"):
|
||||
return None
|
||||
if request.url.startswith("data:") or request.url.startswith("file:"):
|
||||
return None
|
||||
d: Deferred[Optional[RobotParser]] = maybeDeferred(
|
||||
d: Deferred[RobotParser | None] = maybeDeferred(
|
||||
self.robot_parser, request, spider # type: ignore[call-overload]
|
||||
)
|
||||
d2: Deferred[None] = d.addCallback(self.process_request_2, request, spider)
|
||||
return d2
|
||||
|
||||
def process_request_2(
|
||||
self, rp: Optional[RobotParser], request: Request, spider: Spider
|
||||
self, rp: RobotParser | None, request: Request, spider: Spider
|
||||
) -> None:
|
||||
if rp is None:
|
||||
return
|
||||
|
||||
useragent: Union[str, bytes, None] = self._robotstxt_useragent
|
||||
useragent: str | bytes | None = self._robotstxt_useragent
|
||||
if not useragent:
|
||||
useragent = request.headers.get(b"User-Agent", self._default_useragent)
|
||||
assert useragent is not None
|
||||
@ -94,7 +92,7 @@ class RobotsTxtMiddleware:
|
||||
|
||||
def robot_parser(
|
||||
self, request: Request, spider: Spider
|
||||
) -> Union[RobotParser, Deferred[Optional[RobotParser]], None]:
|
||||
) -> RobotParser | Deferred[RobotParser | None] | None:
|
||||
url = urlparse_cached(request)
|
||||
netloc = url.netloc
|
||||
|
||||
@ -117,9 +115,9 @@ class RobotsTxtMiddleware:
|
||||
|
||||
parser = self._parsers[netloc]
|
||||
if isinstance(parser, Deferred):
|
||||
d: Deferred[Optional[RobotParser]] = Deferred()
|
||||
d: Deferred[RobotParser | None] = Deferred()
|
||||
|
||||
def cb(result: Optional[RobotParser]) -> Optional[RobotParser]:
|
||||
def cb(result: RobotParser | None) -> RobotParser | None:
|
||||
d.callback(result)
|
||||
return result
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Union
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from twisted.web import http
|
||||
|
||||
@ -19,7 +19,7 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
def get_header_size(
|
||||
headers: dict[str, Union[list[Union[str, bytes]], tuple[Union[str, bytes], ...]]]
|
||||
headers: dict[str, list[str | bytes] | tuple[str | bytes, ...]]
|
||||
) -> int:
|
||||
size = 0
|
||||
for key, value in headers.items():
|
||||
@ -47,7 +47,7 @@ class DownloaderStats:
|
||||
|
||||
def process_request(
|
||||
self, request: Request, spider: Spider
|
||||
) -> Union[Request, Response, None]:
|
||||
) -> Request | Response | None:
|
||||
self.stats.inc_value("downloader/request_count", spider=spider)
|
||||
self.stats.inc_value(
|
||||
f"downloader/request_method_count/{request.method}", spider=spider
|
||||
@ -58,7 +58,7 @@ class DownloaderStats:
|
||||
|
||||
def process_response(
|
||||
self, request: Request, response: Response, spider: Spider
|
||||
) -> Union[Request, Response]:
|
||||
) -> Request | Response:
|
||||
self.stats.inc_value("downloader/response_count", spider=spider)
|
||||
self.stats.inc_value(
|
||||
f"downloader/response_status_count/{response.status}", spider=spider
|
||||
@ -75,7 +75,7 @@ class DownloaderStats:
|
||||
|
||||
def process_exception(
|
||||
self, request: Request, exception: Exception, spider: Spider
|
||||
) -> Union[Request, Response, None]:
|
||||
) -> Request | Response | None:
|
||||
ex_class = global_object_name(exception.__class__)
|
||||
self.stats.inc_value("downloader/exception_count", spider=spider)
|
||||
self.stats.inc_value(
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Union
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from scrapy import Request, Spider, signals
|
||||
|
||||
@ -31,7 +31,7 @@ class UserAgentMiddleware:
|
||||
|
||||
def process_request(
|
||||
self, request: Request, spider: Spider
|
||||
) -> Union[Request, Response, None]:
|
||||
) -> Request | Response | None:
|
||||
if self.user_agent:
|
||||
request.headers.setdefault(b"User-Agent", self.user_agent)
|
||||
return None
|
||||
|
@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from scrapy.utils.job import job_dir
|
||||
from scrapy.utils.request import (
|
||||
@ -31,10 +31,10 @@ class BaseDupeFilter:
|
||||
def request_seen(self, request: Request) -> bool:
|
||||
return False
|
||||
|
||||
def open(self) -> Optional[Deferred[None]]:
|
||||
def open(self) -> Deferred[None] | None:
|
||||
pass
|
||||
|
||||
def close(self, reason: str) -> Optional[Deferred[None]]:
|
||||
def close(self, reason: str) -> Deferred[None] | None:
|
||||
pass
|
||||
|
||||
def log(self, request: Request, spider: Spider) -> None:
|
||||
@ -47,10 +47,10 @@ class RFPDupeFilter(BaseDupeFilter):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path: Optional[str] = None,
|
||||
path: str | None = None,
|
||||
debug: bool = False,
|
||||
*,
|
||||
fingerprinter: Optional[RequestFingerprinterProtocol] = None,
|
||||
fingerprinter: RequestFingerprinterProtocol | None = None,
|
||||
) -> None:
|
||||
self.file = None
|
||||
self.fingerprinter: RequestFingerprinterProtocol = (
|
||||
@ -70,7 +70,7 @@ class RFPDupeFilter(BaseDupeFilter):
|
||||
cls,
|
||||
settings: BaseSettings,
|
||||
*,
|
||||
fingerprinter: Optional[RequestFingerprinterProtocol] = None,
|
||||
fingerprinter: RequestFingerprinterProtocol | None = None,
|
||||
) -> Self:
|
||||
debug = settings.getbool("DUPEFILTER_DEBUG")
|
||||
return cls(job_dir(settings), debug, fingerprinter=fingerprinter)
|
||||
|
@ -2,6 +2,8 @@
|
||||
Item Exporters are used to export/serialize items into different formats.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import marshal
|
||||
import pickle # nosec
|
||||
@ -9,7 +11,7 @@ import pprint
|
||||
from collections.abc import Callable, Iterable, Mapping
|
||||
from io import BytesIO, TextIOWrapper
|
||||
from json import JSONEncoder
|
||||
from typing import Any, Optional, Union
|
||||
from typing import Any
|
||||
from xml.sax.saxutils import XMLGenerator # nosec
|
||||
from xml.sax.xmlreader import AttributesImpl # nosec
|
||||
|
||||
@ -41,12 +43,12 @@ class BaseItemExporter:
|
||||
If dont_fail is set, it won't raise an exception on unexpected options
|
||||
(useful for using with keyword arguments in subclasses ``__init__`` methods)
|
||||
"""
|
||||
self.encoding: Optional[str] = options.pop("encoding", None)
|
||||
self.fields_to_export: Union[Mapping[str, str], Iterable[str], None] = (
|
||||
options.pop("fields_to_export", None)
|
||||
self.encoding: str | None = options.pop("encoding", None)
|
||||
self.fields_to_export: Mapping[str, str] | Iterable[str] | None = options.pop(
|
||||
"fields_to_export", None
|
||||
)
|
||||
self.export_empty_fields: bool = options.pop("export_empty_fields", False)
|
||||
self.indent: Optional[int] = options.pop("indent", None)
|
||||
self.indent: int | None = options.pop("indent", None)
|
||||
if not dont_fail and options:
|
||||
raise TypeError(f"Unexpected options: {', '.join(options.keys())}")
|
||||
|
||||
@ -54,7 +56,7 @@ class BaseItemExporter:
|
||||
raise NotImplementedError
|
||||
|
||||
def serialize_field(
|
||||
self, field: Union[Mapping[str, Any], Field], name: str, value: Any
|
||||
self, field: Mapping[str, Any] | Field, name: str, value: Any
|
||||
) -> Any:
|
||||
serializer: Callable[[Any], Any] = field.get("serializer", lambda x: x)
|
||||
return serializer(value)
|
||||
@ -66,7 +68,7 @@ class BaseItemExporter:
|
||||
pass
|
||||
|
||||
def _get_serialized_fields(
|
||||
self, item: Any, default_value: Any = None, include_empty: Optional[bool] = None
|
||||
self, item: Any, default_value: Any = None, include_empty: bool | None = None
|
||||
) -> Iterable[tuple[str, Any]]:
|
||||
"""Return the fields to export as an iterable of tuples
|
||||
(name, serialized_value)
|
||||
@ -225,7 +227,7 @@ class CsvItemExporter(BaseItemExporter):
|
||||
file: BytesIO,
|
||||
include_headers_line: bool = True,
|
||||
join_multivalued: str = ",",
|
||||
errors: Optional[str] = None,
|
||||
errors: str | None = None,
|
||||
**kwargs: Any,
|
||||
):
|
||||
super().__init__(dont_fail=True, **kwargs)
|
||||
@ -245,7 +247,7 @@ class CsvItemExporter(BaseItemExporter):
|
||||
self._join_multivalued = join_multivalued
|
||||
|
||||
def serialize_field(
|
||||
self, field: Union[Mapping[str, Any], Field], name: str, value: Any
|
||||
self, field: Mapping[str, Any] | Field, name: str, value: Any
|
||||
) -> Any:
|
||||
serializer: Callable[[Any], Any] = field.get("serializer", self._join_if_needed)
|
||||
return serializer(value)
|
||||
@ -346,7 +348,7 @@ class PythonItemExporter(BaseItemExporter):
|
||||
self.encoding = "utf-8"
|
||||
|
||||
def serialize_field(
|
||||
self, field: Union[Mapping[str, Any], Field], name: str, value: Any
|
||||
self, field: Mapping[str, Any] | Field, name: str, value: Any
|
||||
) -> Any:
|
||||
serializer: Callable[[Any], Any] = field.get(
|
||||
"serializer", self._serialize_value
|
||||
@ -364,10 +366,10 @@ class PythonItemExporter(BaseItemExporter):
|
||||
return to_unicode(value, encoding=self.encoding)
|
||||
return value
|
||||
|
||||
def _serialize_item(self, item: Any) -> Iterable[tuple[Union[str, bytes], Any]]:
|
||||
def _serialize_item(self, item: Any) -> Iterable[tuple[str | bytes, Any]]:
|
||||
for key, value in ItemAdapter(item).items():
|
||||
yield key, self._serialize_value(value)
|
||||
|
||||
def export_item(self, item: Any) -> dict[Union[str, bytes], Any]: # type: ignore[override]
|
||||
result: dict[Union[str, bytes], Any] = dict(self._get_serialized_fields(item))
|
||||
def export_item(self, item: Any) -> dict[str | bytes, Any]: # type: ignore[override]
|
||||
result: dict[str | bytes, Any] = dict(self._get_serialized_fields(item))
|
||||
return result
|
||||
|
@ -5,7 +5,7 @@ Extension for collecting core stats like items scraped and start/finish times
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from scrapy import Spider, signals
|
||||
|
||||
@ -20,7 +20,7 @@ if TYPE_CHECKING:
|
||||
class CoreStats:
|
||||
def __init__(self, stats: StatsCollector):
|
||||
self.stats: StatsCollector = stats
|
||||
self.start_time: Optional[datetime] = None
|
||||
self.start_time: datetime | None = None
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler: Crawler) -> Self:
|
||||
|
@ -12,7 +12,7 @@ import sys
|
||||
import threading
|
||||
import traceback
|
||||
from pdb import Pdb
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from scrapy.utils.engine import format_engine_status
|
||||
from scrapy.utils.trackref import format_live_refs
|
||||
@ -43,7 +43,7 @@ class StackTraceDump:
|
||||
def from_crawler(cls, crawler: Crawler) -> Self:
|
||||
return cls(crawler)
|
||||
|
||||
def dump_stacktrace(self, signum: int, frame: Optional[FrameType]) -> None:
|
||||
def dump_stacktrace(self, signum: int, frame: FrameType | None) -> None:
|
||||
assert self.crawler.engine
|
||||
log_args = {
|
||||
"stackdumps": self._thread_stacks(),
|
||||
@ -75,6 +75,6 @@ class Debugger:
|
||||
# win32 platforms don't support SIGUSR signals
|
||||
pass
|
||||
|
||||
def _enter_debugger(self, signum: int, frame: Optional[FrameType]) -> None:
|
||||
def _enter_debugger(self, signum: int, frame: FrameType | None) -> None:
|
||||
assert frame
|
||||
Pdb().set_trace(frame.f_back) # noqa: T100
|
||||
|
@ -14,7 +14,7 @@ from collections.abc import Callable
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path, PureWindowsPath
|
||||
from tempfile import NamedTemporaryFile
|
||||
from typing import IO, TYPE_CHECKING, Any, Optional, Protocol, TypeVar, Union, cast
|
||||
from typing import IO, TYPE_CHECKING, Any, Optional, Protocol, TypeVar, cast
|
||||
from urllib.parse import unquote, urlparse
|
||||
|
||||
from twisted.internet.defer import Deferred, DeferredList, maybeDeferred
|
||||
@ -67,7 +67,7 @@ def build_storage(
|
||||
builder: Callable[..., _StorageT],
|
||||
uri: str,
|
||||
*args: Any,
|
||||
feed_options: Optional[dict[str, Any]] = None,
|
||||
feed_options: dict[str, Any] | None = None,
|
||||
preargs: Iterable[Any] = (),
|
||||
**kwargs: Any,
|
||||
) -> _StorageT:
|
||||
@ -84,10 +84,10 @@ class ItemFilter:
|
||||
:type feed_options: dict
|
||||
"""
|
||||
|
||||
feed_options: Optional[dict[str, Any]]
|
||||
feed_options: dict[str, Any] | None
|
||||
item_classes: tuple[type, ...]
|
||||
|
||||
def __init__(self, feed_options: Optional[dict[str, Any]]) -> None:
|
||||
def __init__(self, feed_options: dict[str, Any] | None) -> None:
|
||||
self.feed_options = feed_options
|
||||
if feed_options is not None:
|
||||
self.item_classes = tuple(
|
||||
@ -129,7 +129,7 @@ class IFeedStorage(Interface):
|
||||
class FeedStorageProtocol(Protocol):
|
||||
"""Reimplementation of ``IFeedStorage`` that can be used in type hints."""
|
||||
|
||||
def __init__(self, uri: str, *, feed_options: Optional[dict[str, Any]] = None):
|
||||
def __init__(self, uri: str, *, feed_options: dict[str, Any] | None = None):
|
||||
"""Initialize the storage with the parameters given in the URI and the
|
||||
feed-specific options (see :setting:`FEEDS`)"""
|
||||
|
||||
@ -137,7 +137,7 @@ class FeedStorageProtocol(Protocol):
|
||||
"""Open the storage for the given spider. It must return a file-like
|
||||
object that will be used for the exporters"""
|
||||
|
||||
def store(self, file: IO[bytes]) -> Optional[Deferred[None]]:
|
||||
def store(self, file: IO[bytes]) -> Deferred[None] | None:
|
||||
"""Store the given file stream"""
|
||||
|
||||
|
||||
@ -150,7 +150,7 @@ class BlockingFeedStorage:
|
||||
|
||||
return NamedTemporaryFile(prefix="feed-", dir=path)
|
||||
|
||||
def store(self, file: IO[bytes]) -> Optional[Deferred[None]]:
|
||||
def store(self, file: IO[bytes]) -> Deferred[None] | None:
|
||||
return deferToThread(self._store_in_thread, file)
|
||||
|
||||
def _store_in_thread(self, file: IO[bytes]) -> None:
|
||||
@ -162,9 +162,9 @@ class StdoutFeedStorage:
|
||||
def __init__(
|
||||
self,
|
||||
uri: str,
|
||||
_stdout: Optional[IO[bytes]] = None,
|
||||
_stdout: IO[bytes] | None = None,
|
||||
*,
|
||||
feed_options: Optional[dict[str, Any]] = None,
|
||||
feed_options: dict[str, Any] | None = None,
|
||||
):
|
||||
if not _stdout:
|
||||
_stdout = sys.stdout.buffer
|
||||
@ -180,13 +180,13 @@ class StdoutFeedStorage:
|
||||
def open(self, spider: Spider) -> IO[bytes]:
|
||||
return self._stdout
|
||||
|
||||
def store(self, file: IO[bytes]) -> Optional[Deferred[None]]:
|
||||
def store(self, file: IO[bytes]) -> Deferred[None] | None:
|
||||
pass
|
||||
|
||||
|
||||
@implementer(IFeedStorage)
|
||||
class FileFeedStorage:
|
||||
def __init__(self, uri: str, *, feed_options: Optional[dict[str, Any]] = None):
|
||||
def __init__(self, uri: str, *, feed_options: dict[str, Any] | None = None):
|
||||
self.path: str = file_uri_to_path(uri)
|
||||
feed_options = feed_options or {}
|
||||
self.write_mode: OpenBinaryMode = (
|
||||
@ -199,7 +199,7 @@ class FileFeedStorage:
|
||||
dirname.mkdir(parents=True)
|
||||
return Path(self.path).open(self.write_mode)
|
||||
|
||||
def store(self, file: IO[bytes]) -> Optional[Deferred[None]]:
|
||||
def store(self, file: IO[bytes]) -> Deferred[None] | None:
|
||||
file.close()
|
||||
return None
|
||||
|
||||
@ -208,27 +208,27 @@ class S3FeedStorage(BlockingFeedStorage):
|
||||
def __init__(
|
||||
self,
|
||||
uri: str,
|
||||
access_key: Optional[str] = None,
|
||||
secret_key: Optional[str] = None,
|
||||
acl: Optional[str] = None,
|
||||
endpoint_url: Optional[str] = None,
|
||||
access_key: str | None = None,
|
||||
secret_key: str | None = None,
|
||||
acl: str | None = None,
|
||||
endpoint_url: str | None = None,
|
||||
*,
|
||||
feed_options: Optional[dict[str, Any]] = None,
|
||||
session_token: Optional[str] = None,
|
||||
region_name: Optional[str] = None,
|
||||
feed_options: dict[str, Any] | None = None,
|
||||
session_token: str | None = None,
|
||||
region_name: str | None = None,
|
||||
):
|
||||
if not is_botocore_available():
|
||||
raise NotConfigured("missing botocore library")
|
||||
u = urlparse(uri)
|
||||
assert u.hostname
|
||||
self.bucketname: str = u.hostname
|
||||
self.access_key: Optional[str] = u.username or access_key
|
||||
self.secret_key: Optional[str] = u.password or secret_key
|
||||
self.session_token: Optional[str] = session_token
|
||||
self.access_key: str | None = u.username or access_key
|
||||
self.secret_key: str | None = u.password or secret_key
|
||||
self.session_token: str | None = session_token
|
||||
self.keyname: str = u.path[1:] # remove first "/"
|
||||
self.acl: Optional[str] = acl
|
||||
self.endpoint_url: Optional[str] = endpoint_url
|
||||
self.region_name: Optional[str] = region_name
|
||||
self.acl: str | None = acl
|
||||
self.endpoint_url: str | None = endpoint_url
|
||||
self.region_name: str | None = region_name
|
||||
# It can be either botocore.client.BaseClient or mypy_boto3_s3.S3Client,
|
||||
# there seems to be no good way to infer it statically.
|
||||
self.s3_client: Any
|
||||
@ -279,7 +279,7 @@ class S3FeedStorage(BlockingFeedStorage):
|
||||
crawler: Crawler,
|
||||
uri: str,
|
||||
*,
|
||||
feed_options: Optional[dict[str, Any]] = None,
|
||||
feed_options: dict[str, Any] | None = None,
|
||||
) -> Self:
|
||||
return build_storage(
|
||||
cls,
|
||||
@ -310,9 +310,9 @@ class S3FeedStorage(BlockingFeedStorage):
|
||||
|
||||
|
||||
class GCSFeedStorage(BlockingFeedStorage):
|
||||
def __init__(self, uri: str, project_id: Optional[str], acl: Optional[str]):
|
||||
self.project_id: Optional[str] = project_id
|
||||
self.acl: Optional[str] = acl
|
||||
def __init__(self, uri: str, project_id: str | None, acl: str | None):
|
||||
self.project_id: str | None = project_id
|
||||
self.acl: str | None = acl
|
||||
u = urlparse(uri)
|
||||
assert u.hostname
|
||||
self.bucket_name: str = u.hostname
|
||||
@ -342,7 +342,7 @@ class FTPFeedStorage(BlockingFeedStorage):
|
||||
uri: str,
|
||||
use_active_mode: bool = False,
|
||||
*,
|
||||
feed_options: Optional[dict[str, Any]] = None,
|
||||
feed_options: dict[str, Any] | None = None,
|
||||
):
|
||||
u = urlparse(uri)
|
||||
if not u.hostname:
|
||||
@ -361,7 +361,7 @@ class FTPFeedStorage(BlockingFeedStorage):
|
||||
crawler: Crawler,
|
||||
uri: str,
|
||||
*,
|
||||
feed_options: Optional[dict[str, Any]] = None,
|
||||
feed_options: dict[str, Any] | None = None,
|
||||
) -> Self:
|
||||
return build_storage(
|
||||
cls,
|
||||
@ -399,8 +399,8 @@ class FeedSlot:
|
||||
settings: BaseSettings,
|
||||
crawler: Crawler,
|
||||
):
|
||||
self.file: Optional[IO[bytes]] = None
|
||||
self.exporter: Optional[BaseItemExporter] = None
|
||||
self.file: IO[bytes] | None = None
|
||||
self.exporter: BaseItemExporter | None = None
|
||||
self.storage: FeedStorageProtocol = storage
|
||||
# feed params
|
||||
self.batch_id: int = batch_id
|
||||
@ -558,7 +558,7 @@ class FeedExporter:
|
||||
self.crawler.signals.send_catch_log_deferred(signals.feed_exporter_closed)
|
||||
)
|
||||
|
||||
def _close_slot(self, slot: FeedSlot, spider: Spider) -> Optional[Deferred[None]]:
|
||||
def _close_slot(self, slot: FeedSlot, spider: Spider) -> Deferred[None] | None:
|
||||
def get_file(slot_: FeedSlot) -> IO[bytes]:
|
||||
assert slot_.file
|
||||
if isinstance(slot_.file, PostProcessingManager):
|
||||
@ -770,8 +770,8 @@ class FeedExporter:
|
||||
def _get_uri_params(
|
||||
self,
|
||||
spider: Spider,
|
||||
uri_params_function: Union[str, UriParamsCallableT, None],
|
||||
slot: Optional[FeedSlot] = None,
|
||||
uri_params_function: str | UriParamsCallableT | None,
|
||||
slot: FeedSlot | None = None,
|
||||
) -> dict[str, Any]:
|
||||
params = {}
|
||||
for k in dir(spider):
|
||||
|
@ -9,7 +9,7 @@ from importlib import import_module
|
||||
from pathlib import Path
|
||||
from time import time
|
||||
from types import ModuleType
|
||||
from typing import IO, TYPE_CHECKING, Any, Optional, Union, cast
|
||||
from typing import IO, TYPE_CHECKING, Any, cast
|
||||
from weakref import WeakKeyDictionary
|
||||
|
||||
from w3lib.http import headers_dict_to_raw, headers_raw_to_dict
|
||||
@ -66,16 +66,14 @@ class RFC2616Policy:
|
||||
self.always_store: bool = settings.getbool("HTTPCACHE_ALWAYS_STORE")
|
||||
self.ignore_schemes: list[str] = settings.getlist("HTTPCACHE_IGNORE_SCHEMES")
|
||||
self._cc_parsed: WeakKeyDictionary[
|
||||
Union[Request, Response], dict[bytes, Optional[bytes]]
|
||||
Request | Response, dict[bytes, bytes | None]
|
||||
] = WeakKeyDictionary()
|
||||
self.ignore_response_cache_controls: list[bytes] = [
|
||||
to_bytes(cc)
|
||||
for cc in settings.getlist("HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS")
|
||||
]
|
||||
|
||||
def _parse_cachecontrol(
|
||||
self, r: Union[Request, Response]
|
||||
) -> dict[bytes, Optional[bytes]]:
|
||||
def _parse_cachecontrol(self, r: Request | Response) -> dict[bytes, bytes | None]:
|
||||
if r not in self._cc_parsed:
|
||||
cch = r.headers.get(b"Cache-Control", b"")
|
||||
assert cch is not None
|
||||
@ -191,7 +189,7 @@ class RFC2616Policy:
|
||||
if b"ETag" in cachedresponse.headers:
|
||||
request.headers[b"If-None-Match"] = cachedresponse.headers[b"ETag"]
|
||||
|
||||
def _get_max_age(self, cc: dict[bytes, Optional[bytes]]) -> Optional[int]:
|
||||
def _get_max_age(self, cc: dict[bytes, bytes | None]) -> int | None:
|
||||
try:
|
||||
return max(0, int(cc[b"max-age"])) # type: ignore[arg-type]
|
||||
except (KeyError, ValueError):
|
||||
@ -275,7 +273,7 @@ class DbmCacheStorage:
|
||||
def close_spider(self, spider: Spider) -> None:
|
||||
self.db.close()
|
||||
|
||||
def retrieve_response(self, spider: Spider, request: Request) -> Optional[Response]:
|
||||
def retrieve_response(self, spider: Spider, request: Request) -> Response | None:
|
||||
data = self._read_data(spider, request)
|
||||
if data is None:
|
||||
return None # not cached
|
||||
@ -300,7 +298,7 @@ class DbmCacheStorage:
|
||||
self.db[f"{key}_data"] = pickle.dumps(data, protocol=4)
|
||||
self.db[f"{key}_time"] = str(time())
|
||||
|
||||
def _read_data(self, spider: Spider, request: Request) -> Optional[dict[str, Any]]:
|
||||
def _read_data(self, spider: Spider, request: Request) -> dict[str, Any] | None:
|
||||
key = self._fingerprinter.fingerprint(request).hex()
|
||||
db = self.db
|
||||
tkey = f"{key}_time"
|
||||
@ -320,9 +318,7 @@ class FilesystemCacheStorage:
|
||||
self.expiration_secs: int = settings.getint("HTTPCACHE_EXPIRATION_SECS")
|
||||
self.use_gzip: bool = settings.getbool("HTTPCACHE_GZIP")
|
||||
# https://github.com/python/mypy/issues/10740
|
||||
self._open: Callable[
|
||||
Concatenate[Union[str, os.PathLike], str, ...], IO[bytes]
|
||||
] = (
|
||||
self._open: Callable[Concatenate[str | os.PathLike, str, ...], IO[bytes]] = (
|
||||
gzip.open if self.use_gzip else open # type: ignore[assignment]
|
||||
)
|
||||
|
||||
@ -339,7 +335,7 @@ class FilesystemCacheStorage:
|
||||
def close_spider(self, spider: Spider) -> None:
|
||||
pass
|
||||
|
||||
def retrieve_response(self, spider: Spider, request: Request) -> Optional[Response]:
|
||||
def retrieve_response(self, spider: Spider, request: Request) -> Response | None:
|
||||
"""Return response if present in cache, or None otherwise."""
|
||||
metadata = self._read_meta(spider, request)
|
||||
if metadata is None:
|
||||
@ -387,7 +383,7 @@ class FilesystemCacheStorage:
|
||||
key = self._fingerprinter.fingerprint(request).hex()
|
||||
return str(Path(self.cachedir, spider.name, key[0:2], key))
|
||||
|
||||
def _read_meta(self, spider: Spider, request: Request) -> Optional[dict[str, Any]]:
|
||||
def _read_meta(self, spider: Spider, request: Request) -> dict[str, Any] | None:
|
||||
rpath = Path(self._get_request_path(spider, request))
|
||||
metapath = rpath / "pickled_meta"
|
||||
if not metapath.exists():
|
||||
@ -399,7 +395,7 @@ class FilesystemCacheStorage:
|
||||
return cast(dict[str, Any], pickle.load(f)) # nosec
|
||||
|
||||
|
||||
def parse_cachecontrol(header: bytes) -> dict[bytes, Optional[bytes]]:
|
||||
def parse_cachecontrol(header: bytes) -> dict[bytes, bytes | None]:
|
||||
"""Parse Cache-Control header
|
||||
|
||||
https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9
|
||||
@ -419,7 +415,7 @@ def parse_cachecontrol(header: bytes) -> dict[bytes, Optional[bytes]]:
|
||||
return directives
|
||||
|
||||
|
||||
def rfc1123_to_epoch(date_str: Union[str, bytes, None]) -> Optional[int]:
|
||||
def rfc1123_to_epoch(date_str: str | bytes | None) -> int | None:
|
||||
try:
|
||||
date_str = to_unicode(date_str, encoding="ascii") # type: ignore[arg-type]
|
||||
return mktime_tz(parsedate_tz(date_str)) # type: ignore[arg-type]
|
||||
|
@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from twisted.internet import task
|
||||
|
||||
@ -29,7 +29,7 @@ class LogStats:
|
||||
self.stats: StatsCollector = stats
|
||||
self.interval: float = interval
|
||||
self.multiplier: float = 60.0 / self.interval
|
||||
self.task: Optional[task.LoopingCall] = None
|
||||
self.task: task.LoopingCall | None = None
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler: Crawler) -> Self:
|
||||
@ -81,7 +81,7 @@ class LogStats:
|
||||
|
||||
def calculate_final_stats(
|
||||
self, spider: Spider
|
||||
) -> Union[tuple[None, None], tuple[float, float]]:
|
||||
) -> tuple[None, None] | tuple[float, float]:
|
||||
start_time = self.stats.get_value("start_time")
|
||||
finished_time = self.stats.get_value("finished_time")
|
||||
|
||||
|
@ -3,7 +3,7 @@ from __future__ import annotations
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from json import JSONEncoder
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from twisted.internet import task
|
||||
|
||||
@ -36,7 +36,7 @@ class PeriodicLog:
|
||||
self.stats: StatsCollector = stats
|
||||
self.interval: float = interval
|
||||
self.multiplier: float = 60.0 / self.interval
|
||||
self.task: Optional[task.LoopingCall] = None
|
||||
self.task: task.LoopingCall | None = None
|
||||
self.encoder: JSONEncoder = ScrapyJSONEncoder(sort_keys=True, indent=4)
|
||||
self.ext_stats_enabled: bool = bool(ext_stats)
|
||||
self.ext_stats_include: list[str] = ext_stats.get("include", [])
|
||||
@ -52,7 +52,7 @@ class PeriodicLog:
|
||||
if not interval:
|
||||
raise NotConfigured
|
||||
try:
|
||||
ext_stats: Optional[dict[str, Any]] = crawler.settings.getdict(
|
||||
ext_stats: dict[str, Any] | None = crawler.settings.getdict(
|
||||
"PERIODIC_LOG_STATS"
|
||||
)
|
||||
except (TypeError, ValueError):
|
||||
@ -62,7 +62,7 @@ class PeriodicLog:
|
||||
else None
|
||||
)
|
||||
try:
|
||||
ext_delta: Optional[dict[str, Any]] = crawler.settings.getdict(
|
||||
ext_delta: dict[str, Any] | None = crawler.settings.getdict(
|
||||
"PERIODIC_LOG_DELTA"
|
||||
)
|
||||
except (TypeError, ValueError):
|
||||
@ -93,8 +93,8 @@ class PeriodicLog:
|
||||
|
||||
def spider_opened(self, spider: Spider) -> None:
|
||||
self.time_prev: datetime = datetime.now(tz=timezone.utc)
|
||||
self.delta_prev: dict[str, Union[int, float]] = {}
|
||||
self.stats_prev: dict[str, Union[int, float]] = {}
|
||||
self.delta_prev: dict[str, int | float] = {}
|
||||
self.stats_prev: dict[str, int | float] = {}
|
||||
|
||||
self.task = task.LoopingCall(self.log)
|
||||
self.task.start(self.interval)
|
||||
@ -110,7 +110,7 @@ class PeriodicLog:
|
||||
logger.info(self.encoder.encode(data))
|
||||
|
||||
def log_delta(self) -> dict[str, Any]:
|
||||
num_stats: dict[str, Union[int, float]] = {
|
||||
num_stats: dict[str, int | float] = {
|
||||
k: v
|
||||
for k, v in self.stats._stats.items()
|
||||
if isinstance(v, (int, float))
|
||||
|
@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import pickle # nosec
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from scrapy import Spider, signals
|
||||
from scrapy.exceptions import NotConfigured
|
||||
@ -18,8 +18,8 @@ if TYPE_CHECKING:
|
||||
class SpiderState:
|
||||
"""Store and load spider state during a scraping job"""
|
||||
|
||||
def __init__(self, jobdir: Optional[str] = None):
|
||||
self.jobdir: Optional[str] = jobdir
|
||||
def __init__(self, jobdir: str | None = None):
|
||||
self.jobdir: str | None = jobdir
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler: Crawler) -> Self:
|
||||
|
@ -6,7 +6,7 @@ Use STATSMAILER_RCPTS setting to enable and give the recipient mail address
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from scrapy import Spider, signals
|
||||
from scrapy.exceptions import NotConfigured
|
||||
@ -39,7 +39,7 @@ class StatsMailer:
|
||||
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
|
||||
return o
|
||||
|
||||
def spider_closed(self, spider: Spider) -> Optional[Deferred[None]]:
|
||||
def spider_closed(self, spider: Spider) -> Deferred[None] | None:
|
||||
spider_stats = self.stats.get_stats(spider)
|
||||
body = "Global stats\n\n"
|
||||
body += "\n".join(f"{k:<50} : {v}" for k, v in self.stats.get_stats().items())
|
||||
|
@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from scrapy import Request, Spider, signals
|
||||
from scrapy.exceptions import NotConfigured
|
||||
@ -90,8 +90,8 @@ class AutoThrottle:
|
||||
|
||||
def _get_slot(
|
||||
self, request: Request, spider: Spider
|
||||
) -> tuple[Optional[str], Optional[Slot]]:
|
||||
key: Optional[str] = request.meta.get("download_slot")
|
||||
) -> tuple[str | None, Slot | None]:
|
||||
key: str | None = request.meta.get("download_slot")
|
||||
if key is None:
|
||||
return None, None
|
||||
assert self.crawler.engine
|
||||
|
@ -5,7 +5,7 @@ import time
|
||||
from http.cookiejar import Cookie
|
||||
from http.cookiejar import CookieJar as _CookieJar
|
||||
from http.cookiejar import CookiePolicy, DefaultCookiePolicy
|
||||
from typing import TYPE_CHECKING, Any, Optional, cast
|
||||
from typing import TYPE_CHECKING, Any, cast
|
||||
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.utils.python import to_unicode
|
||||
@ -28,7 +28,7 @@ IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
|
||||
class CookieJar:
|
||||
def __init__(
|
||||
self,
|
||||
policy: Optional[CookiePolicy] = None,
|
||||
policy: CookiePolicy | None = None,
|
||||
check_expired_frequency: int = 10000,
|
||||
):
|
||||
self.policy: CookiePolicy = policy or DefaultCookiePolicy()
|
||||
@ -83,9 +83,9 @@ class CookieJar:
|
||||
|
||||
def clear(
|
||||
self,
|
||||
domain: Optional[str] = None,
|
||||
path: Optional[str] = None,
|
||||
name: Optional[str] = None,
|
||||
domain: str | None = None,
|
||||
path: str | None = None,
|
||||
name: str | None = None,
|
||||
) -> None:
|
||||
self.jar.clear(domain, path, name)
|
||||
|
||||
@ -188,7 +188,7 @@ class WrappedRequest:
|
||||
def has_header(self, name: str) -> bool:
|
||||
return name in self.request.headers
|
||||
|
||||
def get_header(self, name: str, default: Optional[str] = None) -> Optional[str]:
|
||||
def get_header(self, name: str, default: str | None = None) -> str | None:
|
||||
value = self.request.headers.get(name, default)
|
||||
return to_unicode(value, errors="replace") if value is not None else None
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Mapping
|
||||
from typing import TYPE_CHECKING, Any, AnyStr, Optional, Union, cast
|
||||
from typing import TYPE_CHECKING, Any, AnyStr, Union, cast
|
||||
|
||||
from w3lib.http import headers_dict_to_raw
|
||||
|
||||
@ -25,14 +25,14 @@ class Headers(CaselessDict):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
seq: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
|
||||
seq: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]] | None = None,
|
||||
encoding: str = "utf-8",
|
||||
):
|
||||
self.encoding: str = encoding
|
||||
super().__init__(seq)
|
||||
|
||||
def update( # type: ignore[override]
|
||||
self, seq: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]]]
|
||||
self, seq: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]]
|
||||
) -> None:
|
||||
seq = seq.items() if isinstance(seq, Mapping) else seq
|
||||
iseq: dict[bytes, list[bytes]] = {}
|
||||
@ -44,7 +44,7 @@ class Headers(CaselessDict):
|
||||
"""Normalize key to bytes"""
|
||||
return self._tobytes(key.title())
|
||||
|
||||
def normvalue(self, value: Union[_RawValueT, Iterable[_RawValueT]]) -> list[bytes]:
|
||||
def normvalue(self, value: _RawValueT | Iterable[_RawValueT]) -> list[bytes]:
|
||||
"""Normalize values to bytes"""
|
||||
_value: Iterable[_RawValueT]
|
||||
if value is None:
|
||||
@ -67,13 +67,13 @@ class Headers(CaselessDict):
|
||||
return str(x).encode(self.encoding)
|
||||
raise TypeError(f"Unsupported value type: {type(x)}")
|
||||
|
||||
def __getitem__(self, key: AnyStr) -> Optional[bytes]:
|
||||
def __getitem__(self, key: AnyStr) -> bytes | None:
|
||||
try:
|
||||
return cast(list[bytes], super().__getitem__(key))[-1]
|
||||
except IndexError:
|
||||
return None
|
||||
|
||||
def get(self, key: AnyStr, def_val: Any = None) -> Optional[bytes]:
|
||||
def get(self, key: AnyStr, def_val: Any = None) -> bytes | None:
|
||||
try:
|
||||
return cast(list[bytes], super().get(key, def_val))[-1]
|
||||
except IndexError:
|
||||
@ -103,7 +103,7 @@ class Headers(CaselessDict):
|
||||
def items(self) -> Iterable[tuple[bytes, list[bytes]]]: # type: ignore[override]
|
||||
return ((k, self.getlist(k)) for k in self.keys())
|
||||
|
||||
def values(self) -> list[Optional[bytes]]: # type: ignore[override]
|
||||
def values(self) -> list[bytes | None]: # type: ignore[override]
|
||||
return [
|
||||
self[k] for k in self.keys() # pylint: disable=consider-using-dict-items
|
||||
]
|
||||
|
@ -13,7 +13,6 @@ from typing import (
|
||||
Any,
|
||||
AnyStr,
|
||||
NoReturn,
|
||||
Optional,
|
||||
TypedDict,
|
||||
TypeVar,
|
||||
Union,
|
||||
@ -112,18 +111,18 @@ class Request(object_ref):
|
||||
def __init__(
|
||||
self,
|
||||
url: str,
|
||||
callback: Optional[CallbackT] = None,
|
||||
callback: CallbackT | None = None,
|
||||
method: str = "GET",
|
||||
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
|
||||
body: Optional[Union[bytes, str]] = None,
|
||||
cookies: Optional[CookiesT] = None,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
headers: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]] | None = None,
|
||||
body: bytes | str | None = None,
|
||||
cookies: CookiesT | None = None,
|
||||
meta: dict[str, Any] | None = None,
|
||||
encoding: str = "utf-8",
|
||||
priority: int = 0,
|
||||
dont_filter: bool = False,
|
||||
errback: Optional[Callable[[Failure], Any]] = None,
|
||||
flags: Optional[list[str]] = None,
|
||||
cb_kwargs: Optional[dict[str, Any]] = None,
|
||||
errback: Callable[[Failure], Any] | None = None,
|
||||
flags: list[str] | None = None,
|
||||
cb_kwargs: dict[str, Any] | None = None,
|
||||
) -> None:
|
||||
self._encoding: str = encoding # this one has to be set first
|
||||
self.method: str = str(method).upper()
|
||||
@ -139,17 +138,15 @@ class Request(object_ref):
|
||||
)
|
||||
if not (callable(errback) or errback is None):
|
||||
raise TypeError(f"errback must be a callable, got {type(errback).__name__}")
|
||||
self.callback: Optional[CallbackT] = callback
|
||||
self.errback: Optional[Callable[[Failure], Any]] = errback
|
||||
self.callback: CallbackT | None = callback
|
||||
self.errback: Callable[[Failure], Any] | None = errback
|
||||
|
||||
self.cookies: CookiesT = cookies or {}
|
||||
self.headers: Headers = Headers(headers or {}, encoding=encoding)
|
||||
self.dont_filter: bool = dont_filter
|
||||
|
||||
self._meta: Optional[dict[str, Any]] = dict(meta) if meta else None
|
||||
self._cb_kwargs: Optional[dict[str, Any]] = (
|
||||
dict(cb_kwargs) if cb_kwargs else None
|
||||
)
|
||||
self._meta: dict[str, Any] | None = dict(meta) if meta else None
|
||||
self._cb_kwargs: dict[str, Any] | None = dict(cb_kwargs) if cb_kwargs else None
|
||||
self.flags: list[str] = [] if flags is None else list(flags)
|
||||
|
||||
@property
|
||||
@ -186,7 +183,7 @@ class Request(object_ref):
|
||||
def body(self) -> bytes:
|
||||
return self._body
|
||||
|
||||
def _set_body(self, body: Optional[Union[str, bytes]]) -> None:
|
||||
def _set_body(self, body: str | bytes | None) -> None:
|
||||
self._body = b"" if body is None else to_bytes(body, self.encoding)
|
||||
|
||||
@property
|
||||
@ -208,7 +205,7 @@ class Request(object_ref):
|
||||
def replace(self, *args: Any, cls: None = None, **kwargs: Any) -> Self: ...
|
||||
|
||||
def replace(
|
||||
self, *args: Any, cls: Optional[type[Request]] = None, **kwargs: Any
|
||||
self, *args: Any, cls: type[Request] | None = None, **kwargs: Any
|
||||
) -> Request:
|
||||
"""Create a new Request with the same attributes except for those given new values"""
|
||||
for x in self.attributes:
|
||||
@ -255,7 +252,7 @@ class Request(object_ref):
|
||||
request_kwargs.update(kwargs)
|
||||
return cls(**request_kwargs)
|
||||
|
||||
def to_dict(self, *, spider: Optional[scrapy.Spider] = None) -> dict[str, Any]:
|
||||
def to_dict(self, *, spider: scrapy.Spider | None = None) -> dict[str, Any]:
|
||||
"""Return a dictionary containing the Request's data.
|
||||
|
||||
Use :func:`~scrapy.utils.request.request_from_dict` to convert back into a :class:`~scrapy.Request` object.
|
||||
|
@ -62,14 +62,14 @@ class FormRequest(Request):
|
||||
def from_response(
|
||||
cls,
|
||||
response: TextResponse,
|
||||
formname: Optional[str] = None,
|
||||
formid: Optional[str] = None,
|
||||
formname: str | None = None,
|
||||
formid: str | None = None,
|
||||
formnumber: int = 0,
|
||||
formdata: FormdataType = None,
|
||||
clickdata: Optional[dict[str, Union[str, int]]] = None,
|
||||
clickdata: dict[str, str | int] | None = None,
|
||||
dont_click: bool = False,
|
||||
formxpath: Optional[str] = None,
|
||||
formcss: Optional[str] = None,
|
||||
formxpath: str | None = None,
|
||||
formcss: str | None = None,
|
||||
**kwargs: Any,
|
||||
) -> Self:
|
||||
kwargs.setdefault("encoding", response.encoding)
|
||||
@ -92,7 +92,7 @@ class FormRequest(Request):
|
||||
return cls(url=url, method=method, formdata=formdata, **kwargs)
|
||||
|
||||
|
||||
def _get_form_url(form: FormElement, url: Optional[str]) -> str:
|
||||
def _get_form_url(form: FormElement, url: str | None) -> str:
|
||||
assert form.base_url is not None # typing
|
||||
if url is None:
|
||||
action = form.get("action")
|
||||
@ -113,10 +113,10 @@ def _urlencode(seq: Iterable[FormdataKVType], enc: str) -> str:
|
||||
|
||||
def _get_form(
|
||||
response: TextResponse,
|
||||
formname: Optional[str],
|
||||
formid: Optional[str],
|
||||
formname: str | None,
|
||||
formid: str | None,
|
||||
formnumber: int,
|
||||
formxpath: Optional[str],
|
||||
formxpath: str | None,
|
||||
) -> FormElement:
|
||||
"""Find the wanted form element within the given response."""
|
||||
root = response.selector.root
|
||||
@ -160,7 +160,7 @@ def _get_inputs(
|
||||
form: FormElement,
|
||||
formdata: FormdataType,
|
||||
dont_click: bool,
|
||||
clickdata: Optional[dict[str, Union[str, int]]],
|
||||
clickdata: dict[str, str | int] | None,
|
||||
) -> list[FormdataKVType]:
|
||||
"""Return a list of key-value pairs for the inputs found in the given form."""
|
||||
try:
|
||||
@ -196,8 +196,8 @@ def _get_inputs(
|
||||
|
||||
|
||||
def _value(
|
||||
ele: Union[InputElement, SelectElement, TextareaElement]
|
||||
) -> tuple[Optional[str], Union[None, str, MultipleSelectOptions]]:
|
||||
ele: InputElement | SelectElement | TextareaElement,
|
||||
) -> tuple[str | None, None | str | MultipleSelectOptions]:
|
||||
n = ele.name
|
||||
v = ele.value
|
||||
if ele.tag == "select":
|
||||
@ -206,8 +206,8 @@ def _value(
|
||||
|
||||
|
||||
def _select_value(
|
||||
ele: SelectElement, n: Optional[str], v: Union[None, str, MultipleSelectOptions]
|
||||
) -> tuple[Optional[str], Union[None, str, MultipleSelectOptions]]:
|
||||
ele: SelectElement, n: str | None, v: None | str | MultipleSelectOptions
|
||||
) -> tuple[str | None, None | str | MultipleSelectOptions]:
|
||||
multiple = ele.multiple
|
||||
if v is None and not multiple:
|
||||
# Match browser behaviour on simple select tag without options selected
|
||||
@ -218,8 +218,8 @@ def _select_value(
|
||||
|
||||
|
||||
def _get_clickable(
|
||||
clickdata: Optional[dict[str, Union[str, int]]], form: FormElement
|
||||
) -> Optional[tuple[str, str]]:
|
||||
clickdata: dict[str, str | int] | None, form: FormElement
|
||||
) -> tuple[str, str] | None:
|
||||
"""
|
||||
Returns the clickable element specified in clickdata,
|
||||
if the latter is given. If not, it returns the first
|
||||
|
@ -10,7 +10,7 @@ from __future__ import annotations
|
||||
import copy
|
||||
import json
|
||||
import warnings
|
||||
from typing import TYPE_CHECKING, Any, Optional, overload
|
||||
from typing import TYPE_CHECKING, Any, overload
|
||||
|
||||
from scrapy.http.request import Request, RequestTypeVar
|
||||
|
||||
@ -23,7 +23,7 @@ class JsonRequest(Request):
|
||||
attributes: tuple[str, ...] = Request.attributes + ("dumps_kwargs",)
|
||||
|
||||
def __init__(
|
||||
self, *args: Any, dumps_kwargs: Optional[dict[str, Any]] = None, **kwargs: Any
|
||||
self, *args: Any, dumps_kwargs: dict[str, Any] | None = None, **kwargs: Any
|
||||
) -> None:
|
||||
dumps_kwargs = copy.deepcopy(dumps_kwargs) if dumps_kwargs is not None else {}
|
||||
dumps_kwargs.setdefault("sort_keys", True)
|
||||
@ -59,7 +59,7 @@ class JsonRequest(Request):
|
||||
def replace(self, *args: Any, cls: None = None, **kwargs: Any) -> Self: ...
|
||||
|
||||
def replace(
|
||||
self, *args: Any, cls: Optional[type[Request]] = None, **kwargs: Any
|
||||
self, *args: Any, cls: type[Request] | None = None, **kwargs: Any
|
||||
) -> Request:
|
||||
body_passed = kwargs.get("body", None) is not None
|
||||
data: Any = kwargs.pop("data", None)
|
||||
|
@ -5,8 +5,10 @@ This module implements the XmlRpcRequest class which is a more convenient class
|
||||
See documentation in docs/topics/request-response.rst
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import xmlrpc.client as xmlrpclib
|
||||
from typing import Any, Optional
|
||||
from typing import Any
|
||||
|
||||
import defusedxml.xmlrpc
|
||||
|
||||
@ -19,7 +21,7 @@ DUMPS_ARGS = get_func_args(xmlrpclib.dumps)
|
||||
|
||||
|
||||
class XmlRpcRequest(Request):
|
||||
def __init__(self, *args: Any, encoding: Optional[str] = None, **kwargs: Any):
|
||||
def __init__(self, *args: Any, encoding: str | None = None, **kwargs: Any):
|
||||
if "body" not in kwargs and "params" in kwargs:
|
||||
kw = {k: kwargs.pop(k) for k in DUMPS_ARGS if k in kwargs}
|
||||
kwargs["body"] = xmlrpclib.dumps(**kw)
|
||||
|
@ -7,7 +7,7 @@ See documentation in docs/topics/request-response.rst
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Any, AnyStr, Optional, TypeVar, Union, overload
|
||||
from typing import TYPE_CHECKING, Any, AnyStr, TypeVar, overload
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from scrapy.exceptions import NotSupported
|
||||
@ -60,23 +60,23 @@ class Response(object_ref):
|
||||
self,
|
||||
url: str,
|
||||
status: int = 200,
|
||||
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
|
||||
headers: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]] | None = None,
|
||||
body: bytes = b"",
|
||||
flags: Optional[list[str]] = None,
|
||||
request: Optional[Request] = None,
|
||||
certificate: Optional[Certificate] = None,
|
||||
ip_address: Union[IPv4Address, IPv6Address, None] = None,
|
||||
protocol: Optional[str] = None,
|
||||
flags: list[str] | None = None,
|
||||
request: Request | None = None,
|
||||
certificate: Certificate | None = None,
|
||||
ip_address: IPv4Address | IPv6Address | None = None,
|
||||
protocol: str | None = None,
|
||||
):
|
||||
self.headers: Headers = Headers(headers or {})
|
||||
self.status: int = int(status)
|
||||
self._set_body(body)
|
||||
self._set_url(url)
|
||||
self.request: Optional[Request] = request
|
||||
self.request: Request | None = request
|
||||
self.flags: list[str] = [] if flags is None else list(flags)
|
||||
self.certificate: Optional[Certificate] = certificate
|
||||
self.ip_address: Union[IPv4Address, IPv6Address, None] = ip_address
|
||||
self.protocol: Optional[str] = protocol
|
||||
self.certificate: Certificate | None = certificate
|
||||
self.ip_address: IPv4Address | IPv6Address | None = ip_address
|
||||
self.protocol: str | None = protocol
|
||||
|
||||
@property
|
||||
def cb_kwargs(self) -> dict[str, Any]:
|
||||
@ -114,7 +114,7 @@ class Response(object_ref):
|
||||
def body(self) -> bytes:
|
||||
return self._body
|
||||
|
||||
def _set_body(self, body: Optional[bytes]) -> None:
|
||||
def _set_body(self, body: bytes | None) -> None:
|
||||
if body is None:
|
||||
self._body = b""
|
||||
elif not isinstance(body, bytes):
|
||||
@ -142,7 +142,7 @@ class Response(object_ref):
|
||||
def replace(self, *args: Any, cls: None = None, **kwargs: Any) -> Self: ...
|
||||
|
||||
def replace(
|
||||
self, *args: Any, cls: Optional[type[Response]] = None, **kwargs: Any
|
||||
self, *args: Any, cls: type[Response] | None = None, **kwargs: Any
|
||||
) -> Response:
|
||||
"""Create a new Response with the same attributes except for those given new values"""
|
||||
for x in self.attributes:
|
||||
@ -183,19 +183,19 @@ class Response(object_ref):
|
||||
|
||||
def follow(
|
||||
self,
|
||||
url: Union[str, Link],
|
||||
callback: Optional[CallbackT] = None,
|
||||
url: str | Link,
|
||||
callback: CallbackT | None = None,
|
||||
method: str = "GET",
|
||||
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
|
||||
body: Optional[Union[bytes, str]] = None,
|
||||
cookies: Optional[CookiesT] = None,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
encoding: Optional[str] = "utf-8",
|
||||
headers: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]] | None = None,
|
||||
body: bytes | str | None = None,
|
||||
cookies: CookiesT | None = None,
|
||||
meta: dict[str, Any] | None = None,
|
||||
encoding: str | None = "utf-8",
|
||||
priority: int = 0,
|
||||
dont_filter: bool = False,
|
||||
errback: Optional[Callable[[Failure], Any]] = None,
|
||||
cb_kwargs: Optional[dict[str, Any]] = None,
|
||||
flags: Optional[list[str]] = None,
|
||||
errback: Callable[[Failure], Any] | None = None,
|
||||
cb_kwargs: dict[str, Any] | None = None,
|
||||
flags: list[str] | None = None,
|
||||
) -> Request:
|
||||
"""
|
||||
Return a :class:`~.Request` instance to follow a link ``url``.
|
||||
@ -236,19 +236,19 @@ class Response(object_ref):
|
||||
|
||||
def follow_all(
|
||||
self,
|
||||
urls: Iterable[Union[str, Link]],
|
||||
callback: Optional[CallbackT] = None,
|
||||
urls: Iterable[str | Link],
|
||||
callback: CallbackT | None = None,
|
||||
method: str = "GET",
|
||||
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
|
||||
body: Optional[Union[bytes, str]] = None,
|
||||
cookies: Optional[CookiesT] = None,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
encoding: Optional[str] = "utf-8",
|
||||
headers: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]] | None = None,
|
||||
body: bytes | str | None = None,
|
||||
cookies: CookiesT | None = None,
|
||||
meta: dict[str, Any] | None = None,
|
||||
encoding: str | None = "utf-8",
|
||||
priority: int = 0,
|
||||
dont_filter: bool = False,
|
||||
errback: Optional[Callable[[Failure], Any]] = None,
|
||||
cb_kwargs: Optional[dict[str, Any]] = None,
|
||||
flags: Optional[list[str]] = None,
|
||||
errback: Callable[[Failure], Any] | None = None,
|
||||
cb_kwargs: dict[str, Any] | None = None,
|
||||
flags: list[str] | None = None,
|
||||
) -> Iterable[Request]:
|
||||
"""
|
||||
.. versionadded:: 2.0
|
||||
|
@ -8,9 +8,8 @@ See documentation in docs/topics/request-response.rst
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from collections.abc import Iterable
|
||||
from contextlib import suppress
|
||||
from typing import TYPE_CHECKING, Any, AnyStr, Optional, Union, cast
|
||||
from typing import TYPE_CHECKING, Any, AnyStr, cast
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import parsel
|
||||
@ -24,16 +23,16 @@ from w3lib.encoding import (
|
||||
from w3lib.html import strip_html5_whitespace
|
||||
|
||||
from scrapy.http.response import Response
|
||||
from scrapy.link import Link
|
||||
from scrapy.utils.python import memoizemethod_noargs, to_unicode
|
||||
from scrapy.utils.response import get_base_url
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable, Mapping
|
||||
from collections.abc import Callable, Iterable, Mapping
|
||||
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from scrapy.http.request import CallbackT, CookiesT, Request
|
||||
from scrapy.link import Link
|
||||
from scrapy.selector import Selector, SelectorList
|
||||
|
||||
|
||||
@ -47,13 +46,13 @@ class TextResponse(Response):
|
||||
attributes: tuple[str, ...] = Response.attributes + ("encoding",)
|
||||
|
||||
def __init__(self, *args: Any, **kwargs: Any):
|
||||
self._encoding: Optional[str] = kwargs.pop("encoding", None)
|
||||
self._cached_benc: Optional[str] = None
|
||||
self._cached_ubody: Optional[str] = None
|
||||
self._cached_selector: Optional[Selector] = None
|
||||
self._encoding: str | None = kwargs.pop("encoding", None)
|
||||
self._cached_benc: str | None = None
|
||||
self._cached_ubody: str | None = None
|
||||
self._cached_selector: Selector | None = None
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def _set_body(self, body: Union[str, bytes, None]) -> None:
|
||||
def _set_body(self, body: str | bytes | None) -> None:
|
||||
self._body: bytes = b"" # used by encoding detection
|
||||
if isinstance(body, str):
|
||||
if self._encoding is None:
|
||||
@ -69,7 +68,7 @@ class TextResponse(Response):
|
||||
def encoding(self) -> str:
|
||||
return self._declared_encoding() or self._body_inferred_encoding()
|
||||
|
||||
def _declared_encoding(self) -> Optional[str]:
|
||||
def _declared_encoding(self) -> str | None:
|
||||
return (
|
||||
self._encoding
|
||||
or self._bom_encoding()
|
||||
@ -104,7 +103,7 @@ class TextResponse(Response):
|
||||
return urljoin(get_base_url(self), url)
|
||||
|
||||
@memoizemethod_noargs
|
||||
def _headers_encoding(self) -> Optional[str]:
|
||||
def _headers_encoding(self) -> str | None:
|
||||
content_type = cast(bytes, self.headers.get(b"Content-Type", b""))
|
||||
return http_content_type_encoding(to_unicode(content_type, encoding="latin-1"))
|
||||
|
||||
@ -123,7 +122,7 @@ class TextResponse(Response):
|
||||
self._cached_ubody = ubody
|
||||
return self._cached_benc
|
||||
|
||||
def _auto_detect_fun(self, text: bytes) -> Optional[str]:
|
||||
def _auto_detect_fun(self, text: bytes) -> str | None:
|
||||
for enc in (self._DEFAULT_ENCODING, "utf-8", "cp1252"):
|
||||
try:
|
||||
text.decode(enc)
|
||||
@ -133,11 +132,11 @@ class TextResponse(Response):
|
||||
return None
|
||||
|
||||
@memoizemethod_noargs
|
||||
def _body_declared_encoding(self) -> Optional[str]:
|
||||
def _body_declared_encoding(self) -> str | None:
|
||||
return html_body_declared_encoding(self.body)
|
||||
|
||||
@memoizemethod_noargs
|
||||
def _bom_encoding(self) -> Optional[str]:
|
||||
def _bom_encoding(self) -> str | None:
|
||||
return read_bom(self.body)[0]
|
||||
|
||||
@property
|
||||
@ -170,19 +169,19 @@ class TextResponse(Response):
|
||||
|
||||
def follow(
|
||||
self,
|
||||
url: Union[str, Link, parsel.Selector],
|
||||
callback: Optional[CallbackT] = None,
|
||||
url: str | Link | parsel.Selector,
|
||||
callback: CallbackT | None = None,
|
||||
method: str = "GET",
|
||||
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
|
||||
body: Optional[Union[bytes, str]] = None,
|
||||
cookies: Optional[CookiesT] = None,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
encoding: Optional[str] = None,
|
||||
headers: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]] | None = None,
|
||||
body: bytes | str | None = None,
|
||||
cookies: CookiesT | None = None,
|
||||
meta: dict[str, Any] | None = None,
|
||||
encoding: str | None = None,
|
||||
priority: int = 0,
|
||||
dont_filter: bool = False,
|
||||
errback: Optional[Callable[[Failure], Any]] = None,
|
||||
cb_kwargs: Optional[dict[str, Any]] = None,
|
||||
flags: Optional[list[str]] = None,
|
||||
errback: Callable[[Failure], Any] | None = None,
|
||||
cb_kwargs: dict[str, Any] | None = None,
|
||||
flags: list[str] | None = None,
|
||||
) -> Request:
|
||||
"""
|
||||
Return a :class:`~.Request` instance to follow a link ``url``.
|
||||
@ -223,21 +222,21 @@ class TextResponse(Response):
|
||||
|
||||
def follow_all(
|
||||
self,
|
||||
urls: Union[Iterable[Union[str, Link]], parsel.SelectorList, None] = None,
|
||||
callback: Optional[CallbackT] = None,
|
||||
urls: Iterable[str | Link] | parsel.SelectorList | None = None,
|
||||
callback: CallbackT | None = None,
|
||||
method: str = "GET",
|
||||
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
|
||||
body: Optional[Union[bytes, str]] = None,
|
||||
cookies: Optional[CookiesT] = None,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
encoding: Optional[str] = None,
|
||||
headers: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]] | None = None,
|
||||
body: bytes | str | None = None,
|
||||
cookies: CookiesT | None = None,
|
||||
meta: dict[str, Any] | None = None,
|
||||
encoding: str | None = None,
|
||||
priority: int = 0,
|
||||
dont_filter: bool = False,
|
||||
errback: Optional[Callable[[Failure], Any]] = None,
|
||||
cb_kwargs: Optional[dict[str, Any]] = None,
|
||||
flags: Optional[list[str]] = None,
|
||||
css: Optional[str] = None,
|
||||
xpath: Optional[str] = None,
|
||||
errback: Callable[[Failure], Any] | None = None,
|
||||
cb_kwargs: dict[str, Any] | None = None,
|
||||
flags: list[str] | None = None,
|
||||
css: str | None = None,
|
||||
xpath: str | None = None,
|
||||
) -> Iterable[Request]:
|
||||
"""
|
||||
A generator that produces :class:`~.Request` instances to follow all
|
||||
@ -279,7 +278,7 @@ class TextResponse(Response):
|
||||
with suppress(_InvalidSelector):
|
||||
urls.append(_url_from_selector(sel))
|
||||
return super().follow_all(
|
||||
urls=cast(Iterable[Union[str, Link]], urls),
|
||||
urls=cast("Iterable[str | Link]", urls),
|
||||
callback=callback,
|
||||
method=method,
|
||||
headers=headers,
|
||||
|
@ -9,7 +9,7 @@ import operator
|
||||
import re
|
||||
from collections.abc import Callable, Iterable
|
||||
from functools import partial
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union, cast
|
||||
from typing import TYPE_CHECKING, Any, Union, cast
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
from lxml import etree # nosec
|
||||
@ -58,9 +58,9 @@ def _canonicalize_link_url(link: Link) -> str:
|
||||
class LxmlParserLinkExtractor:
|
||||
def __init__(
|
||||
self,
|
||||
tag: Union[str, Callable[[str], bool]] = "a",
|
||||
attr: Union[str, Callable[[str], bool]] = "href",
|
||||
process: Optional[Callable[[Any], Any]] = None,
|
||||
tag: str | Callable[[str], bool] = "a",
|
||||
attr: str | Callable[[str], bool] = "href",
|
||||
process: Callable[[Any], Any] | None = None,
|
||||
unique: bool = False,
|
||||
strip: bool = True,
|
||||
canonicalized: bool = False,
|
||||
@ -166,18 +166,18 @@ class LxmlLinkExtractor:
|
||||
self,
|
||||
allow: _RegexOrSeveralT = (),
|
||||
deny: _RegexOrSeveralT = (),
|
||||
allow_domains: Union[str, Iterable[str]] = (),
|
||||
deny_domains: Union[str, Iterable[str]] = (),
|
||||
restrict_xpaths: Union[str, Iterable[str]] = (),
|
||||
tags: Union[str, Iterable[str]] = ("a", "area"),
|
||||
attrs: Union[str, Iterable[str]] = ("href",),
|
||||
allow_domains: str | Iterable[str] = (),
|
||||
deny_domains: str | Iterable[str] = (),
|
||||
restrict_xpaths: str | Iterable[str] = (),
|
||||
tags: str | Iterable[str] = ("a", "area"),
|
||||
attrs: str | Iterable[str] = ("href",),
|
||||
canonicalize: bool = False,
|
||||
unique: bool = True,
|
||||
process_value: Optional[Callable[[Any], Any]] = None,
|
||||
deny_extensions: Union[str, Iterable[str], None] = None,
|
||||
restrict_css: Union[str, Iterable[str]] = (),
|
||||
process_value: Callable[[Any], Any] | None = None,
|
||||
deny_extensions: str | Iterable[str] | None = None,
|
||||
restrict_css: str | Iterable[str] = (),
|
||||
strip: bool = True,
|
||||
restrict_text: Optional[_RegexOrSeveralT] = None,
|
||||
restrict_text: _RegexOrSeveralT | None = None,
|
||||
):
|
||||
tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
|
||||
self.link_extractor = LxmlParserLinkExtractor(
|
||||
@ -206,7 +206,7 @@ class LxmlLinkExtractor:
|
||||
self.restrict_text: list[re.Pattern[str]] = self._compile_regexes(restrict_text)
|
||||
|
||||
@staticmethod
|
||||
def _compile_regexes(value: Optional[_RegexOrSeveralT]) -> list[re.Pattern[str]]:
|
||||
def _compile_regexes(value: _RegexOrSeveralT | None) -> list[re.Pattern[str]]:
|
||||
return [
|
||||
x if isinstance(x, re.Pattern) else re.compile(x)
|
||||
for x in arg_to_iter(value)
|
||||
|
@ -6,7 +6,7 @@ See documentation in docs/topics/loaders.rst
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import itemloaders
|
||||
|
||||
@ -92,9 +92,9 @@ class ItemLoader(itemloaders.ItemLoader):
|
||||
def __init__(
|
||||
self,
|
||||
item: Any = None,
|
||||
selector: Optional[Selector] = None,
|
||||
response: Optional[TextResponse] = None,
|
||||
parent: Optional[itemloaders.ItemLoader] = None,
|
||||
selector: Selector | None = None,
|
||||
response: TextResponse | None = None,
|
||||
parent: itemloaders.ItemLoader | None = None,
|
||||
**context: Any,
|
||||
):
|
||||
if selector is None and response is not None:
|
||||
|
@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from typing import TYPE_CHECKING, Any, Optional, TypedDict, Union
|
||||
from typing import TYPE_CHECKING, Any, TypedDict
|
||||
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
@ -31,7 +31,7 @@ DOWNLOADERRORMSG_LONG = "Error downloading %(request)s: %(errmsg)s"
|
||||
class LogFormatterResult(TypedDict):
|
||||
level: int
|
||||
msg: str
|
||||
args: Union[dict[str, Any], tuple[Any, ...]]
|
||||
args: dict[str, Any] | tuple[Any, ...]
|
||||
|
||||
|
||||
class LogFormatter:
|
||||
@ -93,7 +93,7 @@ class LogFormatter:
|
||||
}
|
||||
|
||||
def scraped(
|
||||
self, item: Any, response: Union[Response, Failure, None], spider: Spider
|
||||
self, item: Any, response: Response | Failure | None, spider: Spider
|
||||
) -> LogFormatterResult:
|
||||
"""Logs a message when an item is scraped by a spider."""
|
||||
src: Any
|
||||
@ -116,7 +116,7 @@ class LogFormatter:
|
||||
self,
|
||||
item: Any,
|
||||
exception: BaseException,
|
||||
response: Optional[Response],
|
||||
response: Response | None,
|
||||
spider: Spider,
|
||||
) -> LogFormatterResult:
|
||||
"""Logs a message when an item is dropped while it is passing through the item pipeline."""
|
||||
@ -133,7 +133,7 @@ class LogFormatter:
|
||||
self,
|
||||
item: Any,
|
||||
exception: BaseException,
|
||||
response: Optional[Response],
|
||||
response: Response | None,
|
||||
spider: Spider,
|
||||
) -> LogFormatterResult:
|
||||
"""Logs a message when an item causes an error while it is passing
|
||||
@ -153,7 +153,7 @@ class LogFormatter:
|
||||
self,
|
||||
failure: Failure,
|
||||
request: Request,
|
||||
response: Union[Response, Failure],
|
||||
response: Response | Failure,
|
||||
spider: Spider,
|
||||
) -> LogFormatterResult:
|
||||
"""Logs an error message from a spider.
|
||||
@ -174,7 +174,7 @@ class LogFormatter:
|
||||
failure: Failure,
|
||||
request: Request,
|
||||
spider: Spider,
|
||||
errmsg: Optional[str] = None,
|
||||
errmsg: str | None = None,
|
||||
) -> LogFormatterResult:
|
||||
"""Logs a download error message from a spider (typically coming from
|
||||
the engine).
|
||||
|
@ -14,7 +14,7 @@ from email.mime.nonmultipart import MIMENonMultipart
|
||||
from email.mime.text import MIMEText
|
||||
from email.utils import formatdate
|
||||
from io import BytesIO
|
||||
from typing import IO, TYPE_CHECKING, Any, Optional, Union
|
||||
from typing import IO, TYPE_CHECKING, Any
|
||||
|
||||
from twisted import version as twisted_version
|
||||
from twisted.internet import ssl
|
||||
@ -45,7 +45,7 @@ logger = logging.getLogger(__name__)
|
||||
COMMASPACE = ", "
|
||||
|
||||
|
||||
def _to_bytes_or_none(text: Union[str, bytes, None]) -> Optional[bytes]:
|
||||
def _to_bytes_or_none(text: str | bytes | None) -> bytes | None:
|
||||
if text is None:
|
||||
return None
|
||||
return to_bytes(text)
|
||||
@ -56,8 +56,8 @@ class MailSender:
|
||||
self,
|
||||
smtphost: str = "localhost",
|
||||
mailfrom: str = "scrapy@localhost",
|
||||
smtpuser: Optional[str] = None,
|
||||
smtppass: Optional[str] = None,
|
||||
smtpuser: str | None = None,
|
||||
smtppass: str | None = None,
|
||||
smtpport: int = 25,
|
||||
smtptls: bool = False,
|
||||
smtpssl: bool = False,
|
||||
@ -65,8 +65,8 @@ class MailSender:
|
||||
):
|
||||
self.smtphost: str = smtphost
|
||||
self.smtpport: int = smtpport
|
||||
self.smtpuser: Optional[bytes] = _to_bytes_or_none(smtpuser)
|
||||
self.smtppass: Optional[bytes] = _to_bytes_or_none(smtppass)
|
||||
self.smtpuser: bytes | None = _to_bytes_or_none(smtpuser)
|
||||
self.smtppass: bytes | None = _to_bytes_or_none(smtppass)
|
||||
self.smtptls: bool = smtptls
|
||||
self.smtpssl: bool = smtpssl
|
||||
self.mailfrom: str = mailfrom
|
||||
@ -86,15 +86,15 @@ class MailSender:
|
||||
|
||||
def send(
|
||||
self,
|
||||
to: Union[str, list[str]],
|
||||
to: str | list[str],
|
||||
subject: str,
|
||||
body: str,
|
||||
cc: Union[str, list[str], None] = None,
|
||||
cc: str | list[str] | None = None,
|
||||
attachs: Sequence[tuple[str, str, IO[Any]]] = (),
|
||||
mimetype: str = "text/plain",
|
||||
charset: Optional[str] = None,
|
||||
_callback: Optional[Callable[..., None]] = None,
|
||||
) -> Optional[Deferred[None]]:
|
||||
charset: str | None = None,
|
||||
_callback: Callable[..., None] | None = None,
|
||||
) -> Deferred[None] | None:
|
||||
from twisted.internet import reactor
|
||||
|
||||
msg: MIMEBase
|
||||
|
@ -3,7 +3,7 @@ from __future__ import annotations
|
||||
import logging
|
||||
import pprint
|
||||
from collections import defaultdict, deque
|
||||
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
|
||||
from typing import TYPE_CHECKING, Any, TypeVar, cast
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.utils.defer import process_chain, process_parallel
|
||||
@ -40,9 +40,9 @@ class MiddlewareManager:
|
||||
self.middlewares = middlewares
|
||||
# Only process_spider_output and process_spider_exception can be None.
|
||||
# Only process_spider_output can be a tuple, and only until _async compatibility methods are removed.
|
||||
self.methods: dict[
|
||||
str, deque[Union[None, Callable, tuple[Callable, Callable]]]
|
||||
] = defaultdict(deque)
|
||||
self.methods: dict[str, deque[None | Callable | tuple[Callable, Callable]]] = (
|
||||
defaultdict(deque)
|
||||
)
|
||||
for mw in middlewares:
|
||||
self._add_middleware(mw)
|
||||
|
||||
@ -51,9 +51,7 @@ class MiddlewareManager:
|
||||
raise NotImplementedError
|
||||
|
||||
@classmethod
|
||||
def from_settings(
|
||||
cls, settings: Settings, crawler: Optional[Crawler] = None
|
||||
) -> Self:
|
||||
def from_settings(cls, settings: Settings, crawler: Crawler | None = None) -> Self:
|
||||
mwlist = cls._get_mwlist_from_settings(settings)
|
||||
middlewares = []
|
||||
enabled = []
|
||||
|
@ -17,17 +17,7 @@ from contextlib import suppress
|
||||
from ftplib import FTP
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import (
|
||||
IO,
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
NoReturn,
|
||||
Optional,
|
||||
Protocol,
|
||||
TypedDict,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
from typing import IO, TYPE_CHECKING, Any, NoReturn, Protocol, TypedDict, cast
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from itemadapter import ItemAdapter
|
||||
@ -61,7 +51,7 @@ if TYPE_CHECKING:
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _to_string(path: Union[str, PathLike[str]]) -> str:
|
||||
def _to_string(path: str | PathLike[str]) -> str:
|
||||
return str(path) # convert a Path object to string
|
||||
|
||||
|
||||
@ -99,17 +89,17 @@ class FilesStoreProtocol(Protocol):
|
||||
path: str,
|
||||
buf: BytesIO,
|
||||
info: MediaPipeline.SpiderInfo,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
) -> Optional[Deferred[Any]]: ...
|
||||
meta: dict[str, Any] | None = None,
|
||||
headers: dict[str, str] | None = None,
|
||||
) -> Deferred[Any] | None: ...
|
||||
|
||||
def stat_file(
|
||||
self, path: str, info: MediaPipeline.SpiderInfo
|
||||
) -> Union[StatInfo, Deferred[StatInfo]]: ...
|
||||
) -> StatInfo | Deferred[StatInfo]: ...
|
||||
|
||||
|
||||
class FSFilesStore:
|
||||
def __init__(self, basedir: Union[str, PathLike[str]]):
|
||||
def __init__(self, basedir: str | PathLike[str]):
|
||||
basedir = _to_string(basedir)
|
||||
if "://" in basedir:
|
||||
basedir = basedir.split("://", 1)[1]
|
||||
@ -121,18 +111,18 @@ class FSFilesStore:
|
||||
|
||||
def persist_file(
|
||||
self,
|
||||
path: Union[str, PathLike[str]],
|
||||
path: str | PathLike[str],
|
||||
buf: BytesIO,
|
||||
info: MediaPipeline.SpiderInfo,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
meta: dict[str, Any] | None = None,
|
||||
headers: dict[str, str] | None = None,
|
||||
) -> None:
|
||||
absolute_path = self._get_filesystem_path(path)
|
||||
self._mkdir(absolute_path.parent, info)
|
||||
absolute_path.write_bytes(buf.getvalue())
|
||||
|
||||
def stat_file(
|
||||
self, path: Union[str, PathLike[str]], info: MediaPipeline.SpiderInfo
|
||||
self, path: str | PathLike[str], info: MediaPipeline.SpiderInfo
|
||||
) -> StatInfo:
|
||||
absolute_path = self._get_filesystem_path(path)
|
||||
try:
|
||||
@ -145,12 +135,12 @@ class FSFilesStore:
|
||||
|
||||
return {"last_modified": last_modified, "checksum": checksum}
|
||||
|
||||
def _get_filesystem_path(self, path: Union[str, PathLike[str]]) -> Path:
|
||||
def _get_filesystem_path(self, path: str | PathLike[str]) -> Path:
|
||||
path_comps = _to_string(path).split("/")
|
||||
return Path(self.basedir, *path_comps)
|
||||
|
||||
def _mkdir(
|
||||
self, dirname: Path, domain: Optional[MediaPipeline.SpiderInfo] = None
|
||||
self, dirname: Path, domain: MediaPipeline.SpiderInfo | None = None
|
||||
) -> None:
|
||||
seen: set[str] = self.created_directories[domain] if domain else set()
|
||||
if str(dirname) not in seen:
|
||||
@ -218,8 +208,8 @@ class S3FilesStore:
|
||||
path: str,
|
||||
buf: BytesIO,
|
||||
info: MediaPipeline.SpiderInfo,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
meta: dict[str, Any] | None = None,
|
||||
headers: dict[str, str] | None = None,
|
||||
) -> Deferred[Any]:
|
||||
"""Upload file to S3 storage"""
|
||||
key_name = f"{self.prefix}{path}"
|
||||
@ -327,7 +317,7 @@ class GCSFilesStore:
|
||||
deferToThread(self.bucket.get_blob, blob_path).addCallback(_onsuccess),
|
||||
)
|
||||
|
||||
def _get_content_type(self, headers: Optional[dict[str, str]]) -> str:
|
||||
def _get_content_type(self, headers: dict[str, str] | None) -> str:
|
||||
if headers and "Content-Type" in headers:
|
||||
return headers["Content-Type"]
|
||||
return "application/octet-stream"
|
||||
@ -340,8 +330,8 @@ class GCSFilesStore:
|
||||
path: str,
|
||||
buf: BytesIO,
|
||||
info: MediaPipeline.SpiderInfo,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
meta: dict[str, Any] | None = None,
|
||||
headers: dict[str, str] | None = None,
|
||||
) -> Deferred[Any]:
|
||||
blob_path = self._get_blob_path(path)
|
||||
blob = self.bucket.blob(blob_path)
|
||||
@ -356,9 +346,9 @@ class GCSFilesStore:
|
||||
|
||||
|
||||
class FTPFilesStore:
|
||||
FTP_USERNAME: Optional[str] = None
|
||||
FTP_PASSWORD: Optional[str] = None
|
||||
USE_ACTIVE_MODE: Optional[bool] = None
|
||||
FTP_USERNAME: str | None = None
|
||||
FTP_PASSWORD: str | None = None
|
||||
USE_ACTIVE_MODE: bool | None = None
|
||||
|
||||
def __init__(self, uri: str):
|
||||
if not uri.startswith("ftp://"):
|
||||
@ -380,8 +370,8 @@ class FTPFilesStore:
|
||||
path: str,
|
||||
buf: BytesIO,
|
||||
info: MediaPipeline.SpiderInfo,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
meta: dict[str, Any] | None = None,
|
||||
headers: dict[str, str] | None = None,
|
||||
) -> Deferred[Any]:
|
||||
path = f"{self.basedir}/{path}"
|
||||
return deferToThread(
|
||||
@ -450,9 +440,9 @@ class FilesPipeline(MediaPipeline):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
store_uri: Union[str, PathLike[str]],
|
||||
download_func: Optional[Callable[[Request, Spider], Response]] = None,
|
||||
settings: Union[Settings, dict[str, Any], None] = None,
|
||||
store_uri: str | PathLike[str],
|
||||
download_func: Callable[[Request, Spider], Response] | None = None,
|
||||
settings: Settings | dict[str, Any] | None = None,
|
||||
):
|
||||
store_uri = _to_string(store_uri)
|
||||
if not store_uri:
|
||||
@ -517,8 +507,8 @@ class FilesPipeline(MediaPipeline):
|
||||
|
||||
def media_to_download(
|
||||
self, request: Request, info: MediaPipeline.SpiderInfo, *, item: Any = None
|
||||
) -> Deferred[Optional[FileInfo]]:
|
||||
def _onsuccess(result: StatInfo) -> Optional[FileInfo]:
|
||||
) -> Deferred[FileInfo | None]:
|
||||
def _onsuccess(result: StatInfo) -> FileInfo | None:
|
||||
if not result:
|
||||
return None # returning None force download
|
||||
|
||||
@ -551,7 +541,7 @@ class FilesPipeline(MediaPipeline):
|
||||
path = self.file_path(request, info=info, item=item)
|
||||
# maybeDeferred() overloads don't seem to support a Union[_T, Deferred[_T]] return type
|
||||
dfd: Deferred[StatInfo] = maybeDeferred(self.store.stat_file, path, info) # type: ignore[call-overload]
|
||||
dfd2: Deferred[Optional[FileInfo]] = dfd.addCallback(_onsuccess)
|
||||
dfd2: Deferred[FileInfo | None] = dfd.addCallback(_onsuccess)
|
||||
dfd2.addErrback(lambda _: None)
|
||||
dfd2.addErrback(
|
||||
lambda f: logger.error(
|
||||
@ -684,8 +674,8 @@ class FilesPipeline(MediaPipeline):
|
||||
def file_path(
|
||||
self,
|
||||
request: Request,
|
||||
response: Optional[Response] = None,
|
||||
info: Optional[MediaPipeline.SpiderInfo] = None,
|
||||
response: Response | None = None,
|
||||
info: MediaPipeline.SpiderInfo | None = None,
|
||||
*,
|
||||
item: Any = None,
|
||||
) -> str:
|
||||
|
@ -11,7 +11,7 @@ import hashlib
|
||||
import warnings
|
||||
from contextlib import suppress
|
||||
from io import BytesIO
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union, cast
|
||||
from typing import TYPE_CHECKING, Any, cast
|
||||
|
||||
from itemadapter import ItemAdapter
|
||||
|
||||
@ -74,9 +74,9 @@ class ImagesPipeline(FilesPipeline):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
store_uri: Union[str, PathLike[str]],
|
||||
download_func: Optional[Callable[[Request, Spider], Response]] = None,
|
||||
settings: Union[Settings, dict[str, Any], None] = None,
|
||||
store_uri: str | PathLike[str],
|
||||
download_func: Callable[[Request, Spider], Response] | None = None,
|
||||
settings: Settings | dict[str, Any] | None = None,
|
||||
):
|
||||
try:
|
||||
from PIL import Image
|
||||
@ -120,7 +120,7 @@ class ImagesPipeline(FilesPipeline):
|
||||
resolve("IMAGES_THUMBS"), self.THUMBS
|
||||
)
|
||||
|
||||
self._deprecated_convert_image: Optional[bool] = None
|
||||
self._deprecated_convert_image: bool | None = None
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings: Settings) -> Self:
|
||||
@ -168,7 +168,7 @@ class ImagesPipeline(FilesPipeline):
|
||||
*,
|
||||
item: Any = None,
|
||||
) -> str:
|
||||
checksum: Optional[str] = None
|
||||
checksum: str | None = None
|
||||
for path, image, buf in self.get_images(response, request, info, item=item):
|
||||
if checksum is None:
|
||||
buf.seek(0)
|
||||
@ -235,8 +235,8 @@ class ImagesPipeline(FilesPipeline):
|
||||
def convert_image(
|
||||
self,
|
||||
image: Image.Image,
|
||||
size: Optional[tuple[int, int]] = None,
|
||||
response_body: Optional[BytesIO] = None,
|
||||
size: tuple[int, int] | None = None,
|
||||
response_body: BytesIO | None = None,
|
||||
) -> tuple[Image.Image, BytesIO]:
|
||||
if response_body is None:
|
||||
warnings.warn(
|
||||
@ -291,8 +291,8 @@ class ImagesPipeline(FilesPipeline):
|
||||
def file_path(
|
||||
self,
|
||||
request: Request,
|
||||
response: Optional[Response] = None,
|
||||
info: Optional[MediaPipeline.SpiderInfo] = None,
|
||||
response: Response | None = None,
|
||||
info: MediaPipeline.SpiderInfo | None = None,
|
||||
*,
|
||||
item: Any = None,
|
||||
) -> str:
|
||||
@ -303,8 +303,8 @@ class ImagesPipeline(FilesPipeline):
|
||||
self,
|
||||
request: Request,
|
||||
thumb_id: str,
|
||||
response: Optional[Response] = None,
|
||||
info: Optional[MediaPipeline.SpiderInfo] = None,
|
||||
response: Response | None = None,
|
||||
info: MediaPipeline.SpiderInfo | None = None,
|
||||
*,
|
||||
item: Any = None,
|
||||
) -> str:
|
||||
|
@ -9,7 +9,6 @@ from typing import (
|
||||
Any,
|
||||
Literal,
|
||||
NoReturn,
|
||||
Optional,
|
||||
TypedDict,
|
||||
TypeVar,
|
||||
Union,
|
||||
@ -44,7 +43,7 @@ _T = TypeVar("_T")
|
||||
class FileInfo(TypedDict):
|
||||
url: str
|
||||
path: str
|
||||
checksum: Optional[str]
|
||||
checksum: str | None
|
||||
status: str
|
||||
|
||||
|
||||
@ -64,15 +63,15 @@ class MediaPipeline(ABC):
|
||||
def __init__(self, spider: Spider):
|
||||
self.spider: Spider = spider
|
||||
self.downloading: set[bytes] = set()
|
||||
self.downloaded: dict[bytes, Union[FileInfo, Failure]] = {}
|
||||
self.downloaded: dict[bytes, FileInfo | Failure] = {}
|
||||
self.waiting: defaultdict[bytes, list[Deferred[FileInfo]]] = defaultdict(
|
||||
list
|
||||
)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
download_func: Optional[Callable[[Request, Spider], Response]] = None,
|
||||
settings: Union[Settings, dict[str, Any], None] = None,
|
||||
download_func: Callable[[Request, Spider], Response] | None = None,
|
||||
settings: Settings | dict[str, Any] | None = None,
|
||||
):
|
||||
self.download_func = download_func
|
||||
|
||||
@ -94,8 +93,8 @@ class MediaPipeline(ABC):
|
||||
def _key_for_pipe(
|
||||
self,
|
||||
key: str,
|
||||
base_class_name: Optional[str] = None,
|
||||
settings: Optional[Settings] = None,
|
||||
base_class_name: str | None = None,
|
||||
settings: Settings | None = None,
|
||||
) -> str:
|
||||
class_name = self.__class__.__name__
|
||||
formatted_key = f"{class_name.upper()}_{key}"
|
||||
@ -161,7 +160,7 @@ class MediaPipeline(ABC):
|
||||
|
||||
# Download request checking media_to_download hook output first
|
||||
info.downloading.add(fp)
|
||||
dfd: Deferred[Optional[FileInfo]] = mustbe_deferred(
|
||||
dfd: Deferred[FileInfo | None] = mustbe_deferred(
|
||||
self.media_to_download, request, info, item=item
|
||||
)
|
||||
dfd2: Deferred[FileInfo] = dfd.addCallback(
|
||||
@ -182,8 +181,8 @@ class MediaPipeline(ABC):
|
||||
request.meta["handle_httpstatus_all"] = True
|
||||
|
||||
def _check_media_to_download(
|
||||
self, result: Optional[FileInfo], request: Request, info: SpiderInfo, item: Any
|
||||
) -> Union[FileInfo, Deferred[FileInfo]]:
|
||||
self, result: FileInfo | None, request: Request, info: SpiderInfo, item: Any
|
||||
) -> FileInfo | Deferred[FileInfo]:
|
||||
if result is not None:
|
||||
return result
|
||||
dfd: Deferred[Response]
|
||||
@ -201,7 +200,7 @@ class MediaPipeline(ABC):
|
||||
return dfd2
|
||||
|
||||
def _cache_result_and_execute_waiters(
|
||||
self, result: Union[FileInfo, Failure], fp: bytes, info: SpiderInfo
|
||||
self, result: FileInfo | Failure, fp: bytes, info: SpiderInfo
|
||||
) -> None:
|
||||
if isinstance(result, Failure):
|
||||
# minimize cached information for failure
|
||||
@ -243,7 +242,7 @@ class MediaPipeline(ABC):
|
||||
@abstractmethod
|
||||
def media_to_download(
|
||||
self, request: Request, info: SpiderInfo, *, item: Any = None
|
||||
) -> Deferred[Optional[FileInfo]]:
|
||||
) -> Deferred[FileInfo | None]:
|
||||
"""Check request before starting download"""
|
||||
raise NotImplementedError()
|
||||
|
||||
@ -291,8 +290,8 @@ class MediaPipeline(ABC):
|
||||
def file_path(
|
||||
self,
|
||||
request: Request,
|
||||
response: Optional[Response] = None,
|
||||
info: Optional[SpiderInfo] = None,
|
||||
response: Response | None = None,
|
||||
info: SpiderInfo | None = None,
|
||||
*,
|
||||
item: Any = None,
|
||||
) -> str:
|
||||
|
@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Optional, Protocol, cast
|
||||
from typing import TYPE_CHECKING, Protocol, cast
|
||||
|
||||
from scrapy import Request
|
||||
from scrapy.core.downloader import Downloader
|
||||
@ -42,7 +42,7 @@ class QueueProtocol(Protocol):
|
||||
|
||||
def push(self, request: Request) -> None: ...
|
||||
|
||||
def pop(self) -> Optional[Request]: ...
|
||||
def pop(self) -> Request | None: ...
|
||||
|
||||
def close(self) -> None: ...
|
||||
|
||||
@ -96,7 +96,7 @@ class ScrapyPriorityQueue:
|
||||
self.downstream_queue_cls: type[QueueProtocol] = downstream_queue_cls
|
||||
self.key: str = key
|
||||
self.queues: dict[int, QueueProtocol] = {}
|
||||
self.curprio: Optional[int] = None
|
||||
self.curprio: int | None = None
|
||||
self.init_prios(startprios)
|
||||
|
||||
def init_prios(self, startprios: Iterable[int]) -> None:
|
||||
@ -127,7 +127,7 @@ class ScrapyPriorityQueue:
|
||||
if self.curprio is None or priority < self.curprio:
|
||||
self.curprio = priority
|
||||
|
||||
def pop(self) -> Optional[Request]:
|
||||
def pop(self) -> Request | None:
|
||||
if self.curprio is None:
|
||||
return None
|
||||
q = self.queues[self.curprio]
|
||||
@ -139,7 +139,7 @@ class ScrapyPriorityQueue:
|
||||
self.curprio = min(prios) if prios else None
|
||||
return m
|
||||
|
||||
def peek(self) -> Optional[Request]:
|
||||
def peek(self) -> Request | None:
|
||||
"""Returns the next object to be returned by :meth:`pop`,
|
||||
but without removing it from the queue.
|
||||
|
||||
@ -193,7 +193,7 @@ class DownloaderAwarePriorityQueue:
|
||||
crawler: Crawler,
|
||||
downstream_queue_cls: type[QueueProtocol],
|
||||
key: str,
|
||||
startprios: Optional[dict[str, Iterable[int]]] = None,
|
||||
startprios: dict[str, Iterable[int]] | None = None,
|
||||
) -> Self:
|
||||
return cls(crawler, downstream_queue_cls, key, startprios)
|
||||
|
||||
@ -202,7 +202,7 @@ class DownloaderAwarePriorityQueue:
|
||||
crawler: Crawler,
|
||||
downstream_queue_cls: type[QueueProtocol],
|
||||
key: str,
|
||||
slot_startprios: Optional[dict[str, Iterable[int]]] = None,
|
||||
slot_startprios: dict[str, Iterable[int]] | None = None,
|
||||
):
|
||||
if crawler.settings.getint("CONCURRENT_REQUESTS_PER_IP") != 0:
|
||||
raise ValueError(
|
||||
@ -239,7 +239,7 @@ class DownloaderAwarePriorityQueue:
|
||||
startprios,
|
||||
)
|
||||
|
||||
def pop(self) -> Optional[Request]:
|
||||
def pop(self) -> Request | None:
|
||||
stats = self._downloader_interface.stats(self.pqueues)
|
||||
|
||||
if not stats:
|
||||
@ -259,7 +259,7 @@ class DownloaderAwarePriorityQueue:
|
||||
queue = self.pqueues[slot]
|
||||
queue.push(request)
|
||||
|
||||
def peek(self) -> Optional[Request]:
|
||||
def peek(self) -> Request | None:
|
||||
"""Returns the next object to be returned by :meth:`pop`,
|
||||
but without removing it from the queue.
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from twisted.internet import defer
|
||||
from twisted.internet.base import ReactorBase, ThreadedResolver
|
||||
@ -128,7 +128,7 @@ class CachingHostnameResolver:
|
||||
resolutionReceiver: IResolutionReceiver,
|
||||
hostName: str,
|
||||
portNumber: int = 0,
|
||||
addressTypes: Optional[Sequence[type[IAddress]]] = None,
|
||||
addressTypes: Sequence[type[IAddress]] | None = None,
|
||||
transportSemantics: str = "TCP",
|
||||
) -> IHostResolution:
|
||||
try:
|
||||
|
@ -8,7 +8,7 @@ from __future__ import annotations
|
||||
from io import StringIO
|
||||
from mimetypes import MimeTypes
|
||||
from pkgutil import get_data
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from scrapy.http import Response
|
||||
from scrapy.utils.misc import load_object
|
||||
@ -58,7 +58,7 @@ class ResponseTypes:
|
||||
return self.classes.get(basetype, Response)
|
||||
|
||||
def from_content_type(
|
||||
self, content_type: Union[str, bytes], content_encoding: Optional[bytes] = None
|
||||
self, content_type: str | bytes, content_encoding: bytes | None = None
|
||||
) -> type[Response]:
|
||||
"""Return the most appropriate Response class from an HTTP Content-Type
|
||||
header"""
|
||||
@ -70,7 +70,7 @@ class ResponseTypes:
|
||||
return self.from_mimetype(mimetype)
|
||||
|
||||
def from_content_disposition(
|
||||
self, content_disposition: Union[str, bytes]
|
||||
self, content_disposition: str | bytes
|
||||
) -> type[Response]:
|
||||
try:
|
||||
filename = (
|
||||
@ -123,10 +123,10 @@ class ResponseTypes:
|
||||
|
||||
def from_args(
|
||||
self,
|
||||
headers: Optional[Mapping[bytes, bytes]] = None,
|
||||
url: Optional[str] = None,
|
||||
filename: Optional[str] = None,
|
||||
body: Optional[bytes] = None,
|
||||
headers: Mapping[bytes, bytes] | None = None,
|
||||
url: str | None = None,
|
||||
filename: str | None = None,
|
||||
body: bytes | None = None,
|
||||
) -> type[Response]:
|
||||
"""Guess the most appropriate Response class based on
|
||||
the given arguments."""
|
||||
|
@ -3,7 +3,7 @@ from __future__ import annotations
|
||||
import logging
|
||||
import sys
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
from typing import TYPE_CHECKING
|
||||
from warnings import warn
|
||||
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
@ -21,7 +21,7 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def decode_robotstxt(
|
||||
robotstxt_body: bytes, spider: Optional[Spider], to_native_str_type: bool = False
|
||||
robotstxt_body: bytes, spider: Spider | None, to_native_str_type: bool = False
|
||||
) -> str:
|
||||
try:
|
||||
if to_native_str_type:
|
||||
@ -57,7 +57,7 @@ class RobotParser(metaclass=ABCMeta):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def allowed(self, url: Union[str, bytes], user_agent: Union[str, bytes]) -> bool:
|
||||
def allowed(self, url: str | bytes, user_agent: str | bytes) -> bool:
|
||||
"""Return ``True`` if ``user_agent`` is allowed to crawl ``url``, otherwise return ``False``.
|
||||
|
||||
:param url: Absolute URL
|
||||
@ -70,10 +70,10 @@ class RobotParser(metaclass=ABCMeta):
|
||||
|
||||
|
||||
class PythonRobotParser(RobotParser):
|
||||
def __init__(self, robotstxt_body: bytes, spider: Optional[Spider]):
|
||||
def __init__(self, robotstxt_body: bytes, spider: Spider | None):
|
||||
from urllib.robotparser import RobotFileParser
|
||||
|
||||
self.spider: Optional[Spider] = spider
|
||||
self.spider: Spider | None = spider
|
||||
body_decoded = decode_robotstxt(robotstxt_body, spider, to_native_str_type=True)
|
||||
self.rp: RobotFileParser = RobotFileParser()
|
||||
self.rp.parse(body_decoded.splitlines())
|
||||
@ -84,18 +84,18 @@ class PythonRobotParser(RobotParser):
|
||||
o = cls(robotstxt_body, spider)
|
||||
return o
|
||||
|
||||
def allowed(self, url: Union[str, bytes], user_agent: Union[str, bytes]) -> bool:
|
||||
def allowed(self, url: str | bytes, user_agent: str | bytes) -> bool:
|
||||
user_agent = to_unicode(user_agent)
|
||||
url = to_unicode(url)
|
||||
return self.rp.can_fetch(user_agent, url)
|
||||
|
||||
|
||||
class ReppyRobotParser(RobotParser):
|
||||
def __init__(self, robotstxt_body: bytes, spider: Optional[Spider]):
|
||||
def __init__(self, robotstxt_body: bytes, spider: Spider | None):
|
||||
warn("ReppyRobotParser is deprecated.", ScrapyDeprecationWarning, stacklevel=2)
|
||||
from reppy.robots import Robots
|
||||
|
||||
self.spider: Optional[Spider] = spider
|
||||
self.spider: Spider | None = spider
|
||||
self.rp = Robots.parse("", robotstxt_body)
|
||||
|
||||
@classmethod
|
||||
@ -104,15 +104,15 @@ class ReppyRobotParser(RobotParser):
|
||||
o = cls(robotstxt_body, spider)
|
||||
return o
|
||||
|
||||
def allowed(self, url: Union[str, bytes], user_agent: Union[str, bytes]) -> bool:
|
||||
def allowed(self, url: str | bytes, user_agent: str | bytes) -> bool:
|
||||
return self.rp.allowed(url, user_agent)
|
||||
|
||||
|
||||
class RerpRobotParser(RobotParser):
|
||||
def __init__(self, robotstxt_body: bytes, spider: Optional[Spider]):
|
||||
def __init__(self, robotstxt_body: bytes, spider: Spider | None):
|
||||
from robotexclusionrulesparser import RobotExclusionRulesParser
|
||||
|
||||
self.spider: Optional[Spider] = spider
|
||||
self.spider: Spider | None = spider
|
||||
self.rp: RobotExclusionRulesParser = RobotExclusionRulesParser()
|
||||
body_decoded = decode_robotstxt(robotstxt_body, spider)
|
||||
self.rp.parse(body_decoded)
|
||||
@ -123,17 +123,17 @@ class RerpRobotParser(RobotParser):
|
||||
o = cls(robotstxt_body, spider)
|
||||
return o
|
||||
|
||||
def allowed(self, url: Union[str, bytes], user_agent: Union[str, bytes]) -> bool:
|
||||
def allowed(self, url: str | bytes, user_agent: str | bytes) -> bool:
|
||||
user_agent = to_unicode(user_agent)
|
||||
url = to_unicode(url)
|
||||
return self.rp.is_allowed(user_agent, url)
|
||||
|
||||
|
||||
class ProtegoRobotParser(RobotParser):
|
||||
def __init__(self, robotstxt_body: bytes, spider: Optional[Spider]):
|
||||
def __init__(self, robotstxt_body: bytes, spider: Spider | None):
|
||||
from protego import Protego
|
||||
|
||||
self.spider: Optional[Spider] = spider
|
||||
self.spider: Spider | None = spider
|
||||
body_decoded = decode_robotstxt(robotstxt_body, spider)
|
||||
self.rp = Protego.parse(body_decoded)
|
||||
|
||||
@ -143,7 +143,7 @@ class ProtegoRobotParser(RobotParser):
|
||||
o = cls(robotstxt_body, spider)
|
||||
return o
|
||||
|
||||
def allowed(self, url: Union[str, bytes], user_agent: Union[str, bytes]) -> bool:
|
||||
def allowed(self, url: str | bytes, user_agent: str | bytes) -> bool:
|
||||
user_agent = to_unicode(user_agent)
|
||||
url = to_unicode(url)
|
||||
return self.rp.can_fetch(url, user_agent)
|
||||
|
@ -2,7 +2,9 @@
|
||||
XPath selectors based on lxml
|
||||
"""
|
||||
|
||||
from typing import Any, Optional, Union
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from parsel import Selector as _ParselSelector
|
||||
|
||||
@ -16,13 +18,13 @@ __all__ = ["Selector", "SelectorList"]
|
||||
_NOT_SET = object()
|
||||
|
||||
|
||||
def _st(response: Optional[TextResponse], st: Optional[str]) -> str:
|
||||
def _st(response: TextResponse | None, st: str | None) -> str:
|
||||
if st is None:
|
||||
return "xml" if isinstance(response, XmlResponse) else "html"
|
||||
return st
|
||||
|
||||
|
||||
def _response_from_text(text: Union[str, bytes], st: Optional[str]) -> TextResponse:
|
||||
def _response_from_text(text: str | bytes, st: str | None) -> TextResponse:
|
||||
rt: type[TextResponse] = XmlResponse if st == "xml" else HtmlResponse
|
||||
return rt(url="about:blank", encoding="utf-8", body=to_bytes(text, "utf-8"))
|
||||
|
||||
@ -71,10 +73,10 @@ class Selector(_ParselSelector, object_ref):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
response: Optional[TextResponse] = None,
|
||||
text: Optional[str] = None,
|
||||
type: Optional[str] = None,
|
||||
root: Optional[Any] = _NOT_SET,
|
||||
response: TextResponse | None = None,
|
||||
text: str | None = None,
|
||||
type: str | None = None,
|
||||
root: Any | None = _NOT_SET,
|
||||
**kwargs: Any,
|
||||
):
|
||||
if response is not None and text is not None:
|
||||
|
@@ -5,7 +5,7 @@ import json
 from collections.abc import Iterable, Iterator, Mapping, MutableMapping
 from importlib import import_module
 from pprint import pformat
-from typing import TYPE_CHECKING, Any, Optional, Union, cast
+from typing import TYPE_CHECKING, Any, Union, cast

 from scrapy.settings import default_settings

@@ -35,7 +35,7 @@ SETTINGS_PRIORITIES: dict[str, int] = {
 }


-def get_settings_priority(priority: Union[int, str]) -> int:
+def get_settings_priority(priority: int | str) -> int:
     """
     Small helper function that looks up a given string priority in the
     :attr:`~scrapy.settings.SETTINGS_PRIORITIES` dictionary and returns its
@@ -97,9 +97,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):

     __default = object()

-    def __init__(
-        self, values: _SettingsInputT = None, priority: Union[int, str] = "project"
-    ):
+    def __init__(self, values: _SettingsInputT = None, priority: int | str = "project"):
         self.frozen: bool = False
         self.attributes: dict[_SettingsKeyT, SettingsAttribute] = {}
         if values:
@@ -180,7 +178,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
         return float(self.get(name, default))

     def getlist(
-        self, name: _SettingsKeyT, default: Optional[list[Any]] = None
+        self, name: _SettingsKeyT, default: list[Any] | None = None
     ) -> list[Any]:
         """
         Get a setting value as a list. If the setting original type is a list, a
@@ -201,7 +199,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
         return list(value)

     def getdict(
-        self, name: _SettingsKeyT, default: Optional[dict[Any, Any]] = None
+        self, name: _SettingsKeyT, default: dict[Any, Any] | None = None
     ) -> dict[Any, Any]:
         """
         Get a setting value as a dictionary. If the setting original type is a
@@ -226,8 +224,8 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
     def getdictorlist(
         self,
         name: _SettingsKeyT,
-        default: Union[dict[Any, Any], list[Any], tuple[Any], None] = None,
-    ) -> Union[dict[Any, Any], list[Any]]:
+        default: dict[Any, Any] | list[Any] | tuple[Any] | None = None,
+    ) -> dict[Any, Any] | list[Any]:
         """Get a setting value as either a :class:`dict` or a :class:`list`.

         If the setting is already a dict or a list, a copy of it will be
@@ -278,7 +276,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
             compbs.update(self[name])
         return compbs

-    def getpriority(self, name: _SettingsKeyT) -> Optional[int]:
+    def getpriority(self, name: _SettingsKeyT) -> int | None:
         """
         Return the current numerical priority value of a setting, or ``None`` if
         the given ``name`` does not exist.
@@ -305,7 +303,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
         self.set(name, value)

     def set(
-        self, name: _SettingsKeyT, value: Any, priority: Union[int, str] = "project"
+        self, name: _SettingsKeyT, value: Any, priority: int | str = "project"
     ) -> None:
         """
         Store a key/value attribute with a given priority.
@@ -338,7 +336,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
         self,
         name: _SettingsKeyT,
         default: Any = None,
-        priority: Union[int, str] = "project",
+        priority: int | str = "project",
     ) -> Any:
         if name not in self:
             self.set(name, default, priority)
@@ -346,13 +344,11 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):

         return self.attributes[name].value

-    def setdict(
-        self, values: _SettingsInputT, priority: Union[int, str] = "project"
-    ) -> None:
+    def setdict(self, values: _SettingsInputT, priority: int | str = "project") -> None:
         self.update(values, priority)

     def setmodule(
-        self, module: Union[ModuleType, str], priority: Union[int, str] = "project"
+        self, module: ModuleType | str, priority: int | str = "project"
     ) -> None:
         """
         Store settings from a module with a given priority.
@@ -376,7 +372,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
                 self.set(key, getattr(module, key), priority)

     # BaseSettings.update() doesn't support all inputs that MutableMapping.update() supports
-    def update(self, values: _SettingsInputT, priority: Union[int, str] = "project") -> None:  # type: ignore[override]
+    def update(self, values: _SettingsInputT, priority: int | str = "project") -> None:  # type: ignore[override]
         """
         Store key/value pairs with a given priority.

@@ -409,9 +405,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
                 for name, value in values.items():
                     self.set(name, value, priority)

-    def delete(
-        self, name: _SettingsKeyT, priority: Union[int, str] = "project"
-    ) -> None:
+    def delete(self, name: _SettingsKeyT, priority: int | str = "project") -> None:
         if name not in self:
             raise KeyError(name)
         self._assert_mutability()
@@ -525,9 +519,7 @@ class Settings(BaseSettings):
     described on :ref:`topics-settings-ref` already populated.
     """

-    def __init__(
-        self, values: _SettingsInputT = None, priority: Union[int, str] = "project"
-    ):
+    def __init__(self, values: _SettingsInputT = None, priority: int | str = "project"):
         # Do not pass kwarg values here. We don't want to promote user-defined
         # dicts, and we want to update, not replace, default dicts with the
         # values given by the user
@@ -8,7 +8,7 @@ from __future__ import annotations

 import os
 import signal
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any

 from itemadapter import is_item
 from twisted.internet import defer, threads
@@ -37,25 +37,25 @@ class Shell:
     def __init__(
         self,
         crawler: Crawler,
-        update_vars: Optional[Callable[[dict[str, Any]], None]] = None,
-        code: Optional[str] = None,
+        update_vars: Callable[[dict[str, Any]], None] | None = None,
+        code: str | None = None,
     ):
         self.crawler: Crawler = crawler
         self.update_vars: Callable[[dict[str, Any]], None] = update_vars or (
             lambda x: None
         )
         self.item_class: type = load_object(crawler.settings["DEFAULT_ITEM_CLASS"])
-        self.spider: Optional[Spider] = None
+        self.spider: Spider | None = None
         self.inthread: bool = not threadable.isInIOThread()
-        self.code: Optional[str] = code
+        self.code: str | None = code
         self.vars: dict[str, Any] = {}

     def start(
         self,
-        url: Optional[str] = None,
-        request: Optional[Request] = None,
-        response: Optional[Response] = None,
-        spider: Optional[Spider] = None,
+        url: str | None = None,
+        request: Request | None = None,
+        response: Response | None = None,
+        spider: Spider | None = None,
         redirect: bool = True,
     ) -> None:
         # disable accidental Ctrl-C key press from shutting down the engine
@@ -97,9 +97,7 @@ class Shell:
             self.vars, shells=shells, banner=self.vars.pop("banner", "")
         )

-    def _schedule(
-        self, request: Request, spider: Optional[Spider]
-    ) -> defer.Deferred[Any]:
+    def _schedule(self, request: Request, spider: Spider | None) -> defer.Deferred[Any]:
         if is_asyncio_reactor_installed():
             # set the asyncio event loop for the current thread
             event_loop_path = self.crawler.settings["ASYNCIO_EVENT_LOOP"]
@@ -111,7 +109,7 @@
         self.crawler.engine.crawl(request)
         return d

-    def _open_spider(self, request: Request, spider: Optional[Spider]) -> Spider:
+    def _open_spider(self, request: Request, spider: Spider | None) -> Spider:
         if self.spider:
             return self.spider

@@ -126,8 +124,8 @@

     def fetch(
         self,
-        request_or_url: Union[Request, str],
-        spider: Optional[Spider] = None,
+        request_or_url: Request | str,
+        spider: Spider | None = None,
         redirect: bool = True,
         **kwargs: Any,
     ) -> None:
@@ -155,9 +153,9 @@

     def populate_vars(
         self,
-        response: Optional[Response] = None,
-        request: Optional[Request] = None,
-        spider: Optional[Spider] = None,
+        response: Response | None = None,
+        request: Request | None = None,
+        spider: Spider | None = None,
     ) -> None:
         import scrapy

@@ -7,7 +7,7 @@ See documentation in docs/topics/spider-middleware.rst
 from __future__ import annotations

 import logging
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any

 from scrapy.exceptions import IgnoreRequest

@@ -65,7 +65,7 @@ class HttpErrorMiddleware:

     def process_spider_exception(
         self, response: Response, exception: Exception, spider: Spider
-    ) -> Optional[Iterable[Any]]:
+    ) -> Iterable[Any] | None:
         if isinstance(exception, HttpError):
             assert spider.crawler.stats
             spider.crawler.stats.inc_value("httperror/response_ignored_count")
@@ -6,7 +6,7 @@ originated it.
 from __future__ import annotations

 import warnings
-from typing import TYPE_CHECKING, Any, Optional, Union, cast
+from typing import TYPE_CHECKING, Any, cast
 from urllib.parse import urlparse

 from w3lib.url import safe_url_string
@@ -50,20 +50,20 @@ class ReferrerPolicy:
     NOREFERRER_SCHEMES: tuple[str, ...] = LOCAL_SCHEMES
     name: str

-    def referrer(self, response_url: str, request_url: str) -> Optional[str]:
+    def referrer(self, response_url: str, request_url: str) -> str | None:
         raise NotImplementedError()

-    def stripped_referrer(self, url: str) -> Optional[str]:
+    def stripped_referrer(self, url: str) -> str | None:
         if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
             return self.strip_url(url)
         return None

-    def origin_referrer(self, url: str) -> Optional[str]:
+    def origin_referrer(self, url: str) -> str | None:
         if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
             return self.origin(url)
         return None

-    def strip_url(self, url: str, origin_only: bool = False) -> Optional[str]:
+    def strip_url(self, url: str, origin_only: bool = False) -> str | None:
         """
         https://www.w3.org/TR/referrer-policy/#strip-url

@@ -87,7 +87,7 @@ class ReferrerPolicy:
             origin_only=origin_only,
         )

-    def origin(self, url: str) -> Optional[str]:
+    def origin(self, url: str) -> str | None:
         """Return serialized origin (scheme, host, path) for a request or response URL."""
         return self.strip_url(url, origin_only=True)

@@ -113,7 +113,7 @@ class NoReferrerPolicy(ReferrerPolicy):

     name: str = POLICY_NO_REFERRER

-    def referrer(self, response_url: str, request_url: str) -> Optional[str]:
+    def referrer(self, response_url: str, request_url: str) -> str | None:
         return None


@@ -134,7 +134,7 @@ class NoReferrerWhenDowngradePolicy(ReferrerPolicy):

     name: str = POLICY_NO_REFERRER_WHEN_DOWNGRADE

-    def referrer(self, response_url: str, request_url: str) -> Optional[str]:
+    def referrer(self, response_url: str, request_url: str) -> str | None:
         if not self.tls_protected(response_url) or self.tls_protected(request_url):
             return self.stripped_referrer(response_url)
         return None
@@ -153,7 +153,7 @@ class SameOriginPolicy(ReferrerPolicy):

     name: str = POLICY_SAME_ORIGIN

-    def referrer(self, response_url: str, request_url: str) -> Optional[str]:
+    def referrer(self, response_url: str, request_url: str) -> str | None:
         if self.origin(response_url) == self.origin(request_url):
             return self.stripped_referrer(response_url)
         return None
@@ -171,7 +171,7 @@ class OriginPolicy(ReferrerPolicy):

     name: str = POLICY_ORIGIN

-    def referrer(self, response_url: str, request_url: str) -> Optional[str]:
+    def referrer(self, response_url: str, request_url: str) -> str | None:
         return self.origin_referrer(response_url)


@@ -191,7 +191,7 @@ class StrictOriginPolicy(ReferrerPolicy):

     name: str = POLICY_STRICT_ORIGIN

-    def referrer(self, response_url: str, request_url: str) -> Optional[str]:
+    def referrer(self, response_url: str, request_url: str) -> str | None:
         if (
             self.tls_protected(response_url)
             and self.potentially_trustworthy(request_url)
@@ -215,7 +215,7 @@ class OriginWhenCrossOriginPolicy(ReferrerPolicy):

     name: str = POLICY_ORIGIN_WHEN_CROSS_ORIGIN

-    def referrer(self, response_url: str, request_url: str) -> Optional[str]:
+    def referrer(self, response_url: str, request_url: str) -> str | None:
         origin = self.origin(response_url)
         if origin == self.origin(request_url):
             return self.stripped_referrer(response_url)
@@ -242,7 +242,7 @@ class StrictOriginWhenCrossOriginPolicy(ReferrerPolicy):

     name: str = POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN

-    def referrer(self, response_url: str, request_url: str) -> Optional[str]:
+    def referrer(self, response_url: str, request_url: str) -> str | None:
         origin = self.origin(response_url)
         if origin == self.origin(request_url):
             return self.stripped_referrer(response_url)
@@ -271,7 +271,7 @@ class UnsafeUrlPolicy(ReferrerPolicy):

     name: str = POLICY_UNSAFE_URL

-    def referrer(self, response_url: str, request_url: str) -> Optional[str]:
+    def referrer(self, response_url: str, request_url: str) -> str | None:
         return self.stripped_referrer(response_url)


@@ -307,7 +307,7 @@ _policy_classes[""] = NoReferrerWhenDowngradePolicy

 def _load_policy_class(
     policy: str, warning_only: bool = False
-) -> Optional[type[ReferrerPolicy]]:
+) -> type[ReferrerPolicy] | None:
     """
     Expect a string for the path to the policy class,
     otherwise try to interpret the string as a standard value
@@ -331,7 +331,7 @@ def _load_policy_class(


 class RefererMiddleware:
-    def __init__(self, settings: Optional[BaseSettings] = None):
+    def __init__(self, settings: BaseSettings | None = None):
         self.default_policy: type[ReferrerPolicy] = DefaultReferrerPolicy
         if settings is not None:
             settings_policy = _load_policy_class(settings.get("REFERRER_POLICY"))
@@ -349,9 +349,7 @@ class RefererMiddleware:

         return mw

-    def policy(
-        self, resp_or_url: Union[Response, str], request: Request
-    ) -> ReferrerPolicy:
+    def policy(self, resp_or_url: Response | str, request: Request) -> ReferrerPolicy:
         """
         Determine Referrer-Policy to use from a parent Response (or URL),
         and a Request to be sent.
@@ -7,7 +7,7 @@ See documentation in docs/topics/spiders.rst
 from __future__ import annotations

 import logging
-from typing import TYPE_CHECKING, Any, Optional, cast
+from typing import TYPE_CHECKING, Any, cast

 from scrapy import signals
 from scrapy.http import Request, Response
@@ -34,9 +34,9 @@ class Spider(object_ref):
     """

     name: str
-    custom_settings: Optional[dict[_SettingsKeyT, Any]] = None
+    custom_settings: dict[_SettingsKeyT, Any] | None = None

-    def __init__(self, name: Optional[str] = None, **kwargs: Any):
+    def __init__(self, name: str | None = None, **kwargs: Any):
         if name is not None:
             self.name: str = name
         elif not getattr(self, "name", None):
@@ -103,10 +103,10 @@
         return url_is_from_spider(request.url, cls)

     @staticmethod
-    def close(spider: Spider, reason: str) -> Optional[Deferred[None]]:
+    def close(spider: Spider, reason: str) -> Deferred[None] | None:
         closed = getattr(spider, "closed", None)
         if callable(closed):
-            return cast("Optional[Deferred[None]]", closed(reason))
+            return cast("Deferred[None] | None", closed(reason))
         return None

     def __repr__(self) -> str:
@@ -9,7 +9,7 @@ from __future__ import annotations

 import copy
 from collections.abc import AsyncIterable, Awaitable, Callable
-from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
+from typing import TYPE_CHECKING, Any, Optional, TypeVar, cast

 from twisted.python.failure import Failure

@@ -39,15 +39,11 @@ def _identity(x: _T) -> _T:
     return x


-def _identity_process_request(
-    request: Request, response: Response
-) -> Optional[Request]:
+def _identity_process_request(request: Request, response: Response) -> Request | None:
     return request


-def _get_method(
-    method: Union[Callable, str, None], spider: Spider
-) -> Optional[Callable]:
+def _get_method(method: Callable | str | None, spider: Spider) -> Callable | None:
     if callable(method):
         return method
     if isinstance(method, str):
@@ -61,20 +57,20 @@ _default_link_extractor = LinkExtractor()
 class Rule:
     def __init__(
         self,
-        link_extractor: Optional[LinkExtractor] = None,
-        callback: Union[CallbackT, str, None] = None,
-        cb_kwargs: Optional[dict[str, Any]] = None,
-        follow: Optional[bool] = None,
-        process_links: Union[ProcessLinksT, str, None] = None,
-        process_request: Union[ProcessRequestT, str, None] = None,
-        errback: Union[Callable[[Failure], Any], str, None] = None,
+        link_extractor: LinkExtractor | None = None,
+        callback: CallbackT | str | None = None,
+        cb_kwargs: dict[str, Any] | None = None,
+        follow: bool | None = None,
+        process_links: ProcessLinksT | str | None = None,
+        process_request: ProcessRequestT | str | None = None,
+        errback: Callable[[Failure], Any] | str | None = None,
     ):
         self.link_extractor: LinkExtractor = link_extractor or _default_link_extractor
-        self.callback: Union[CallbackT, str, None] = callback
-        self.errback: Union[Callable[[Failure], Any], str, None] = errback
+        self.callback: CallbackT | str | None = callback
+        self.errback: Callable[[Failure], Any] | str | None = errback
         self.cb_kwargs: dict[str, Any] = cb_kwargs or {}
-        self.process_links: Union[ProcessLinksT, str] = process_links or _identity
-        self.process_request: Union[ProcessRequestT, str] = (
+        self.process_links: ProcessLinksT | str = process_links or _identity
+        self.process_request: ProcessRequestT | str = (
             process_request or _identity_process_request
         )
         self.follow: bool = follow if follow is not None else not callback
@@ -124,7 +120,7 @@ class CrawlSpider(Spider):
             meta={"rule": rule_index, "link_text": link.text},
         )

-    def _requests_to_follow(self, response: Response) -> Iterable[Optional[Request]]:
+    def _requests_to_follow(self, response: Response) -> Iterable[Request | None]:
         if not isinstance(response, HtmlResponse):
             return
         seen: set[Link] = set()
@@ -157,7 +153,7 @@
     async def _parse_response(
         self,
         response: Response,
-        callback: Optional[CallbackT],
+        callback: CallbackT | None,
         cb_kwargs: dict[str, Any],
         follow: bool = True,
     ) -> AsyncIterable[Any]:
@@ -176,7 +172,7 @@
                 yield request_or_item

     def _handle_failure(
-        self, failure: Failure, errback: Optional[Callable[[Failure], Any]]
+        self, failure: Failure, errback: Callable[[Failure], Any] | None
     ) -> Iterable[Any]:
         if errback:
             results = errback(failure) or ()
@@ -7,7 +7,7 @@ See documentation in docs/topics/spiders.rst

 from __future__ import annotations

-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any

 from scrapy.exceptions import NotConfigured, NotSupported
 from scrapy.http import Response, TextResponse
@@ -117,13 +117,13 @@ class CSVFeedSpider(Spider):
     and the file's headers.
     """

-    delimiter: Optional[str] = (
+    delimiter: str | None = (
         None  # When this is None, python's csv module's default delimiter is used
     )
-    quotechar: Optional[str] = (
+    quotechar: str | None = (
         None  # When this is None, python's csv module's default quotechar is used
     )
-    headers: Optional[list[str]] = None
+    headers: list[str] | None = None

     def process_results(
         self, response: Response, results: Iterable[Any]
@@ -1,7 +1,7 @@
 from __future__ import annotations

 from collections.abc import Iterable
-from typing import TYPE_CHECKING, Any, Optional, cast
+from typing import TYPE_CHECKING, Any, cast

 from scrapy import Request
 from scrapy.spiders import Spider
@@ -18,7 +18,7 @@ class InitSpider(Spider):
         self._postinit_reqs: Iterable[Request] = super().start_requests()
         return cast(Iterable[Request], iterate_spider_output(self.init_request()))

-    def initialized(self, response: Optional[Response] = None) -> Any:
+    def initialized(self, response: Response | None = None) -> Any:
         """This method must be set as the callback of your last initialization
         request. See self.init_request() docstring for more info.
         """
@@ -2,7 +2,7 @@ from __future__ import annotations

 import logging
 import re
-from typing import TYPE_CHECKING, Any, Optional, Union, cast
+from typing import TYPE_CHECKING, Any, cast

 from scrapy.http import Request, Response, XmlResponse
 from scrapy.spiders import Spider
@@ -24,10 +24,10 @@ logger = logging.getLogger(__name__)

 class SitemapSpider(Spider):
     sitemap_urls: Sequence[str] = ()
-    sitemap_rules: Sequence[
-        tuple[Union[re.Pattern[str], str], Union[str, CallbackT]]
-    ] = [("", "parse")]
-    sitemap_follow: Sequence[Union[re.Pattern[str], str]] = [""]
+    sitemap_rules: Sequence[tuple[re.Pattern[str] | str, str | CallbackT]] = [
+        ("", "parse")
+    ]
+    sitemap_follow: Sequence[re.Pattern[str] | str] = [""]
     sitemap_alternate_links: bool = False
     _max_size: int
     _warn_size: int
@@ -93,7 +93,7 @@ class SitemapSpider(Spider):
                             yield Request(loc, callback=c)
                             break

-    def _get_sitemap_body(self, response: Response) -> Optional[bytes]:
+    def _get_sitemap_body(self, response: Response) -> bytes | None:
         """Return the sitemap body contained in the given response,
         or None if the response is not a sitemap.
         """
@@ -127,7 +127,7 @@
         return None


-def regex(x: Union[re.Pattern[str], str]) -> re.Pattern[str]:
+def regex(x: re.Pattern[str] | str) -> re.Pattern[str]:
     if isinstance(x, str):
         return re.compile(x)
     return x
@@ -7,7 +7,7 @@ from __future__ import annotations
 import marshal
 import pickle  # nosec
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any

 from queuelib import queue

@@ -26,7 +26,7 @@ if TYPE_CHECKING:

 def _with_mkdir(queue_class: type[queue.BaseQueue]) -> type[queue.BaseQueue]:
     class DirectoriesCreated(queue_class):  # type: ignore[valid-type,misc]
-        def __init__(self, path: Union[str, PathLike], *args: Any, **kwargs: Any):
+        def __init__(self, path: str | PathLike, *args: Any, **kwargs: Any):
            dirname = Path(path).parent
            if not dirname.exists():
                dirname.mkdir(parents=True, exist_ok=True)
@@ -45,13 +45,13 @@ def _serializable_queue(
            s = serialize(obj)
            super().push(s)

-        def pop(self) -> Optional[Any]:
+        def pop(self) -> Any | None:
            s = super().pop()
            if s:
                return deserialize(s)
            return None

-        def peek(self) -> Optional[Any]:
+        def peek(self) -> Any | None:
            """Returns the next object to be returned by :meth:`pop`,
            but without removing it from the queue.

@@ -89,13 +89,13 @@ def _scrapy_serialization_queue(
            request_dict = request.to_dict(spider=self.spider)
            super().push(request_dict)

-        def pop(self) -> Optional[Request]:
+        def pop(self) -> Request | None:
            request = super().pop()
            if not request:
                return None
            return request_from_dict(request, spider=self.spider)

-        def peek(self) -> Optional[Request]:
+        def peek(self) -> Request | None:
            """Returns the next object to be returned by :meth:`pop`,
            but without removing it from the queue.

@@ -118,7 +118,7 @@ def _scrapy_non_serialization_queue(
        def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any) -> Self:
            return cls()

-        def peek(self) -> Optional[Any]:
+        def peek(self) -> Any | None:
            """Returns the next object to be returned by :meth:`pop`,
            but without removing it from the queue.

@@ -6,7 +6,7 @@ from __future__ import annotations

 import logging
 import pprint
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any

 if TYPE_CHECKING:
     from scrapy import Spider
@@ -25,32 +25,32 @@ class StatsCollector:
         self._stats: StatsT = {}

     def get_value(
-        self, key: str, default: Any = None, spider: Optional[Spider] = None
+        self, key: str, default: Any = None, spider: Spider | None = None
     ) -> Any:
         return self._stats.get(key, default)

-    def get_stats(self, spider: Optional[Spider] = None) -> StatsT:
+    def get_stats(self, spider: Spider | None = None) -> StatsT:
         return self._stats

-    def set_value(self, key: str, value: Any, spider: Optional[Spider] = None) -> None:
+    def set_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
         self._stats[key] = value

-    def set_stats(self, stats: StatsT, spider: Optional[Spider] = None) -> None:
+    def set_stats(self, stats: StatsT, spider: Spider | None = None) -> None:
         self._stats = stats

     def inc_value(
-        self, key: str, count: int = 1, start: int = 0, spider: Optional[Spider] = None
+        self, key: str, count: int = 1, start: int = 0, spider: Spider | None = None
     ) -> None:
         d = self._stats
         d[key] = d.setdefault(key, start) + count

-    def max_value(self, key: str, value: Any, spider: Optional[Spider] = None) -> None:
+    def max_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
         self._stats[key] = max(self._stats.setdefault(key, value), value)

-    def min_value(self, key: str, value: Any, spider: Optional[Spider] = None) -> None:
+    def min_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
         self._stats[key] = min(self._stats.setdefault(key, value), value)

-    def clear_stats(self, spider: Optional[Spider] = None) -> None:
+    def clear_stats(self, spider: Spider | None = None) -> None:
         self._stats.clear()

     def open_spider(self, spider: Spider) -> None:
@@ -79,23 +79,23 @@ class MemoryStatsCollector(StatsCollector):

 class DummyStatsCollector(StatsCollector):
     def get_value(
-        self, key: str, default: Any = None, spider: Optional[Spider] = None
+        self, key: str, default: Any = None, spider: Spider | None = None
     ) -> Any:
         return default

-    def set_value(self, key: str, value: Any, spider: Optional[Spider] = None) -> None:
+    def set_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
         pass

-    def set_stats(self, stats: StatsT, spider: Optional[Spider] = None) -> None:
+    def set_stats(self, stats: StatsT, spider: Spider | None = None) -> None:
         pass

     def inc_value(
-        self, key: str, count: int = 1, start: int = 0, spider: Optional[Spider] = None
+        self, key: str, count: int = 1, start: int = 0, spider: Spider | None = None
     ) -> None:
         pass

-    def max_value(self, key: str, value: Any, spider: Optional[Spider] = None) -> None:
+    def max_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
         pass

-    def min_value(self, key: str, value: Any, spider: Optional[Spider] = None) -> None:
+    def min_value(self, key: str, value: Any, spider: Spider | None = None) -> None:
         pass
@@ -1,5 +1,7 @@
+from __future__ import annotations
+
 from collections.abc import AsyncGenerator, AsyncIterable, Iterable
-from typing import TypeVar, Union
+from typing import TypeVar

 _T = TypeVar("_T")

@@ -12,8 +14,8 @@ async def collect_asyncgen(result: AsyncIterable[_T]) -> list[_T]:


 async def as_async_generator(
-    it: Union[Iterable[_T], AsyncIterable[_T]]
-) -> AsyncGenerator[_T, None]:
+    it: Iterable[_T] | AsyncIterable[_T],
+) -> AsyncGenerator[_T]:
     """Wraps an iterable (sync or async) into an async generator."""
     if isinstance(it, AsyncIterable):
         async for r in it:
@@ -8,7 +8,7 @@ from collections.abc import Iterable
 from configparser import ConfigParser
 from operator import itemgetter
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Optional, Union, cast
+from typing import TYPE_CHECKING, Any, Callable, cast

 from scrapy.exceptions import ScrapyDeprecationWarning, UsageError
 from scrapy.settings import BaseSettings
@@ -33,7 +33,7 @@ def build_component_list(
             "please update your settings"
         )

-    def _map_keys(compdict: Mapping[Any, Any]) -> Union[BaseSettings, dict[Any, Any]]:
+    def _map_keys(compdict: Mapping[Any, Any]) -> BaseSettings | dict[Any, Any]:
         if isinstance(compdict, BaseSettings):
             compbs = BaseSettings()
             for k, v in compdict.items():
@@ -86,8 +86,8 @@ def arglist_to_dict(arglist: list[str]) -> dict[str, str]:


 def closest_scrapy_cfg(
-    path: Union[str, os.PathLike] = ".",
-    prevpath: Optional[Union[str, os.PathLike]] = None,
+    path: str | os.PathLike = ".",
+    prevpath: str | os.PathLike | None = None,
 ) -> str:
     """Return the path to the closest scrapy.cfg file by traversing the current
     directory and its parents
@@ -159,8 +159,8 @@ def feed_complete_default_values_from_settings(
 def feed_process_params_from_cli(
     settings: BaseSettings,
     output: list[str],
-    output_format: Optional[str] = None,
-    overwrite_output: Optional[list[str]] = None,
+    output_format: str | None = None,
+    overwrite_output: list[str] | None = None,
 ) -> dict[str, dict[str, Any]]:
     """
     Receives feed export params (from the 'crawl' or 'runspider' commands),
@@ -2,7 +2,7 @@ from __future__ import annotations

 from collections.abc import Callable
 from functools import wraps
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any

 if TYPE_CHECKING:
     from collections.abc import Iterable
@@ -100,7 +100,7 @@ DEFAULT_PYTHON_SHELLS: KnownShellsT = {


 def get_shell_embed_func(
-    shells: Optional[Iterable[str]] = None, known_shells: Optional[KnownShellsT] = None
+    shells: Iterable[str] | None = None, known_shells: KnownShellsT | None = None
 ) -> Any:
     """Return the first acceptable shell-embed function
     from a given list of shell names.
@@ -120,9 +120,9 @@ def get_shell_embed_func(


 def start_python_console(
-    namespace: Optional[dict[str, Any]] = None,
+    namespace: dict[str, Any] | None = None,
     banner: str = "",
-    shells: Optional[Iterable[str]] = None,
+    shells: Iterable[str] | None = None,
 ) -> None:
     """Start Python console bound to the given namespace.
     Readline support and tab completion will be used on Unix, if available.
@@ -4,7 +4,7 @@ import argparse
 import warnings
 from http.cookies import SimpleCookie
 from shlex import split
-from typing import TYPE_CHECKING, Any, NoReturn, Optional, Union
+from typing import TYPE_CHECKING, Any, NoReturn
 from urllib.parse import urlparse

 from w3lib.http import basic_auth_header
@@ -18,8 +18,8 @@ class DataAction(argparse.Action):
         self,
         parser: argparse.ArgumentParser,
         namespace: argparse.Namespace,
-        values: Union[str, Sequence[Any], None],
-        option_string: Optional[str] = None,
+        values: str | Sequence[Any] | None,
+        option_string: str | None = None,
     ) -> None:
         value = str(values)
         if value.startswith("$"):
@@ -12,7 +12,7 @@ import warnings
 import weakref
 from collections import OrderedDict
 from collections.abc import Mapping
-from typing import TYPE_CHECKING, Any, AnyStr, Optional, TypeVar, Union
+from typing import TYPE_CHECKING, Any, AnyStr, TypeVar

 from scrapy.exceptions import ScrapyDeprecationWarning

@@ -44,7 +44,7 @@ class CaselessDict(dict):

     def __init__(
         self,
-        seq: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
+        seq: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]] | None = None,
     ):
         super().__init__()
         if seq:
@@ -84,7 +84,7 @@ class CaselessDict(dict):
         return dict.setdefault(self, self.normkey(key), self.normvalue(def_val))  # type: ignore[arg-type]

     # doesn't fully implement MutableMapping.update()
-    def update(self, seq: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]]]) -> None:  # type: ignore[override]
+    def update(self, seq: Mapping[AnyStr, Any] | Iterable[tuple[AnyStr, Any]]) -> None:  # type: ignore[override]
         seq = seq.items() if isinstance(seq, Mapping) else seq
         iseq = ((self.normkey(k), self.normvalue(v)) for k, v in seq)
         super().update(iseq)
@@ -145,9 +145,9 @@ class LocalCache(OrderedDict[_KT, _VT]):
     Older items expires first.
     """

-    def __init__(self, limit: Optional[int] = None):
+    def __init__(self, limit: int | None = None):
         super().__init__()
-        self.limit: Optional[int] = limit
+        self.limit: int | None = limit

     def __setitem__(self, key: _KT, value: _VT) -> None:
         if self.limit:
@@ -168,7 +168,7 @@ class LocalWeakReferencedCache(weakref.WeakKeyDictionary):
     it cannot be instantiated with an initial dictionary.
     """

-    def __init__(self, limit: Optional[int] = None):
+    def __init__(self, limit: int | None = None):
         super().__init__()
         self.data: LocalCache = LocalCache(limit=limit)

@@ -178,7 +178,7 @@ class LocalWeakReferencedCache(weakref.WeakKeyDictionary):
         except TypeError:
             pass  # key is not weak-referenceable, skip caching

-    def __getitem__(self, key: _KT) -> Optional[_VT]:  # type: ignore[override]
+    def __getitem__(self, key: _KT) -> _VT | None:  # type: ignore[override]
         try:
             return super().__getitem__(key)
         except (TypeError, KeyError):
@@ -11,7 +11,7 @@ from asyncio import Future
 from collections.abc import Awaitable, Coroutine, Iterable, Iterator
 from functools import wraps
 from types import CoroutineType
-from typing import TYPE_CHECKING, Any, Generic, Optional, TypeVar, Union, cast, overload
+from typing import TYPE_CHECKING, Any, Generic, TypeVar, Union, cast, overload

 from twisted.internet import defer
 from twisted.internet.defer import Deferred, DeferredList, ensureDeferred
@@ -93,7 +93,7 @@ def mustbe_deferred(


 def mustbe_deferred(
-    f: Callable[_P, Union[Deferred[_T], Coroutine[Deferred[Any], Any, _T], _T]],
+    f: Callable[_P, Deferred[_T] | Coroutine[Deferred[Any], Any, _T] | _T],
     *args: _P.args,
     **kw: _P.kwargs,
 ) -> Deferred[_T]:
@@ -179,17 +179,17 @@ class _AsyncCooperatorAdapter(Iterator, Generic[_T]):
     def __init__(
         self,
         aiterable: AsyncIterable[_T],
-        callable: Callable[Concatenate[_T, _P], Optional[Deferred[Any]]],
+        callable: Callable[Concatenate[_T, _P], Deferred[Any] | None],
         *callable_args: _P.args,
         **callable_kwargs: _P.kwargs,
     ):
         self.aiterator: AsyncIterator[_T] = aiterable.__aiter__()
-        self.callable: Callable[Concatenate[_T, _P], Optional[Deferred[Any]]] = callable
+        self.callable: Callable[Concatenate[_T, _P], Deferred[Any] | None] = callable
         self.callable_args: tuple[Any, ...] = callable_args
         self.callable_kwargs: dict[str, Any] = callable_kwargs
         self.finished: bool = False
         self.waiting_deferreds: list[Deferred[Any]] = []
-        self.anext_deferred: Optional[Deferred[_T]] = None
+        self.anext_deferred: Deferred[_T] | None = None

     def _callback(self, result: _T) -> None:
         # This gets called when the result from aiterator.__anext__() is available.
@@ -237,7 +237,7 @@ class _AsyncCooperatorAdapter(Iterator, Generic[_T]):
 def parallel_async(
     async_iterable: AsyncIterable[_T],
     count: int,
-    callable: Callable[Concatenate[_T, _P], Optional[Deferred[Any]]],
+    callable: Callable[Concatenate[_T, _P], Deferred[Any] | None],
     *args: _P.args,
     **named: _P.kwargs,
 ) -> Deferred[list[tuple[bool, Iterator[Deferred[Any]]]]]:
@@ -362,7 +362,7 @@ def deferred_from_coro(o: _CT) -> Deferred: ...
 def deferred_from_coro(o: _T) -> _T: ...


-def deferred_from_coro(o: _T) -> Union[Deferred, _T]:
+def deferred_from_coro(o: _T) -> Deferred | _T:
     """Converts a coroutine into a Deferred, or returns the object as is if it isn't a coroutine"""
     if isinstance(o, Deferred):
         return o
@@ -433,7 +433,7 @@ def deferred_to_future(d: Deferred[_T]) -> Future[_T]:
     return d.asFuture(_get_asyncio_event_loop())


-def maybe_deferred_to_future(d: Deferred[_T]) -> Union[Deferred[_T], Future[_T]]:
+def maybe_deferred_to_future(d: Deferred[_T]) -> Deferred[_T] | Future[_T]:
     """
     .. versionadded:: 2.6.0

@@ -1,8 +1,10 @@
 """Some helpers for deprecation messages"""

+from __future__ import annotations
+
 import inspect
 import warnings
-from typing import Any, Optional, overload
+from typing import Any, overload

 from scrapy.exceptions import ScrapyDeprecationWarning

@@ -20,11 +22,11 @@ def attribute(obj: Any, oldattr: str, newattr: str, version: str = "0.12") -> No
 def create_deprecated_class(
     name: str,
     new_class: type,
-    clsdict: Optional[dict[str, Any]] = None,
+    clsdict: dict[str, Any] | None = None,
     warn_category: type[Warning] = ScrapyDeprecationWarning,
     warn_once: bool = True,
-    old_class_path: Optional[str] = None,
-    new_class_path: Optional[str] = None,
+    old_class_path: str | None = None,
+    new_class_path: str | None = None,
     subclass_warn_message: str = "{cls} inherits from deprecated class {old}, please inherit from {new}.",
     instance_warn_message: str = "{cls} is deprecated, instantiate {new} instead.",
 ) -> type:
@@ -55,7 +57,7 @@ def create_deprecated_class(

     # https://github.com/python/mypy/issues/4177
     class DeprecatedClass(new_class.__class__):  # type: ignore[misc, name-defined]
-        deprecated_class: Optional[type] = None
+        deprecated_class: type | None = None
         warned_on_subclass: bool = False

         def __new__(
@@ -128,7 +130,7 @@ def create_deprecated_class(
     return deprecated_cls


-def _clspath(cls: type, forced: Optional[str] = None) -> str:
+def _clspath(cls: type, forced: str | None = None) -> str:
     if forced is not None:
         return forced
     return f"{cls.__module__}.{cls.__name__}"
@@ -2,7 +2,7 @@

 from __future__ import annotations

-from typing import TYPE_CHECKING, Union
+from typing import TYPE_CHECKING
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary

@@ -10,12 +10,12 @@ if TYPE_CHECKING:
     from scrapy.http import Request, Response


-_urlparse_cache: WeakKeyDictionary[Union[Request, Response], ParseResult] = (
+_urlparse_cache: WeakKeyDictionary[Request | Response, ParseResult] = (
     WeakKeyDictionary()
 )


-def urlparse_cached(request_or_response: Union[Request, Response]) -> ParseResult:
+def urlparse_cached(request_or_response: Request | Response) -> ParseResult:
     """Return urlparse.urlparse caching the result, where the argument can be a
     Request or Response object
     """
@@ -4,7 +4,7 @@ import csv
 import logging
 import re
 from io import StringIO
-from typing import TYPE_CHECKING, Any, Literal, Optional, Union, cast, overload
+from typing import TYPE_CHECKING, Any, Literal, cast, overload
 from warnings import warn

 from lxml import etree  # nosec
@@ -20,7 +20,7 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)


-def xmliter(obj: Union[Response, str, bytes], nodename: str) -> Iterator[Selector]:
+def xmliter(obj: Response | str | bytes, nodename: str) -> Iterator[Selector]:
     """Return a iterator of Selector's over all nodes of a XML document,
     given the name of the node to iterate. Useful for parsing XML feeds.

@@ -77,9 +77,9 @@ def xmliter(obj: Union[Response, str, bytes], nodename: str) -> Iterator[Selecto


 def xmliter_lxml(
-    obj: Union[Response, str, bytes],
+    obj: Response | str | bytes,
     nodename: str,
-    namespace: Optional[str] = None,
+    namespace: str | None = None,
     prefix: str = "x",
 ) -> Iterator[Selector]:
     reader = _StreamReader(obj)
@@ -120,9 +120,9 @@ def xmliter_lxml(


 class _StreamReader:
-    def __init__(self, obj: Union[Response, str, bytes]):
+    def __init__(self, obj: Response | str | bytes):
         self._ptr: int = 0
-        self._text: Union[str, bytes]
+        self._text: str | bytes
         if isinstance(obj, TextResponse):
             self._text, self.encoding = obj.body, obj.encoding
         elif isinstance(obj, Response):
@@ -154,11 +154,11 @@ class _StreamReader:


 def csviter(
-    obj: Union[Response, str, bytes],
-    delimiter: Optional[str] = None,
-    headers: Optional[list[str]] = None,
-    encoding: Optional[str] = None,
-    quotechar: Optional[str] = None,
+    obj: Response | str | bytes,
+    delimiter: str | None = None,
+    headers: list[str] | None = None,
+    encoding: str | None = None,
+    quotechar: str | None = None,
 ) -> Iterator[dict[str, str]]:
     """Returns an iterator of dictionaries from the given csv object

@@ -214,22 +214,18 @@ def csviter(


 @overload
-def _body_or_str(obj: Union[Response, str, bytes]) -> str: ...
+def _body_or_str(obj: Response | str | bytes) -> str: ...


 @overload
-def _body_or_str(obj: Union[Response, str, bytes], unicode: Literal[True]) -> str: ...
+def _body_or_str(obj: Response | str | bytes, unicode: Literal[True]) -> str: ...


 @overload
-def _body_or_str(
-    obj: Union[Response, str, bytes], unicode: Literal[False]
-) -> bytes: ...
+def _body_or_str(obj: Response | str | bytes, unicode: Literal[False]) -> bytes: ...


-def _body_or_str(
-    obj: Union[Response, str, bytes], unicode: bool = True
-) -> Union[str, bytes]:
+def _body_or_str(obj: Response | str | bytes, unicode: bool = True) -> str | bytes:
     expected_types = (Response, str, bytes)
     if not isinstance(obj, expected_types):
         expected_types_str = " or ".join(t.__name__ for t in expected_types)
@@ -1,14 +1,14 @@
 from __future__ import annotations

 from pathlib import Path
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING

 if TYPE_CHECKING:
     from scrapy.settings import BaseSettings


-def job_dir(settings: BaseSettings) -> Optional[str]:
-    path: Optional[str] = settings["JOBDIR"]
+def job_dir(settings: BaseSettings) -> str | None:
+    path: str | None = settings["JOBDIR"]
     if not path:
         return None
     if not Path(path).exists():
@@ -5,7 +5,7 @@ import sys
 from collections.abc import MutableMapping
 from logging.config import dictConfig
 from types import TracebackType
-from typing import TYPE_CHECKING, Any, Optional, Union, cast
+from typing import TYPE_CHECKING, Any, Optional, cast

 from twisted.python import log as twisted_log
 from twisted.python.failure import Failure
@@ -25,7 +25,7 @@ logger = logging.getLogger(__name__)

 def failure_to_exc_info(
     failure: Failure,
-) -> Optional[tuple[type[BaseException], BaseException, Optional[TracebackType]]]:
+) -> tuple[type[BaseException], BaseException, TracebackType | None] | None:
     """Extract exc_info from Failure instances"""
     if isinstance(failure, Failure):
         assert failure.type
@@ -50,7 +50,7 @@ class TopLevelFormatter(logging.Filter):
     ``loggers`` list where it should act.
     """

-    def __init__(self, loggers: Optional[list[str]] = None):
+    def __init__(self, loggers: list[str] | None = None):
         self.loggers: list[str] = loggers or []

     def filter(self, record: logging.LogRecord) -> bool:
@@ -80,7 +80,7 @@ DEFAULT_LOGGING = {


 def configure_logging(
-    settings: Union[Settings, dict[_SettingsKeyT, Any], None] = None,
+    settings: Settings | dict[_SettingsKeyT, Any] | None = None,
     install_root_handler: bool = True,
 ) -> None:
     """
@@ -125,7 +125,7 @@ def configure_logging(
         install_scrapy_root_handler(settings)


-_scrapy_root_handler: Optional[logging.Handler] = None
+_scrapy_root_handler: logging.Handler | None = None


 def install_scrapy_root_handler(settings: Settings) -> None:
@@ -141,7 +141,7 @@ def install_scrapy_root_handler(settings: Settings) -> None:
     logging.root.addHandler(_scrapy_root_handler)


-def get_scrapy_root_handler() -> Optional[logging.Handler]:
+def get_scrapy_root_handler() -> logging.Handler | None:
     return _scrapy_root_handler


@@ -231,7 +231,7 @@ class LogCounterHandler(logging.Handler):

 def logformatter_adapter(
     logkws: LogFormatterResult,
-) -> tuple[int, str, Union[dict[str, Any], tuple[Any, ...]]]:
+) -> tuple[int, str, dict[str, Any] | tuple[Any, ...]]:
     """
     Helper that takes the dictionary output from the methods in LogFormatter
     and adapts it into a tuple of positional arguments for logger.log calls,
@@ -14,7 +14,7 @@ from contextlib import contextmanager
 from functools import partial
 from importlib import import_module
 from pkgutil import iter_modules
-from typing import IO, TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
+from typing import IO, TYPE_CHECKING, Any, TypeVar, cast

 from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.item import Item
@@ -46,7 +46,7 @@ def arg_to_iter(arg: Any) -> Iterable[Any]:
     return [arg]


-def load_object(path: Union[str, Callable[..., Any]]) -> Any:
+def load_object(path: str | Callable[..., Any]) -> Any:
     """Load an object given its absolute object path, and return it.

     The object can be the import path of a class, function, variable or an
@@ -126,7 +126,7 @@ def md5sum(file: IO[bytes]) -> str:
     return m.hexdigest()


-def rel_has_nofollow(rel: Optional[str]) -> bool:
+def rel_has_nofollow(rel: str | None) -> bool:
     """Return True if link rel attribute has nofollow type"""
     return rel is not None and "nofollow" in rel.replace(",", " ").split()

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import signal
 from collections.abc import Callable
 from types import FrameType
@@ -1,8 +1,9 @@
+from __future__ import annotations
+
 import os
 import warnings
 from importlib import import_module
 from pathlib import Path
-from typing import Union

 from scrapy.exceptions import NotConfigured
 from scrapy.settings import Settings
@@ -45,7 +46,7 @@ def project_data_dir(project: str = "default") -> str:
     return str(d)


-def data_path(path: Union[str, os.PathLike[str]], createdir: bool = False) -> str:
+def data_path(path: str | os.PathLike[str], createdir: bool = False) -> str:
     """
     Return the given path joined with the .scrapy data directory.
     If given an absolute path, return it unmodified.
@@ -12,7 +12,7 @@ import weakref
 from collections.abc import AsyncIterable, Iterable, Mapping
 from functools import partial, wraps
 from itertools import chain
-from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, overload
+from typing import TYPE_CHECKING, Any, TypeVar, overload

 from scrapy.utils.asyncgen import as_async_generator

@@ -99,7 +99,7 @@ def unique(list_: Iterable[_T], key: Callable[[_T], Any] = lambda x: x) -> list[


 def to_unicode(
-    text: Union[str, bytes], encoding: Optional[str] = None, errors: str = "strict"
+    text: str | bytes, encoding: str | None = None, errors: str = "strict"
 ) -> str:
     """Return the unicode representation of a bytes object ``text``. If
     ``text`` is already an unicode object, return it as-is."""
@@ -116,7 +116,7 @@ def to_unicode(


 def to_bytes(
-    text: Union[str, bytes], encoding: Optional[str] = None, errors: str = "strict"
+    text: str | bytes, encoding: str | None = None, errors: str = "strict"
 ) -> bytes:
     """Return the binary representation of ``text``. If ``text``
     is already a bytes object, return it as-is."""
@@ -132,8 +132,8 @@ def to_bytes(


 def re_rsearch(
-    pattern: Union[str, Pattern[str]], text: str, chunk_size: int = 1024
-) -> Optional[tuple[int, int]]:
+    pattern: str | Pattern[str], text: str, chunk_size: int = 1024
+) -> tuple[int, int] | None:
     """
     This function does a reverse search in a text using a regular expression
     given in the attribute 'pattern'.
@@ -269,7 +269,7 @@ def get_spec(func: Callable[..., Any]) -> tuple[list[str], dict[str, Any]]:


 def equal_attributes(
-    obj1: Any, obj2: Any, attributes: Optional[list[Union[str, Callable[[Any], Any]]]]
+    obj1: Any, obj2: Any, attributes: list[str | Callable[[Any], Any]] | None
 ) -> bool:
     """Compare two objects attributes"""
     # not attributes given return False by default
@@ -297,8 +297,8 @@ def without_none_values(iterable: Iterable[_KT]) -> Iterable[_KT]: ...


 def without_none_values(
-    iterable: Union[Mapping[_KT, _VT], Iterable[_KT]]
-) -> Union[dict[_KT, _VT], Iterable[_KT]]:
+    iterable: Mapping[_KT, _VT] | Iterable[_KT]
+) -> dict[_KT, _VT] | Iterable[_KT]:
     """Return a copy of ``iterable`` with all ``None`` entries removed.

     If ``iterable`` is a mapping, return a dictionary where all pairs that have
@@ -354,7 +354,7 @@ class MutableChain(Iterable[_T]):


 async def _async_chain(
-    *iterables: Union[Iterable[_T], AsyncIterable[_T]]
+    *iterables: Iterable[_T] | AsyncIterable[_T],
 ) -> AsyncIterator[_T]:
     for it in iterables:
         async for o in as_async_generator(it):
@@ -366,10 +366,10 @@ class MutableAsyncChain(AsyncIterable[_T]):
     Similar to MutableChain but for async iterables
     """

-    def __init__(self, *args: Union[Iterable[_T], AsyncIterable[_T]]):
+    def __init__(self, *args: Iterable[_T] | AsyncIterable[_T]):
         self.data: AsyncIterator[_T] = _async_chain(*args)

-    def extend(self, *iterables: Union[Iterable[_T], AsyncIterable[_T]]) -> None:
+    def extend(self, *iterables: Iterable[_T] | AsyncIterable[_T]) -> None:
         self.data = _async_chain(self.data, _async_chain(*iterables))

     def __aiter__(self) -> AsyncIterator[_T]:
@@ -3,7 +3,7 @@ from __future__ import annotations
 import asyncio
 import sys
 from contextlib import suppress
-from typing import TYPE_CHECKING, Any, Generic, Optional, TypeVar
+from typing import TYPE_CHECKING, Any, Generic, TypeVar
 from warnings import catch_warnings, filterwarnings, warn

 from twisted.internet import asyncioreactor, error
@@ -54,7 +54,7 @@ class CallLaterOnce(Generic[_T]):
         self._func: Callable[_P, _T] = func
         self._a: tuple[Any, ...] = a
         self._kw: dict[str, Any] = kw
-        self._call: Optional[DelayedCall] = None
+        self._call: DelayedCall | None = None

     def schedule(self, delay: float = 0) -> None:
         from twisted.internet import reactor
@@ -107,7 +107,7 @@ def _get_asyncio_event_loop_policy() -> AbstractEventLoopPolicy:
     return policy


-def install_reactor(reactor_path: str, event_loop_path: Optional[str] = None) -> None:
+def install_reactor(reactor_path: str, event_loop_path: str | None = None) -> None:
     """Installs the :mod:`~twisted.internet.reactor` with the specified
     import path. Also installs the asyncio event loop with the specified import
     path if the asyncio reactor is enabled"""
@@ -129,7 +129,7 @@ def _get_asyncio_event_loop() -> AbstractEventLoop:
     return set_asyncio_event_loop(None)


-def set_asyncio_event_loop(event_loop_path: Optional[str]) -> AbstractEventLoop:
+def set_asyncio_event_loop(event_loop_path: str | None) -> AbstractEventLoop:
     """Sets and returns the event loop with specified import path."""
     if event_loop_path is not None:
         event_loop_class: type[AbstractEventLoop] = load_object(event_loop_path)
@@ -8,7 +8,7 @@ from __future__ import annotations
 import hashlib
 import json
 import warnings
-from typing import TYPE_CHECKING, Any, Optional, Protocol, Union
+from typing import TYPE_CHECKING, Any, Protocol
 from urllib.parse import urlunparse
 from weakref import WeakKeyDictionary

@@ -38,7 +38,7 @@ def _serialize_headers(headers: Iterable[bytes], request: Request) -> Iterable[b


 _fingerprint_cache: WeakKeyDictionary[
-    Request, dict[tuple[Optional[tuple[bytes, ...]], bool], bytes]
+    Request, dict[tuple[tuple[bytes, ...] | None, bool], bytes]
 ]
 _fingerprint_cache = WeakKeyDictionary()

@@ -46,7 +46,7 @@ _fingerprint_cache = WeakKeyDictionary()
 def fingerprint(
     request: Request,
     *,
-    include_headers: Optional[Iterable[Union[bytes, str]]] = None,
+    include_headers: Iterable[bytes | str] | None = None,
     keep_fragments: bool = False,
 ) -> bytes:
     """
@@ -79,7 +79,7 @@ def fingerprint(
     If you want to include them, set the keep_fragments argument to True
     (for instance when handling requests with a headless browser).
     """
-    processed_include_headers: Optional[tuple[bytes, ...]] = None
+    processed_include_headers: tuple[bytes, ...] | None = None
     if include_headers:
         processed_include_headers = tuple(
             to_bytes(h.lower()) for h in sorted(include_headers)
@@ -129,7 +129,7 @@ class RequestFingerprinter:
     def from_crawler(cls, crawler: Crawler) -> Self:
         return cls(crawler)

-    def __init__(self, crawler: Optional[Crawler] = None):
+    def __init__(self, crawler: Crawler | None = None):
         if crawler:
             implementation = crawler.settings.get(
                 "REQUEST_FINGERPRINTER_IMPLEMENTATION"
@@ -177,7 +177,7 @@ def request_httprepr(request: Request) -> bytes:
     return s


-def referer_str(request: Request) -> Optional[str]:
+def referer_str(request: Request) -> str | None:
     """Return Referer HTTP header suitable for logging."""
     referrer = request.headers.get("Referer")
     if referrer is None:
@@ -185,7 +185,7 @@ def referer_str(request: Request) -> Optional[str]:
     return to_unicode(referrer, errors="replace")


-def request_from_dict(d: dict[str, Any], *, spider: Optional[Spider] = None) -> Request:
+def request_from_dict(d: dict[str, Any], *, spider: Spider | None = None) -> Request:
     """Create a :class:`~scrapy.Request` object from a dict.

     If a spider is given, it will try to resolve the callbacks looking at the
@@ -9,7 +9,7 @@ import os
 import re
 import tempfile
 import webbrowser
-from typing import TYPE_CHECKING, Any, Union
+from typing import TYPE_CHECKING, Any
 from weakref import WeakKeyDictionary

 from twisted.web import http
@@ -35,15 +35,15 @@ def get_base_url(response: TextResponse) -> str:
     return _baseurl_cache[response]


-_metaref_cache: WeakKeyDictionary[
-    Response, Union[tuple[None, None], tuple[float, str]]
-] = WeakKeyDictionary()
+_metaref_cache: WeakKeyDictionary[Response, tuple[None, None] | tuple[float, str]] = (
+    WeakKeyDictionary()
+)


 def get_meta_refresh(
     response: TextResponse,
     ignore_tags: Iterable[str] = ("script", "noscript"),
-) -> Union[tuple[None, None], tuple[float, str]]:
+) -> tuple[None, None] | tuple[float, str]:
     """Parse the http-equiv refresh parameter from the given response"""
     if response not in _metaref_cache:
         text = response.text[0:4096]
@@ -53,7 +53,7 @@ def get_meta_refresh(
     return _metaref_cache[response]


-def response_status_message(status: Union[bytes, float, int, str]) -> str:
+def response_status_message(status: bytes | float | int | str) -> str:
     """Return status code plus status text descriptive message"""
     status_int = int(status)
     message = http.RESPONSES.get(status_int, "Unknown Status")
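Taken together, the hunks above apply one mechanical rewrite: the diff adds "from __future__ import annotations" to modules that were missing it, and with that import in place pyupgrade's --py39-plus mode, once --keep-runtime-typing is dropped, rewrites Optional[X] as X | None and Union[X, Y] as X | Y inside annotations. A minimal sketch of the before/after pattern, using a hypothetical helper that is not part of the Scrapy diff:

    # Hypothetical example; this function is illustrative only and not taken from Scrapy.
    from __future__ import annotations  # annotations are not evaluated at runtime (PEP 563)

    # Old style, as seen on the removed lines:
    #     from typing import Optional, Union
    #     def first_header(headers: dict[str, Union[str, bytes]], name: str,
    #                      default: Optional[str] = None) -> Optional[str]: ...

    # New style, as seen on the added lines:
    def first_header(
        headers: dict[str, str | bytes], name: str, default: str | None = None
    ) -> str | None:
        """Return the named header decoded to str, or ``default`` if missing."""
        value = headers.get(name, default)
        if isinstance(value, bytes):
            return value.decode("utf-8", errors="replace")
        return value

Without the future import, X | None in an annotation is only valid at runtime on Python 3.10+; the --keep-runtime-typing flag exists to preserve the older spellings for code that needs runtime-evaluated annotations, which is roughly why removing it lets pyupgrade apply these rewrites here.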
Some files were not shown because too many files have changed in this diff.