mirror of https://github.com/scrapy/scrapy.git synced 2025-02-06 09:07:32 +00:00

Drop Python 3.8 Support (#6472)

Vsevolod Breus, 2024-10-16 08:03:16 +00:00 (committed by GitHub)
parent 9736e49b52
commit 5391663072
GPG Key ID: B5690EEEBB952194
153 changed files with 1011 additions and 1307 deletions
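
Most of the non-CI changes in this diff follow one mechanical pattern that the new 3.9 floor allows: container annotations move from the `typing` aliases to built-in generics (PEP 585), and names from `collections.abc` (`Callable`, `Iterable`, `Generator`, ...) move under `if TYPE_CHECKING:` since they are only needed for annotations. A minimal, hypothetical sketch of the resulting style — the module and function names below are illustrative, not taken from the diff:

# "After" style used throughout this commit: postponed annotation
# evaluation, built-in generics (PEP 585), and typing-only imports kept
# under TYPE_CHECKING so they carry no runtime cost.
from __future__ import annotations

from typing import TYPE_CHECKING, Any, Optional

if TYPE_CHECKING:
    # Only needed by type checkers; never imported at runtime.
    from collections.abc import Iterable


def build_index(rows: Iterable[tuple[str, Any]]) -> dict[str, list[Any]]:
    """Group values by key (illustrative; replaces Dict/List/Tuple spellings)."""
    index: dict[str, list[Any]] = {}
    for key, value in rows:
        index.setdefault(key, []).append(value)
    return index


def first_key(index: dict[str, list[Any]]) -> Optional[str]:
    # Optional[...] spellings are left untouched in the commit; only the
    # container aliases (List, Dict, Tuple, Set, Deque, Type) are replaced.
    return next(iter(index), None)

The file-by-file diffs below are essentially repeated applications of this transformation, plus the CI, documentation, and pre-commit updates.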

View File

@ -15,10 +15,10 @@ jobs:
- python-version: "3.12"
env:
TOXENV: pylint
- python-version: 3.8
- python-version: "3.9"
env:
TOXENV: typing
- python-version: 3.8
- python-version: "3.9"
env:
TOXENV: typing-tests
- python-version: "3.12" # Keep in sync with .readthedocs.yml

View File

@ -11,7 +11,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
python-version: ["3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v4

View File

@ -12,7 +12,7 @@ jobs:
fail-fast: false
matrix:
include:
- python-version: 3.9
- python-version: "3.9"
env:
TOXENV: py
- python-version: "3.10"
@ -35,19 +35,19 @@ jobs:
TOXENV: pypy3
# pinned deps
- python-version: 3.8.17
- python-version: 3.9.19
env:
TOXENV: pinned
- python-version: 3.8.17
- python-version: 3.9.19
env:
TOXENV: asyncio-pinned
- python-version: pypy3.8
- python-version: pypy3.9
env:
TOXENV: pypy3-pinned
- python-version: 3.8.17
- python-version: 3.9.19
env:
TOXENV: extra-deps-pinned
- python-version: 3.8.17
- python-version: 3.9.19
env:
TOXENV: botocore-pinned

View File

@ -12,12 +12,9 @@ jobs:
fail-fast: false
matrix:
include:
- python-version: 3.8
- python-version: "3.9"
env:
TOXENV: windows-pinned
- python-version: 3.9
env:
TOXENV: py
- python-version: "3.10"
env:
TOXENV: py

View File

@ -33,4 +33,4 @@ repos:
rev: v3.16.0
hooks:
- id: pyupgrade
args: [--py38-plus, --keep-runtime-typing]
args: [--py39-plus, --keep-runtime-typing]
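
Bumping pyupgrade from `--py38-plus` to `--py39-plus` tracks the new floor: PEP 585 made the built-in containers subscriptable at runtime in Python 3.9, so annotations like the ones below no longer depend on `typing` aliases or postponed evaluation. A small, self-contained illustration (the `tally` function is hypothetical, not from Scrapy):

def tally(words: list[str]) -> dict[str, int]:
    # On Python 3.8 these parameter/return annotations are evaluated at
    # function-definition time and raise TypeError ("'type' object is not
    # subscriptable"); on 3.9+ they evaluate to types.GenericAlias objects.
    counts: dict[str, int] = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    return counts


if __name__ == "__main__":
    print(tally(["a", "b", "a"]))  # {'a': 2, 'b': 1}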

View File

@ -59,7 +59,7 @@ including a list of features.
Requirements
============
* Python 3.8+
* Python 3.9+
* Works on Linux, Windows, macOS, BSD
Install

View File

@ -9,7 +9,7 @@ Installation guide
Supported Python versions
=========================
Scrapy requires Python 3.8+, either the CPython implementation (default) or
Scrapy requires Python 3.9+, either the CPython implementation (default) or
the PyPy implementation (see :ref:`python:implementations`).
.. _intro-install-scrapy:

View File

@ -1,7 +1,7 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Any, List
from typing import TYPE_CHECKING, Any
from scrapy.exceptions import NotConfigured
from scrapy.utils.conf import build_component_list
@ -20,7 +20,7 @@ class AddonManager:
def __init__(self, crawler: Crawler) -> None:
self.crawler: Crawler = crawler
self.addons: List[Any] = []
self.addons: list[Any] = []
def load_settings(self, settings: Settings) -> None:
"""Load add-ons and configurations from a settings object and apply them.

View File

@ -6,7 +6,7 @@ import inspect
import os
import sys
from importlib.metadata import entry_points
from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Optional, Tuple, Type
from typing import TYPE_CHECKING, Optional
import scrapy
from scrapy.commands import BaseRunSpiderCommand, ScrapyCommand, ScrapyHelpFormatter
@ -17,6 +17,8 @@ from scrapy.utils.project import get_project_settings, inside_project
from scrapy.utils.python import garbage_collect
if TYPE_CHECKING:
from collections.abc import Callable, Iterable
# typing.ParamSpec requires Python 3.10
from typing_extensions import ParamSpec
@ -28,7 +30,7 @@ if TYPE_CHECKING:
class ScrapyArgumentParser(argparse.ArgumentParser):
def _parse_optional(
self, arg_string: str
) -> Optional[Tuple[Optional[argparse.Action], str, Optional[str]]]:
) -> Optional[tuple[Optional[argparse.Action], str, Optional[str]]]:
# if it starts with "-:", it is a parameter, not an argument
if arg_string[:2] == "-:":
return None
@ -36,7 +38,7 @@ class ScrapyArgumentParser(argparse.ArgumentParser):
return super()._parse_optional(arg_string)
def _iter_command_classes(module_name: str) -> Iterable[Type[ScrapyCommand]]:
def _iter_command_classes(module_name: str) -> Iterable[type[ScrapyCommand]]:
# TODO: add `name` attribute to commands and merge this function with
# scrapy.utils.spider.iter_spider_classes
for module in walk_modules(module_name):
@ -50,8 +52,8 @@ def _iter_command_classes(module_name: str) -> Iterable[Type[ScrapyCommand]]:
yield obj
def _get_commands_from_module(module: str, inproject: bool) -> Dict[str, ScrapyCommand]:
d: Dict[str, ScrapyCommand] = {}
def _get_commands_from_module(module: str, inproject: bool) -> dict[str, ScrapyCommand]:
d: dict[str, ScrapyCommand] = {}
for cmd in _iter_command_classes(module):
if inproject or not cmd.requires_project:
cmdname = cmd.__module__.split(".")[-1]
@ -61,8 +63,8 @@ def _get_commands_from_module(module: str, inproject: bool) -> Dict[str, ScrapyC
def _get_commands_from_entry_points(
inproject: bool, group: str = "scrapy.commands"
) -> Dict[str, ScrapyCommand]:
cmds: Dict[str, ScrapyCommand] = {}
) -> dict[str, ScrapyCommand]:
cmds: dict[str, ScrapyCommand] = {}
if sys.version_info >= (3, 10):
eps = entry_points(group=group)
else:
@ -78,7 +80,7 @@ def _get_commands_from_entry_points(
def _get_commands_dict(
settings: BaseSettings, inproject: bool
) -> Dict[str, ScrapyCommand]:
) -> dict[str, ScrapyCommand]:
cmds = _get_commands_from_module("scrapy.commands", inproject)
cmds.update(_get_commands_from_entry_points(inproject))
cmds_module = settings["COMMANDS_MODULE"]
@ -87,7 +89,7 @@ def _get_commands_dict(
return cmds
def _pop_command_name(argv: List[str]) -> Optional[str]:
def _pop_command_name(argv: list[str]) -> Optional[str]:
i = 0
for arg in argv[1:]:
if not arg.startswith("-"):
@ -146,7 +148,7 @@ def _run_print_help(
def execute(
argv: Optional[List[str]] = None, settings: Optional[Settings] = None
argv: Optional[list[str]] = None, settings: Optional[Settings] = None
) -> None:
if argv is None:
argv = sys.argv
@ -189,7 +191,7 @@ def execute(
sys.exit(cmd.exitcode)
def _run_command(cmd: ScrapyCommand, args: List[str], opts: argparse.Namespace) -> None:
def _run_command(cmd: ScrapyCommand, args: list[str], opts: argparse.Namespace) -> None:
if opts.profile:
_run_command_profiled(cmd, args, opts)
else:
@ -197,7 +199,7 @@ def _run_command(cmd: ScrapyCommand, args: List[str], opts: argparse.Namespace)
def _run_command_profiled(
cmd: ScrapyCommand, args: List[str], opts: argparse.Namespace
cmd: ScrapyCommand, args: list[str], opts: argparse.Namespace
) -> None:
if opts.profile:
sys.stderr.write(f"scrapy: writing cProfile stats to {opts.profile!r}\n")

View File

@ -8,7 +8,7 @@ import argparse
import builtins
import os
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional
from typing import TYPE_CHECKING, Any, Optional
from twisted.python import failure
@ -16,6 +16,8 @@ from scrapy.exceptions import UsageError
from scrapy.utils.conf import arglist_to_dict, feed_process_params_from_cli
if TYPE_CHECKING:
from collections.abc import Iterable
from scrapy.crawler import Crawler, CrawlerProcess
@ -24,7 +26,7 @@ class ScrapyCommand:
crawler_process: Optional[CrawlerProcess] = None
# default settings to be used for this command instead of global defaults
default_settings: Dict[str, Any] = {}
default_settings: dict[str, Any] = {}
exitcode: int = 0
@ -97,7 +99,7 @@ class ScrapyCommand:
)
group.add_argument("--pdb", action="store_true", help="enable pdb on failure")
def process_options(self, args: List[str], opts: argparse.Namespace) -> None:
def process_options(self, args: list[str], opts: argparse.Namespace) -> None:
try:
self.settings.setdict(arglist_to_dict(opts.set), priority="cmdline")
except ValueError:
@ -122,7 +124,7 @@ class ScrapyCommand:
if opts.pdb:
failure.startDebugMode()
def run(self, args: List[str], opts: argparse.Namespace) -> None:
def run(self, args: list[str], opts: argparse.Namespace) -> None:
"""
Entry point for running commands
"""
@ -167,7 +169,7 @@ class BaseRunSpiderCommand(ScrapyCommand):
help="format to use for dumping items",
)
def process_options(self, args: List[str], opts: argparse.Namespace) -> None:
def process_options(self, args: list[str], opts: argparse.Namespace) -> None:
super().process_options(args, opts)
try:
opts.spargs = arglist_to_dict(opts.spargs)
@ -207,7 +209,7 @@ class ScrapyHelpFormatter(argparse.HelpFormatter):
parts = self.format_part_strings(builtins.list(part_strings))
return super()._join_parts(parts)
def format_part_strings(self, part_strings: List[str]) -> List[str]:
def format_part_strings(self, part_strings: list[str]) -> list[str]:
"""
Underline and title case command line help message headers.
"""

View File

@ -4,7 +4,7 @@ import argparse
import subprocess # nosec
import sys
import time
from typing import TYPE_CHECKING, Any, Iterable, List
from typing import TYPE_CHECKING, Any
from urllib.parse import urlencode
import scrapy
@ -13,6 +13,8 @@ from scrapy.http import Response, TextResponse
from scrapy.linkextractors import LinkExtractor
if TYPE_CHECKING:
from collections.abc import Iterable
from scrapy import Request
@ -26,7 +28,7 @@ class Command(ScrapyCommand):
def short_desc(self) -> str:
return "Run quick benchmark test"
def run(self, args: List[str], opts: argparse.Namespace) -> None:
def run(self, args: list[str], opts: argparse.Namespace) -> None:
with _BenchServer():
assert self.crawler_process
self.crawler_process.crawl(_BenchSpider, total=100000)

View File

@ -1,7 +1,6 @@
import argparse
import time
from collections import defaultdict
from typing import List
from unittest import TextTestResult as _TextTestResult
from unittest import TextTestRunner
@ -69,7 +68,7 @@ class Command(ScrapyCommand):
help="print contract tests for all spiders",
)
def run(self, args: List[str], opts: argparse.Namespace) -> None:
def run(self, args: list[str], opts: argparse.Namespace) -> None:
# load contracts
contracts = build_component_list(self.settings.getwithbase("SPIDER_CONTRACTS"))
conman = ContractsManager(load_object(c) for c in contracts)

View File

@ -1,6 +1,6 @@
from __future__ import annotations
from typing import TYPE_CHECKING, List, cast
from typing import TYPE_CHECKING, cast
from twisted.python.failure import Failure
@ -20,7 +20,7 @@ class Command(BaseRunSpiderCommand):
def short_desc(self) -> str:
return "Run a spider"
def run(self, args: List[str], opts: argparse.Namespace) -> None:
def run(self, args: list[str], opts: argparse.Namespace) -> None:
if len(args) < 1:
raise UsageError()
elif len(args) > 1:

View File

@ -1,7 +1,6 @@
import argparse
import os
import sys
from typing import List
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
@ -27,7 +26,7 @@ class Command(ScrapyCommand):
sys.stderr.write(msg + os.linesep)
self.exitcode = 1
def run(self, args: List[str], opts: argparse.Namespace) -> None:
def run(self, args: list[str], opts: argparse.Namespace) -> None:
if len(args) != 1:
raise UsageError()

View File

@ -1,7 +1,7 @@
from __future__ import annotations
import sys
from typing import TYPE_CHECKING, Dict, List, Type
from typing import TYPE_CHECKING
from w3lib.url import is_url
@ -48,7 +48,7 @@ class Command(ScrapyCommand):
help="do not handle HTTP 3xx status codes and print response as-is",
)
def _print_headers(self, headers: Dict[bytes, List[bytes]], prefix: bytes) -> None:
def _print_headers(self, headers: dict[bytes, list[bytes]], prefix: bytes) -> None:
for key, values in headers.items():
for value in values:
self._print_bytes(prefix + b" " + key + b": " + value)
@ -65,7 +65,7 @@ class Command(ScrapyCommand):
def _print_bytes(self, bytes_: bytes) -> None:
sys.stdout.buffer.write(bytes_ + b"\n")
def run(self, args: List[str], opts: Namespace) -> None:
def run(self, args: list[str], opts: Namespace) -> None:
if len(args) != 1 or not is_url(args[0]):
raise UsageError()
request = Request(
@ -81,7 +81,7 @@ class Command(ScrapyCommand):
else:
request.meta["handle_httpstatus_all"] = True
spidercls: Type[Spider] = DefaultSpider
spidercls: type[Spider] = DefaultSpider
assert self.crawler_process
spider_loader = self.crawler_process.spider_loader
if opts.spider:

View File

@ -4,7 +4,7 @@ import shutil
import string
from importlib import import_module
from pathlib import Path
from typing import List, Optional, Union, cast
from typing import Optional, Union, cast
from urllib.parse import urlparse
import scrapy
@ -87,7 +87,7 @@ class Command(ScrapyCommand):
help="If the spider already exists, overwrite it with the template",
)
def run(self, args: List[str], opts: argparse.Namespace) -> None:
def run(self, args: list[str], opts: argparse.Namespace) -> None:
if opts.list:
self._list_templates()
return

View File

@ -1,6 +1,6 @@
from __future__ import annotations
from typing import TYPE_CHECKING, List
from typing import TYPE_CHECKING
from scrapy.commands import ScrapyCommand
@ -15,7 +15,7 @@ class Command(ScrapyCommand):
def short_desc(self) -> str:
return "List available spiders"
def run(self, args: List[str], opts: argparse.Namespace) -> None:
def run(self, args: list[str], opts: argparse.Namespace) -> None:
assert self.crawler_process
for s in sorted(self.crawler_process.spider_loader.list()):
print(s)

View File

@ -5,20 +5,7 @@ import functools
import inspect
import json
import logging
from typing import (
TYPE_CHECKING,
Any,
AsyncGenerator,
Coroutine,
Dict,
Iterable,
List,
Optional,
Tuple,
TypeVar,
Union,
overload,
)
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, overload
from itemadapter import ItemAdapter, is_item
from twisted.internet.defer import Deferred, maybeDeferred
@ -35,6 +22,8 @@ from scrapy.utils.misc import arg_to_iter
from scrapy.utils.spider import spidercls_for_request
if TYPE_CHECKING:
from collections.abc import AsyncGenerator, Coroutine, Iterable
from twisted.python.failure import Failure
from scrapy.http.request import CallbackT
@ -50,8 +39,8 @@ class Command(BaseRunSpiderCommand):
requires_project = True
spider = None
items: Dict[int, List[Any]] = {}
requests: Dict[int, List[Request]] = {}
items: dict[int, list[Any]] = {}
requests: dict[int, list[Request]] = {}
first_response = None
@ -166,11 +155,11 @@ class Command(BaseRunSpiderCommand):
return d
return arg_to_iter(deferred_from_coro(result))
def add_items(self, lvl: int, new_items: List[Any]) -> None:
def add_items(self, lvl: int, new_items: list[Any]) -> None:
old_items = self.items.get(lvl, [])
self.items[lvl] = old_items + new_items
def add_requests(self, lvl: int, new_reqs: List[Request]) -> None:
def add_requests(self, lvl: int, new_reqs: list[Request]) -> None:
old_reqs = self.requests.get(lvl, [])
self.requests[lvl] = old_reqs + new_reqs
@ -219,7 +208,7 @@ class Command(BaseRunSpiderCommand):
depth: int,
spider: Spider,
callback: CallbackT,
) -> Tuple[List[Any], List[Request], argparse.Namespace, int, Spider, CallbackT]:
) -> tuple[list[Any], list[Request], argparse.Namespace, int, Spider, CallbackT]:
items, requests = [], []
for x in spider_output:
if is_item(x):
@ -232,7 +221,7 @@ class Command(BaseRunSpiderCommand):
self,
response: Response,
callback: CallbackT,
cb_kwargs: Optional[Dict[str, Any]] = None,
cb_kwargs: Optional[dict[str, Any]] = None,
) -> Deferred[Any]:
cb_kwargs = cb_kwargs or {}
d = maybeDeferred(self.iterate_spider_output, callback(response, **cb_kwargs))
@ -285,10 +274,10 @@ class Command(BaseRunSpiderCommand):
def scraped_data(
self,
args: Tuple[
List[Any], List[Request], argparse.Namespace, int, Spider, CallbackT
args: tuple[
list[Any], list[Request], argparse.Namespace, int, Spider, CallbackT
],
) -> List[Any]:
) -> list[Any]:
items, requests, opts, depth, spider, callback = args
if opts.pipelines:
itemproc = self.pcrawler.engine.scraper.itemproc
@ -345,7 +334,7 @@ class Command(BaseRunSpiderCommand):
def prepare_request(
self, spider: Spider, request: Request, opts: argparse.Namespace
) -> Request:
def callback(response: Response, **cb_kwargs: Any) -> Deferred[List[Any]]:
def callback(response: Response, **cb_kwargs: Any) -> Deferred[list[Any]]:
# memorize first request
if not self.first_response:
self.first_response = response
@ -376,7 +365,7 @@ class Command(BaseRunSpiderCommand):
request.callback = callback
return request
def process_options(self, args: List[str], opts: argparse.Namespace) -> None:
def process_options(self, args: list[str], opts: argparse.Namespace) -> None:
super().process_options(args, opts)
self.process_request_meta(opts)
@ -404,7 +393,7 @@ class Command(BaseRunSpiderCommand):
print_help=False,
)
def run(self, args: List[str], opts: argparse.Namespace) -> None:
def run(self, args: list[str], opts: argparse.Namespace) -> None:
# parse arguments
if not len(args) == 1 or not is_url(args[0]):
raise UsageError()

View File

@ -4,7 +4,7 @@ import argparse
import sys
from importlib import import_module
from pathlib import Path
from typing import TYPE_CHECKING, List, Union
from typing import TYPE_CHECKING, Union
from scrapy.commands import BaseRunSpiderCommand
from scrapy.exceptions import UsageError
@ -41,7 +41,7 @@ class Command(BaseRunSpiderCommand):
def long_desc(self) -> str:
return "Run the spider defined in the given file"
def run(self, args: List[str], opts: argparse.Namespace) -> None:
def run(self, args: list[str], opts: argparse.Namespace) -> None:
if len(args) != 1:
raise UsageError()
filename = Path(args[0])

View File

@ -1,6 +1,5 @@
import argparse
import json
from typing import List
from scrapy.commands import ScrapyCommand
from scrapy.settings import BaseSettings
@ -46,7 +45,7 @@ class Command(ScrapyCommand):
help="print setting value, interpreted as a list",
)
def run(self, args: List[str], opts: argparse.Namespace) -> None:
def run(self, args: list[str], opts: argparse.Namespace) -> None:
assert self.crawler_process
settings = self.crawler_process.settings
if opts.get:

View File

@ -7,7 +7,7 @@ See documentation in docs/topics/shell.rst
from __future__ import annotations
from threading import Thread
from typing import TYPE_CHECKING, Any, Dict, List, Type
from typing import TYPE_CHECKING, Any
from scrapy import Spider
from scrapy.commands import ScrapyCommand
@ -56,13 +56,13 @@ class Command(ScrapyCommand):
help="do not handle HTTP 3xx status codes and print response as-is",
)
def update_vars(self, vars: Dict[str, Any]) -> None:
def update_vars(self, vars: dict[str, Any]) -> None:
"""You can use this function to update the Scrapy objects that will be
available in the shell
"""
pass
def run(self, args: List[str], opts: Namespace) -> None:
def run(self, args: list[str], opts: Namespace) -> None:
url = args[0] if args else None
if url:
# first argument may be a local file
@ -71,7 +71,7 @@ class Command(ScrapyCommand):
assert self.crawler_process
spider_loader = self.crawler_process.spider_loader
spidercls: Type[Spider] = DefaultSpider
spidercls: type[Spider] = DefaultSpider
if opts.spider:
spidercls = spider_loader.load(opts.spider)
elif url:

View File

@ -6,14 +6,14 @@ from importlib.util import find_spec
from pathlib import Path
from shutil import copy2, copystat, ignore_patterns, move
from stat import S_IWUSR as OWNER_WRITE_PERMISSION
from typing import List, Tuple, Union
from typing import Union
import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
from scrapy.utils.template import render_templatefile, string_camelcase
TEMPLATES_TO_RENDER: Tuple[Tuple[str, ...], ...] = (
TEMPLATES_TO_RENDER: tuple[tuple[str, ...], ...] = (
("scrapy.cfg",),
("${project_name}", "settings.py.tmpl"),
("${project_name}", "items.py.tmpl"),
@ -86,7 +86,7 @@ class Command(ScrapyCommand):
copystat(src, dst)
_make_writable(dst)
def run(self, args: List[str], opts: argparse.Namespace) -> None:
def run(self, args: list[str], opts: argparse.Namespace) -> None:
if len(args) not in (1, 2):
raise UsageError()
@ -107,9 +107,7 @@ class Command(ScrapyCommand):
return
self._copytree(Path(self.templates_dir), project_dir.resolve())
# On 3.8 shutil.move doesn't fully support Path args, but it supports our use case
# See https://bugs.python.org/issue32689
move(project_dir / "module", project_dir / project_name) # type: ignore[arg-type]
move(project_dir / "module", project_dir / project_name)
for paths in TEMPLATES_TO_RENDER:
tplfile = Path(
project_dir,
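
The two comments and the `type: ignore[arg-type]` removed above were a Python 3.8 workaround: per the linked bpo-32689, `shutil.move` only fully supports `pathlib.Path` arguments from Python 3.9 on, so with the new floor the call can take Path objects directly. A throwaway sketch under that assumption (temporary directories, not the real project-template layout):

import shutil
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    src = root / "module"
    src.mkdir()
    (src / "settings.py").write_text("# placeholder\n")

    # Path arguments for both src and dst work without casts or ignores
    # on Python 3.9+.
    shutil.move(src, root / "projectname")

    assert (root / "projectname" / "settings.py").exists()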

View File

@ -1,5 +1,4 @@
import argparse
from typing import List
import scrapy
from scrapy.commands import ScrapyCommand
@ -25,7 +24,7 @@ class Command(ScrapyCommand):
help="also display twisted/python/platform info (useful for bug reports)",
)
def run(self, args: List[str], opts: argparse.Namespace) -> None:
def run(self, args: list[str], opts: argparse.Namespace) -> None:
if opts.verbose:
versions = scrapy_components_versions()
width = max(len(n) for (n, _) in versions)

View File

@ -2,22 +2,11 @@ from __future__ import annotations
import re
import sys
from collections.abc import AsyncGenerator, Iterable
from functools import wraps
from inspect import getmembers
from types import CoroutineType
from typing import (
TYPE_CHECKING,
Any,
AsyncGenerator,
Callable,
Dict,
Iterable,
List,
Optional,
Tuple,
Type,
cast,
)
from typing import TYPE_CHECKING, Any, Optional, cast
from unittest import TestCase, TestResult
from scrapy.http import Request, Response
@ -25,6 +14,8 @@ from scrapy.utils.python import get_spec
from scrapy.utils.spider import iterate_spider_output
if TYPE_CHECKING:
from collections.abc import Callable
from twisted.python.failure import Failure
from scrapy import Spider
@ -33,13 +24,13 @@ if TYPE_CHECKING:
class Contract:
"""Abstract class for contracts"""
request_cls: Optional[Type[Request]] = None
request_cls: Optional[type[Request]] = None
name: str
def __init__(self, method: Callable, *args: Any):
self.testcase_pre = _create_testcase(method, f"@{self.name} pre-hook")
self.testcase_post = _create_testcase(method, f"@{self.name} post-hook")
self.args: Tuple[Any, ...] = args
self.args: tuple[Any, ...] = args
def add_pre_hook(self, request: Request, results: TestResult) -> Request:
if hasattr(self, "pre_process"):
@ -47,7 +38,7 @@ class Contract:
assert cb is not None
@wraps(cb)
def wrapper(response: Response, **cb_kwargs: Any) -> List[Any]:
def wrapper(response: Response, **cb_kwargs: Any) -> list[Any]:
try:
results.startTest(self.testcase_pre)
self.pre_process(response)
@ -76,7 +67,7 @@ class Contract:
assert cb is not None
@wraps(cb)
def wrapper(response: Response, **cb_kwargs: Any) -> List[Any]:
def wrapper(response: Response, **cb_kwargs: Any) -> list[Any]:
cb_result = cb(response, **cb_kwargs)
if isinstance(cb_result, (AsyncGenerator, CoroutineType)):
raise TypeError("Contracts don't support async callbacks")
@ -98,18 +89,18 @@ class Contract:
return request
def adjust_request_args(self, args: Dict[str, Any]) -> Dict[str, Any]:
def adjust_request_args(self, args: dict[str, Any]) -> dict[str, Any]:
return args
class ContractsManager:
contracts: Dict[str, Type[Contract]] = {}
contracts: dict[str, type[Contract]] = {}
def __init__(self, contracts: Iterable[Type[Contract]]):
def __init__(self, contracts: Iterable[type[Contract]]):
for contract in contracts:
self.contracts[contract.name] = contract
def tested_methods_from_spidercls(self, spidercls: Type[Spider]) -> List[str]:
def tested_methods_from_spidercls(self, spidercls: type[Spider]) -> list[str]:
is_method = re.compile(r"^\s*@", re.MULTILINE).search
methods = []
for key, value in getmembers(spidercls):
@ -118,8 +109,8 @@ class ContractsManager:
return methods
def extract_contracts(self, method: Callable) -> List[Contract]:
contracts: List[Contract] = []
def extract_contracts(self, method: Callable) -> list[Contract]:
contracts: list[Contract] = []
assert method.__doc__ is not None
for line in method.__doc__.split("\n"):
line = line.strip()
@ -137,8 +128,8 @@ class ContractsManager:
def from_spider(
self, spider: Spider, results: TestResult
) -> List[Optional[Request]]:
requests: List[Optional[Request]] = []
) -> list[Optional[Request]]:
requests: list[Optional[Request]] = []
for method in self.tested_methods_from_spidercls(type(spider)):
bound_method = spider.__getattribute__(method)
try:

View File

@ -1,5 +1,5 @@
import json
from typing import Any, Callable, Dict, List, Optional
from typing import Any, Callable, Optional
from itemadapter import ItemAdapter, is_item
@ -16,7 +16,7 @@ class UrlContract(Contract):
name = "url"
def adjust_request_args(self, args: Dict[str, Any]) -> Dict[str, Any]:
def adjust_request_args(self, args: dict[str, Any]) -> dict[str, Any]:
args["url"] = self.args[0]
return args
@ -30,7 +30,7 @@ class CallbackKeywordArgumentsContract(Contract):
name = "cb_kwargs"
def adjust_request_args(self, args: Dict[str, Any]) -> Dict[str, Any]:
def adjust_request_args(self, args: dict[str, Any]) -> dict[str, Any]:
args["cb_kwargs"] = json.loads(" ".join(self.args))
return args
@ -44,7 +44,7 @@ class MetadataContract(Contract):
name = "meta"
def adjust_request_args(self, args: Dict[str, Any]) -> Dict[str, Any]:
def adjust_request_args(self, args: dict[str, Any]) -> dict[str, Any]:
args["meta"] = json.loads(" ".join(self.args))
return args
@ -63,7 +63,7 @@ class ReturnsContract(Contract):
"""
name = "returns"
object_type_verifiers: Dict[Optional[str], Callable[[Any], bool]] = {
object_type_verifiers: dict[Optional[str], Callable[[Any], bool]] = {
"request": lambda x: isinstance(x, Request),
"requests": lambda x: isinstance(x, Request),
"item": is_item,
@ -90,7 +90,7 @@ class ReturnsContract(Contract):
except IndexError:
self.max_bound = float("inf")
def post_process(self, output: List[Any]) -> None:
def post_process(self, output: list[Any]) -> None:
occurrences = 0
for x in output:
if self.obj_type_verifier(x):
@ -116,7 +116,7 @@ class ScrapesContract(Contract):
name = "scrapes"
def post_process(self, output: List[Any]) -> None:
def post_process(self, output: list[Any]) -> None:
for x in output:
if is_item(x):
missing = [arg for arg in self.args if arg not in ItemAdapter(x)]

View File

@ -5,18 +5,7 @@ import warnings
from collections import deque
from datetime import datetime
from time import time
from typing import (
TYPE_CHECKING,
Any,
Deque,
Dict,
Optional,
Set,
Tuple,
TypeVar,
Union,
cast,
)
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
from twisted.internet import task
from twisted.internet.defer import Deferred
@ -55,9 +44,9 @@ class Slot:
self.randomize_delay: bool = randomize_delay
self.throttle = throttle
self.active: Set[Request] = set()
self.queue: Deque[Tuple[Request, Deferred[Response]]] = deque()
self.transferring: Set[Request] = set()
self.active: set[Request] = set()
self.queue: deque[tuple[Request, Deferred[Response]]] = deque()
self.transferring: set[Request] = set()
self.lastseen: float = 0
self.latercall = None
@ -95,7 +84,7 @@ class Slot:
def _get_concurrency_delay(
concurrency: int, spider: Spider, settings: BaseSettings
) -> Tuple[int, float]:
) -> tuple[int, float]:
delay: float = settings.getfloat("DOWNLOAD_DELAY")
if hasattr(spider, "download_delay"):
delay = spider.download_delay
@ -112,8 +101,8 @@ class Downloader:
def __init__(self, crawler: Crawler):
self.settings: BaseSettings = crawler.settings
self.signals: SignalManager = crawler.signals
self.slots: Dict[str, Slot] = {}
self.active: Set[Request] = set()
self.slots: dict[str, Slot] = {}
self.active: set[Request] = set()
self.handlers: DownloadHandlers = DownloadHandlers(crawler)
self.total_concurrency: int = self.settings.getint("CONCURRENT_REQUESTS")
self.domain_concurrency: int = self.settings.getint(
@ -126,7 +115,7 @@ class Downloader:
)
self._slot_gc_loop: task.LoopingCall = task.LoopingCall(self._slot_gc)
self._slot_gc_loop.start(60)
self.per_slot_settings: Dict[str, Dict[str, Any]] = self.settings.getdict(
self.per_slot_settings: dict[str, dict[str, Any]] = self.settings.getdict(
"DOWNLOAD_SLOTS", {}
)
@ -146,7 +135,7 @@ class Downloader:
def needs_backout(self) -> bool:
return len(self.active) >= self.total_concurrency
def _get_slot(self, request: Request, spider: Spider) -> Tuple[str, Slot]:
def _get_slot(self, request: Request, spider: Spider) -> tuple[str, Slot]:
key = self.get_slot_key(request)
if key not in self.slots:
slot_settings = self.per_slot_settings.get(key, {})

View File

@ -1,7 +1,7 @@
from __future__ import annotations
import warnings
from typing import TYPE_CHECKING, Any, List, Optional
from typing import TYPE_CHECKING, Any, Optional
from OpenSSL import SSL
from twisted.internet._sslverify import _setAcceptableProtocols
@ -154,10 +154,10 @@ class AcceptableProtocolsContextFactory:
negotiation.
"""
def __init__(self, context_factory: Any, acceptable_protocols: List[bytes]):
def __init__(self, context_factory: Any, acceptable_protocols: list[bytes]):
verifyObject(IPolicyForHTTPS, context_factory)
self._wrapped_context_factory: Any = context_factory
self._acceptable_protocols: List[bytes] = acceptable_protocols
self._acceptable_protocols: list[bytes] = acceptable_protocols
def creatorForNetloc(self, hostname: bytes, port: int) -> ClientTLSOptions:
options: ClientTLSOptions = self._wrapped_context_factory.creatorForNetloc(

View File

@ -3,18 +3,8 @@
from __future__ import annotations
import logging
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Generator,
Optional,
Protocol,
Type,
Union,
cast,
)
from collections.abc import Callable
from typing import TYPE_CHECKING, Any, Optional, Protocol, Union, cast
from twisted.internet import defer
@ -25,6 +15,8 @@ from scrapy.utils.misc import build_from_crawler, load_object
from scrapy.utils.python import without_none_values
if TYPE_CHECKING:
from collections.abc import Generator
from twisted.internet.defer import Deferred
from scrapy.crawler import Crawler
@ -43,16 +35,16 @@ class DownloadHandlerProtocol(Protocol):
class DownloadHandlers:
def __init__(self, crawler: Crawler):
self._crawler: Crawler = crawler
self._schemes: Dict[str, Union[str, Callable[..., Any]]] = (
self._schemes: dict[str, Union[str, Callable[..., Any]]] = (
{}
) # stores acceptable schemes on instancing
self._handlers: Dict[str, DownloadHandlerProtocol] = (
self._handlers: dict[str, DownloadHandlerProtocol] = (
{}
) # stores instanced handlers for schemes
self._notconfigured: Dict[str, str] = {} # remembers failed handlers
handlers: Dict[str, Union[str, Callable[..., Any]]] = without_none_values(
self._notconfigured: dict[str, str] = {} # remembers failed handlers
handlers: dict[str, Union[str, Callable[..., Any]]] = without_none_values(
cast(
Dict[str, Union[str, Callable[..., Any]]],
dict[str, Union[str, Callable[..., Any]]],
crawler.settings.getwithbase("DOWNLOAD_HANDLERS"),
)
)
@ -81,7 +73,7 @@ class DownloadHandlers:
) -> Optional[DownloadHandlerProtocol]:
path = self._schemes[scheme]
try:
dhcls: Type[DownloadHandlerProtocol] = load_object(path)
dhcls: type[DownloadHandlerProtocol] = load_object(path)
if skip_lazy and getattr(dhcls, "lazy", True):
return None
dh = build_from_crawler(

View File

@ -1,6 +1,6 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Dict
from typing import TYPE_CHECKING, Any
from w3lib.url import parse_data_uri
@ -20,7 +20,7 @@ class DataURIDownloadHandler:
uri = parse_data_uri(request.url)
respcls = responsetypes.from_mimetype(uri.media_type)
resp_kwargs: Dict[str, Any] = {}
resp_kwargs: dict[str, Any] = {}
if issubclass(respcls, TextResponse) and uri.media_type.split("/")[0] == "text":
charset = uri.media_type_parameters.get("charset")
resp_kwargs["encoding"] = charset

View File

@ -32,7 +32,7 @@ from __future__ import annotations
import re
from io import BytesIO
from typing import TYPE_CHECKING, Any, BinaryIO, Dict, Optional
from typing import TYPE_CHECKING, Any, BinaryIO, Optional
from urllib.parse import unquote
from twisted.internet.protocol import ClientCreator, Protocol
@ -79,7 +79,7 @@ _CODE_RE = re.compile(r"\d+")
class FTPDownloadHandler:
lazy = False
CODE_MAPPING: Dict[str, int] = {
CODE_MAPPING: dict[str, int] = {
"550": 404,
"default": 503,
}

View File

@ -1,9 +1,8 @@
"""Download handlers for http and https schemes
"""
"""Download handlers for http and https schemes"""
from __future__ import annotations
from typing import TYPE_CHECKING, Type
from typing import TYPE_CHECKING
from scrapy.utils.misc import build_from_crawler, load_object
from scrapy.utils.python import to_unicode
@ -27,10 +26,10 @@ class HTTP10DownloadHandler:
lazy = False
def __init__(self, settings: BaseSettings, crawler: Crawler):
self.HTTPClientFactory: Type[ScrapyHTTPClientFactory] = load_object(
self.HTTPClientFactory: type[ScrapyHTTPClientFactory] = load_object(
settings["DOWNLOADER_HTTPCLIENTFACTORY"]
)
self.ClientContextFactory: Type[ScrapyClientContextFactory] = load_object(
self.ClientContextFactory: type[ScrapyClientContextFactory] = load_object(
settings["DOWNLOADER_CLIENTCONTEXTFACTORY"]
)
self._settings: BaseSettings = settings

View File

@ -8,7 +8,7 @@ import re
from contextlib import suppress
from io import BytesIO
from time import time
from typing import TYPE_CHECKING, Any, List, Optional, Tuple, TypedDict, TypeVar, Union
from typing import TYPE_CHECKING, Any, Optional, TypedDict, TypeVar, Union
from urllib.parse import urldefrag, urlunparse
from twisted.internet import ssl
@ -52,7 +52,7 @@ _T = TypeVar("_T")
class _ResultT(TypedDict):
txresponse: TxResponse
body: bytes
flags: Optional[List[str]]
flags: Optional[list[str]]
certificate: Optional[ssl.Certificate]
ip_address: Union[ipaddress.IPv4Address, ipaddress.IPv6Address, None]
failure: NotRequired[Optional[Failure]]
@ -143,10 +143,10 @@ class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
reactor: ReactorBase,
host: str,
port: int,
proxyConf: Tuple[str, int, Optional[bytes]],
proxyConf: tuple[str, int, Optional[bytes]],
contextFactory: IPolicyForHTTPS,
timeout: float = 30,
bindAddress: Optional[Tuple[str, int]] = None,
bindAddress: Optional[tuple[str, int]] = None,
):
proxyHost, proxyPort, self._proxyAuthHeader = proxyConf
super().__init__(reactor, proxyHost, proxyPort, timeout, bindAddress)
@ -254,14 +254,14 @@ class TunnelingAgent(Agent):
self,
*,
reactor: ReactorBase,
proxyConf: Tuple[str, int, Optional[bytes]],
proxyConf: tuple[str, int, Optional[bytes]],
contextFactory: IPolicyForHTTPS,
connectTimeout: Optional[float] = None,
bindAddress: Optional[bytes] = None,
pool: Optional[HTTPConnectionPool] = None,
):
super().__init__(reactor, contextFactory, connectTimeout, bindAddress, pool)
self._proxyConf: Tuple[str, int, Optional[bytes]] = proxyConf
self._proxyConf: tuple[str, int, Optional[bytes]] = proxyConf
self._contextFactory: IPolicyForHTTPS = contextFactory
def _getEndpoint(self, uri: URI) -> TunnelingTCP4ClientEndpoint:
@ -621,7 +621,7 @@ class _ResponseReader(Protocol):
self._crawler: Crawler = crawler
def _finish_response(
self, flags: Optional[List[str]] = None, failure: Optional[Failure] = None
self, flags: Optional[list[str]] = None, failure: Optional[Failure] = None
) -> None:
self._finished.callback(
{

View File

@ -1,6 +1,6 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Optional, Type
from typing import TYPE_CHECKING, Any, Optional
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.exceptions import NotConfigured
@ -29,7 +29,7 @@ class S3DownloadHandler:
aws_access_key_id: Optional[str] = None,
aws_secret_access_key: Optional[str] = None,
aws_session_token: Optional[str] = None,
httpdownloadhandler: Type[HTTPDownloadHandler] = HTTPDownloadHandler,
httpdownloadhandler: type[HTTPDownloadHandler] = HTTPDownloadHandler,
**kw: Any,
):
if not is_botocore_available():

View File

@ -6,7 +6,8 @@ See documentation in docs/topics/downloader-middleware.rst
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Callable, Generator, List, Union, cast
from collections.abc import Callable
from typing import TYPE_CHECKING, Any, Union, cast
from twisted.internet.defer import Deferred, inlineCallbacks
@ -17,6 +18,8 @@ from scrapy.utils.conf import build_component_list
from scrapy.utils.defer import deferred_from_coro, mustbe_deferred
if TYPE_CHECKING:
from collections.abc import Generator
from twisted.python.failure import Failure
from scrapy import Spider
@ -27,7 +30,7 @@ class DownloaderMiddlewareManager(MiddlewareManager):
component_name = "downloader middleware"
@classmethod
def _get_mwlist_from_settings(cls, settings: BaseSettings) -> List[Any]:
def _get_mwlist_from_settings(cls, settings: BaseSettings) -> list[Any]:
return build_component_list(settings.getwithbase("DOWNLOADER_MIDDLEWARES"))
def _add_middleware(self, mw: Any) -> None:

View File

@ -1,5 +1,5 @@
import logging
from typing import Any, Dict
from typing import Any
from OpenSSL import SSL
from service_identity.exceptions import CertificateError
@ -21,7 +21,7 @@ METHOD_TLSv11 = "TLSv1.1"
METHOD_TLSv12 = "TLSv1.2"
openssl_methods: Dict[str, int] = {
openssl_methods: dict[str, int] = {
METHOD_TLS: SSL.SSLv23_METHOD, # protocol negotiation (recommended)
METHOD_TLSv10: SSL.TLSv1_METHOD, # TLS 1.0 only
METHOD_TLSv11: SSL.TLSv1_1_METHOD, # TLS 1.1 only

View File

@ -2,7 +2,7 @@ from __future__ import annotations
import re
from time import time
from typing import TYPE_CHECKING, Optional, Tuple
from typing import TYPE_CHECKING, Optional
from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse
from twisted.internet import defer
@ -18,7 +18,7 @@ if TYPE_CHECKING:
from scrapy import Request
def _parsed_url_args(parsed: ParseResult) -> Tuple[bytes, bytes, bytes, int, bytes]:
def _parsed_url_args(parsed: ParseResult) -> tuple[bytes, bytes, bytes, int, bytes]:
# Assume parsed is urlparse-d from Request.url,
# which was passed via safe_url_string and is ascii-only.
path_str = urlunparse(("", "", parsed.path or "/", parsed.params, parsed.query, ""))
@ -33,7 +33,7 @@ def _parsed_url_args(parsed: ParseResult) -> Tuple[bytes, bytes, bytes, int, byt
return scheme, netloc, host, port, path
def _parse(url: str) -> Tuple[bytes, bytes, bytes, int, bytes]:
def _parse(url: str) -> tuple[bytes, bytes, bytes, int, bytes]:
"""Return tuple of (scheme, netloc, host, port, path),
all in bytes except for port which is int.
Assume url is from Request.url, which was passed via safe_url_string

View File

@ -9,20 +9,7 @@ from __future__ import annotations
import logging
from time import time
from typing import (
TYPE_CHECKING,
Any,
Callable,
Generator,
Iterable,
Iterator,
Optional,
Set,
Type,
TypeVar,
Union,
cast,
)
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
from itemadapter import is_item
from twisted.internet.defer import Deferred, inlineCallbacks, succeed
@ -42,6 +29,8 @@ from scrapy.utils.misc import build_from_crawler, load_object
from scrapy.utils.reactor import CallLaterOnce
if TYPE_CHECKING:
from collections.abc import Callable, Generator, Iterable, Iterator
from scrapy.core.scheduler import BaseScheduler
from scrapy.core.scraper import _HandleOutputDeferred
from scrapy.crawler import Crawler
@ -63,7 +52,7 @@ class Slot:
scheduler: BaseScheduler,
) -> None:
self.closing: Optional[Deferred[None]] = None
self.inprogress: Set[Request] = set()
self.inprogress: set[Request] = set()
self.start_requests: Optional[Iterator[Request]] = iter(start_requests)
self.close_if_idle: bool = close_if_idle
self.nextcall: CallLaterOnce[None] = nextcall
@ -106,10 +95,10 @@ class ExecutionEngine:
self.spider: Optional[Spider] = None
self.running: bool = False
self.paused: bool = False
self.scheduler_cls: Type[BaseScheduler] = self._get_scheduler_class(
self.scheduler_cls: type[BaseScheduler] = self._get_scheduler_class(
crawler.settings
)
downloader_cls: Type[Downloader] = load_object(self.settings["DOWNLOADER"])
downloader_cls: type[Downloader] = load_object(self.settings["DOWNLOADER"])
self.downloader: Downloader = downloader_cls(crawler)
self.scraper = Scraper(crawler)
self._spider_closed_callback: Callable[[Spider], Optional[Deferred[None]]] = (
@ -117,10 +106,10 @@ class ExecutionEngine:
)
self.start_time: Optional[float] = None
def _get_scheduler_class(self, settings: BaseSettings) -> Type[BaseScheduler]:
def _get_scheduler_class(self, settings: BaseSettings) -> type[BaseScheduler]:
from scrapy.core.scheduler import BaseScheduler
scheduler_cls: Type[BaseScheduler] = load_object(settings["SCHEDULER"])
scheduler_cls: type[BaseScheduler] = load_object(settings["SCHEDULER"])
if not issubclass(scheduler_cls, BaseScheduler):
raise TypeError(
f"The provided scheduler class ({settings['SCHEDULER']})"

View File

@ -1,7 +1,7 @@
from __future__ import annotations
from collections import deque
from typing import TYPE_CHECKING, Deque, Dict, List, Optional, Tuple
from typing import TYPE_CHECKING, Optional
from twisted.internet import defer
from twisted.internet.defer import Deferred
@ -26,7 +26,7 @@ if TYPE_CHECKING:
from scrapy.spiders import Spider
ConnectionKeyT = Tuple[bytes, bytes, int]
ConnectionKeyT = tuple[bytes, bytes, int]
class H2ConnectionPool:
@ -36,11 +36,11 @@ class H2ConnectionPool:
# Store a dictionary which is used to get the respective
# H2ClientProtocolInstance using the key as Tuple(scheme, hostname, port)
self._connections: Dict[ConnectionKeyT, H2ClientProtocol] = {}
self._connections: dict[ConnectionKeyT, H2ClientProtocol] = {}
# Save all requests that arrive before the connection is established
self._pending_requests: Dict[
ConnectionKeyT, Deque[Deferred[H2ClientProtocol]]
self._pending_requests: dict[
ConnectionKeyT, deque[Deferred[H2ClientProtocol]]
] = {}
def get_connection(
@ -68,7 +68,7 @@ class H2ConnectionPool:
) -> Deferred[H2ClientProtocol]:
self._pending_requests[key] = deque()
conn_lost_deferred: Deferred[List[BaseException]] = Deferred()
conn_lost_deferred: Deferred[list[BaseException]] = Deferred()
conn_lost_deferred.addCallback(self._remove_connection, key)
factory = H2ClientFactory(uri, self.settings, conn_lost_deferred)
@ -94,7 +94,7 @@ class H2ConnectionPool:
return conn
def _remove_connection(
self, errors: List[BaseException], key: ConnectionKeyT
self, errors: list[BaseException], key: ConnectionKeyT
) -> None:
self._connections.pop(key)

View File

@ -4,7 +4,7 @@ import ipaddress
import itertools
import logging
from collections import deque
from typing import TYPE_CHECKING, Any, Deque, Dict, List, Optional, Union
from typing import TYPE_CHECKING, Any, Optional, Union
from h2.config import H2Configuration
from h2.connection import H2Connection
@ -91,7 +91,7 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
self,
uri: URI,
settings: Settings,
conn_lost_deferred: Deferred[List[BaseException]],
conn_lost_deferred: Deferred[list[BaseException]],
) -> None:
"""
Arguments:
@ -102,7 +102,7 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
conn_lost_deferred -- Deferred fires with the reason: Failure to notify
that connection was lost
"""
self._conn_lost_deferred: Deferred[List[BaseException]] = conn_lost_deferred
self._conn_lost_deferred: Deferred[list[BaseException]] = conn_lost_deferred
config = H2Configuration(client_side=True, header_encoding="utf-8")
self.conn = H2Connection(config=config)
@ -113,19 +113,19 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
self._stream_id_generator = itertools.count(start=1, step=2)
# Streams are stored in a dictionary keyed off their stream IDs
self.streams: Dict[int, Stream] = {}
self.streams: dict[int, Stream] = {}
# If requests are received before connection is made we keep
# all requests in a pool and send them as the connection is made
self._pending_request_stream_pool: Deque[Stream] = deque()
self._pending_request_stream_pool: deque[Stream] = deque()
# Save an instance of errors raised which lead to losing the connection
# We pass these instances to the streams ResponseFailed() failure
self._conn_lost_errors: List[BaseException] = []
self._conn_lost_errors: list[BaseException] = []
# Some meta data of this connection
# initialized when connection is successfully made
self.metadata: Dict[str, Any] = {
self.metadata: dict[str, Any] = {
# Peer certificate instance
"certificate": None,
# Address of the server we are connected to which
@ -250,7 +250,7 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
self.conn.initiate_connection()
self._write_to_transport()
def _lose_connection_with_error(self, errors: List[BaseException]) -> None:
def _lose_connection_with_error(self, errors: list[BaseException]) -> None:
"""Helper function to lose the connection with the error sent as a
reason"""
self._conn_lost_errors += errors
@ -353,7 +353,7 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
self._pending_request_stream_pool.clear()
self.conn.close_connection()
def _handle_events(self, events: List[Event]) -> None:
def _handle_events(self, events: list[Event]) -> None:
"""Private method which acts as a bridge between the events
received from the HTTP/2 data and IH2EventsHandler
@ -442,7 +442,7 @@ class H2ClientFactory(Factory):
self,
uri: URI,
settings: Settings,
conn_lost_deferred: Deferred[List[BaseException]],
conn_lost_deferred: Deferred[list[BaseException]],
) -> None:
self.uri = uri
self.settings = settings
@ -451,5 +451,5 @@ class H2ClientFactory(Factory):
def buildProtocol(self, addr: IAddress) -> H2ClientProtocol:
return H2ClientProtocol(self.uri, self.settings, self.conn_lost_deferred)
def acceptableProtocols(self) -> List[bytes]:
def acceptableProtocols(self) -> list[bytes]:
return [PROTOCOL_NAME]

View File

@ -3,7 +3,7 @@ from __future__ import annotations
import logging
from enum import Enum
from io import BytesIO
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
from typing import TYPE_CHECKING, Any, Optional
from h2.errors import ErrorCodes
from h2.exceptions import H2Error, ProtocolError, StreamClosedError
@ -113,7 +113,7 @@ class Stream:
# Metadata of an HTTP/2 connection stream
# initialized when stream is instantiated
self.metadata: Dict[str, Any] = {
self.metadata: dict[str, Any] = {
"request_content_length": (
0 if self._request.body is None else len(self._request.body)
),
@ -134,7 +134,7 @@ class Stream:
# Private variable used to build the response
# this response is then converted to appropriate Response class
# passed to the response deferred callback
self._response: Dict[str, Any] = {
self._response: dict[str, Any] = {
# Data received frame by frame from the server is appended
# and passed to the response Deferred when completely received.
"body": BytesIO(),
@ -196,7 +196,7 @@ class Stream:
== f'{self._protocol.metadata["ip_address"]}:{self._protocol.metadata["uri"].port}'
)
def _get_request_headers(self) -> List[Tuple[str, str]]:
def _get_request_headers(self) -> list[tuple[str, str]]:
url = urlparse_cached(self._request)
path = url.path
@ -349,7 +349,7 @@ class Stream:
self._response["flow_controlled_size"], self.stream_id
)
def receive_headers(self, headers: List[HeaderTuple]) -> None:
def receive_headers(self, headers: list[HeaderTuple]) -> None:
for name, value in headers:
self._response["headers"].appendlist(name, value)
@ -382,7 +382,7 @@ class Stream:
def close(
self,
reason: StreamCloseReason,
errors: Optional[List[BaseException]] = None,
errors: Optional[list[BaseException]] = None,
from_protocol: bool = False,
) -> None:
"""Based on the reason sent we will handle each case."""

View File

@ -4,7 +4,7 @@ import json
import logging
from abc import abstractmethod
from pathlib import Path
from typing import TYPE_CHECKING, Any, List, Optional, Type, cast
from typing import TYPE_CHECKING, Any, Optional, cast
# working around https://github.com/sphinx-doc/sphinx/issues/10400
from twisted.internet.defer import Deferred # noqa: TC002
@ -182,18 +182,18 @@ class Scheduler(BaseScheduler):
self,
dupefilter: BaseDupeFilter,
jobdir: Optional[str] = None,
dqclass: Optional[Type[BaseQueue]] = None,
mqclass: Optional[Type[BaseQueue]] = None,
dqclass: Optional[type[BaseQueue]] = None,
mqclass: Optional[type[BaseQueue]] = None,
logunser: bool = False,
stats: Optional[StatsCollector] = None,
pqclass: Optional[Type[ScrapyPriorityQueue]] = None,
pqclass: Optional[type[ScrapyPriorityQueue]] = None,
crawler: Optional[Crawler] = None,
):
self.df: BaseDupeFilter = dupefilter
self.dqdir: Optional[str] = self._dqdir(jobdir)
self.pqclass: Optional[Type[ScrapyPriorityQueue]] = pqclass
self.dqclass: Optional[Type[BaseQueue]] = dqclass
self.mqclass: Optional[Type[BaseQueue]] = mqclass
self.pqclass: Optional[type[ScrapyPriorityQueue]] = pqclass
self.dqclass: Optional[type[BaseQueue]] = dqclass
self.mqclass: Optional[type[BaseQueue]] = mqclass
self.logunser: bool = logunser
self.stats: Optional[StatsCollector] = stats
self.crawler: Optional[Crawler] = crawler
@ -364,13 +364,13 @@ class Scheduler(BaseScheduler):
return str(dqdir)
return None
def _read_dqs_state(self, dqdir: str) -> List[int]:
def _read_dqs_state(self, dqdir: str) -> list[int]:
path = Path(dqdir, "active.json")
if not path.exists():
return []
with path.open(encoding="utf-8") as f:
return cast(List[int], json.load(f))
return cast(list[int], json.load(f))
def _write_dqs_state(self, dqdir: str, state: List[int]) -> None:
def _write_dqs_state(self, dqdir: str, state: list[int]) -> None:
with Path(dqdir, "active.json").open("w", encoding="utf-8") as f:
json.dump(state, f)

View File

@ -5,23 +5,8 @@ from __future__ import annotations
import logging
from collections import deque
from typing import (
TYPE_CHECKING,
Any,
AsyncIterable,
Deque,
Generator,
Iterable,
Iterator,
List,
Optional,
Set,
Tuple,
Type,
TypeVar,
Union,
cast,
)
from collections.abc import AsyncIterable, Iterator
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
from itemadapter import is_item
from twisted.internet.defer import Deferred, inlineCallbacks
@ -47,6 +32,8 @@ from scrapy.utils.misc import load_object, warn_on_generator_with_return_value
from scrapy.utils.spider import iterate_spider_output
if TYPE_CHECKING:
from collections.abc import Generator, Iterable
from scrapy.crawler import Crawler
@ -54,12 +41,12 @@ logger = logging.getLogger(__name__)
_T = TypeVar("_T")
_ParallelResult = List[Tuple[bool, Iterator[Any]]]
_ParallelResult = list[tuple[bool, Iterator[Any]]]
if TYPE_CHECKING:
# parameterized Deferreds require Twisted 21.7.0
_HandleOutputDeferred = Deferred[Union[_ParallelResult, None]]
QueueTuple = Tuple[Union[Response, Failure], Request, _HandleOutputDeferred]
QueueTuple = tuple[Union[Response, Failure], Request, _HandleOutputDeferred]
class Slot:
@ -69,8 +56,8 @@ class Slot:
def __init__(self, max_active_size: int = 5000000):
self.max_active_size = max_active_size
self.queue: Deque[QueueTuple] = deque()
self.active: Set[Request] = set()
self.queue: deque[QueueTuple] = deque()
self.active: set[Request] = set()
self.active_size: int = 0
self.itemproc_size: int = 0
self.closing: Optional[Deferred[Spider]] = None
@ -113,7 +100,7 @@ class Scraper:
self.spidermw: SpiderMiddlewareManager = SpiderMiddlewareManager.from_crawler(
crawler
)
itemproc_cls: Type[ItemPipelineManager] = load_object(
itemproc_cls: type[ItemPipelineManager] = load_object(
crawler.settings["ITEM_PROCESSOR"]
)
self.itemproc: ItemPipelineManager = itemproc_cls.from_crawler(crawler)

View File

@ -7,22 +7,10 @@ See documentation in docs/topics/spider-middleware.rst
from __future__ import annotations
import logging
from collections.abc import AsyncIterable, Callable, Iterable
from inspect import isasyncgenfunction, iscoroutine
from itertools import islice
from typing import (
TYPE_CHECKING,
Any,
AsyncIterable,
Callable,
Generator,
Iterable,
List,
Optional,
Tuple,
TypeVar,
Union,
cast,
)
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
from twisted.internet.defer import Deferred, inlineCallbacks
from twisted.python.failure import Failure
@ -42,6 +30,8 @@ from scrapy.utils.defer import (
from scrapy.utils.python import MutableAsyncChain, MutableChain
if TYPE_CHECKING:
from collections.abc import Generator
from scrapy.settings import BaseSettings
@ -66,7 +56,7 @@ class SpiderMiddlewareManager(MiddlewareManager):
self.downgrade_warning_done = False
@classmethod
def _get_mwlist_from_settings(cls, settings: BaseSettings) -> List[Any]:
def _get_mwlist_from_settings(cls, settings: BaseSettings) -> list[Any]:
return build_component_list(settings.getwithbase("SPIDER_MIDDLEWARES"))
def _add_middleware(self, mw: Any) -> None:
@ -349,7 +339,7 @@ class SpiderMiddlewareManager(MiddlewareManager):
@staticmethod
def _get_async_method_pair(
mw: Any, methodname: str
) -> Union[None, Callable, Tuple[Callable, Callable]]:
) -> Union[None, Callable, tuple[Callable, Callable]]:
normal_method: Optional[Callable] = getattr(mw, methodname, None)
methodname_async = methodname + "_async"
async_method: Optional[Callable] = getattr(mw, methodname_async, None)

View File

@ -4,18 +4,7 @@ import logging
import pprint
import signal
import warnings
from typing import (
TYPE_CHECKING,
Any,
Dict,
Generator,
Optional,
Set,
Type,
TypeVar,
Union,
cast,
)
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
from twisted.internet.defer import (
Deferred,
@ -53,6 +42,8 @@ from scrapy.utils.reactor import (
)
if TYPE_CHECKING:
from collections.abc import Generator
from scrapy.utils.request import RequestFingerprinter
@ -64,8 +55,8 @@ _T = TypeVar("_T")
class Crawler:
def __init__(
self,
spidercls: Type[Spider],
settings: Union[None, Dict[str, Any], Settings] = None,
spidercls: type[Spider],
settings: Union[None, dict[str, Any], Settings] = None,
init_reactor: bool = False,
):
if isinstance(spidercls, Spider):
@ -74,7 +65,7 @@ class Crawler:
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
self.spidercls: Type[Spider] = spidercls
self.spidercls: type[Spider] = spidercls
self.settings: Settings = settings.copy()
self.spidercls.update_settings(self.settings)
self._update_root_log_handler()
@ -112,7 +103,7 @@ class Crawler:
self.__remove_handler = lambda: logging.root.removeHandler(handler)
self.signals.connect(self.__remove_handler, signals.engine_stopped)
lf_cls: Type[LogFormatter] = load_object(self.settings["LOG_FORMATTER"])
lf_cls: type[LogFormatter] = load_object(self.settings["LOG_FORMATTER"])
self.logformatter = lf_cls.from_crawler(self)
self.request_fingerprinter = build_from_crawler(
@ -256,18 +247,18 @@ class CrawlerRunner:
verifyClass(ISpiderLoader, loader_cls)
return loader_cls.from_settings(settings.frozencopy())
def __init__(self, settings: Union[Dict[str, Any], Settings, None] = None):
def __init__(self, settings: Union[dict[str, Any], Settings, None] = None):
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
self.settings = settings
self.spider_loader = self._get_spider_loader(settings)
self._crawlers: Set[Crawler] = set()
self._active: Set[Deferred[None]] = set()
self._crawlers: set[Crawler] = set()
self._active: set[Deferred[None]] = set()
self.bootstrap_failed = False
def crawl(
self,
crawler_or_spidercls: Union[Type[Spider], str, Crawler],
crawler_or_spidercls: Union[type[Spider], str, Crawler],
*args: Any,
**kwargs: Any,
) -> Deferred[None]:
@ -314,7 +305,7 @@ class CrawlerRunner:
return d.addBoth(_done)
def create_crawler(
self, crawler_or_spidercls: Union[Type[Spider], str, Crawler]
self, crawler_or_spidercls: Union[type[Spider], str, Crawler]
) -> Crawler:
"""
Return a :class:`~scrapy.crawler.Crawler` object.
@ -335,11 +326,11 @@ class CrawlerRunner:
return crawler_or_spidercls
return self._create_crawler(crawler_or_spidercls)
def _create_crawler(self, spidercls: Union[str, Type[Spider]]) -> Crawler:
def _create_crawler(self, spidercls: Union[str, type[Spider]]) -> Crawler:
if isinstance(spidercls, str):
spidercls = self.spider_loader.load(spidercls)
# temporary cast until self.spider_loader is typed
return Crawler(cast(Type[Spider], spidercls), self.settings)
return Crawler(cast(type[Spider], spidercls), self.settings)
def stop(self) -> Deferred[Any]:
"""
@ -387,7 +378,7 @@ class CrawlerProcess(CrawlerRunner):
def __init__(
self,
settings: Union[Dict[str, Any], Settings, None] = None,
settings: Union[dict[str, Any], Settings, None] = None,
install_root_handler: bool = True,
):
super().__init__(settings)
@ -416,14 +407,14 @@ class CrawlerProcess(CrawlerRunner):
)
reactor.callFromThread(self._stop_reactor)
def _create_crawler(self, spidercls: Union[Type[Spider], str]) -> Crawler:
def _create_crawler(self, spidercls: Union[type[Spider], str]) -> Crawler:
if isinstance(spidercls, str):
spidercls = self.spider_loader.load(spidercls)
init_reactor = not self._initialized_reactor
self._initialized_reactor = True
# temporary cast until self.spider_loader is typed
return Crawler(
cast(Type[Spider], spidercls), self.settings, init_reactor=init_reactor
cast(type[Spider], spidercls), self.settings, init_reactor=init_reactor
)
def start(

View File

@ -2,7 +2,7 @@ from __future__ import annotations
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any, DefaultDict, Iterable, Optional, Sequence, Union
from typing import TYPE_CHECKING, Any, Optional, Union
from tldextract import TLDExtract
@ -13,6 +13,7 @@ from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_unicode
if TYPE_CHECKING:
from collections.abc import Iterable, Sequence
from http.cookiejar import Cookie
# typing.Self requires Python 3.11
@ -39,7 +40,7 @@ class CookiesMiddleware:
"""This middleware enables working with sites that need cookies"""
def __init__(self, debug: bool = False):
self.jars: DefaultDict[Any, CookieJar] = defaultdict(CookieJar)
self.jars: defaultdict[Any, CookieJar] = defaultdict(CookieJar)
self.debug: bool = debug
@classmethod
@ -6,11 +6,13 @@ See documentation in docs/topics/downloader-middleware.rst
from __future__ import annotations
from typing import TYPE_CHECKING, Iterable, Tuple, Union
from typing import TYPE_CHECKING, Union
from scrapy.utils.python import without_none_values
if TYPE_CHECKING:
from collections.abc import Iterable
# typing.Self requires Python 3.11
from typing_extensions import Self
@ -20,8 +22,8 @@ if TYPE_CHECKING:
class DefaultHeadersMiddleware:
def __init__(self, headers: Iterable[Tuple[str, str]]):
self._headers: Iterable[Tuple[str, str]] = headers
def __init__(self, headers: Iterable[tuple[str, str]]):
self._headers: Iterable[tuple[str, str]] = headers
@classmethod
def from_crawler(cls, crawler: Crawler) -> Self:
@ -3,7 +3,7 @@ from __future__ import annotations
import warnings
from itertools import chain
from logging import getLogger
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Optional, Union
from scrapy import Request, Spider, signals
from scrapy.exceptions import IgnoreRequest, NotConfigured
@ -28,7 +28,7 @@ if TYPE_CHECKING:
logger = getLogger(__name__)
ACCEPTED_ENCODINGS: List[bytes] = [b"gzip", b"deflate"]
ACCEPTED_ENCODINGS: list[bytes] = [b"gzip", b"deflate"]
try:
try:
@ -50,7 +50,7 @@ else:
class HttpCompressionMiddleware:
"""This middleware allows compressed (gzip, deflate) traffic to be
sent/received from web sites"""
sent/received from websites"""
def __init__(
self,
@ -140,7 +140,7 @@ class HttpCompressionMiddleware:
respcls = responsetypes.from_args(
headers=response.headers, url=response.url, body=decoded_body
)
kwargs: Dict[str, Any] = {"body": decoded_body}
kwargs: dict[str, Any] = {"body": decoded_body}
if issubclass(respcls, TextResponse):
# force recalculating the encoding until we make sure the
# responsetypes guessing is reliable
@ -152,23 +152,23 @@ class HttpCompressionMiddleware:
return response
def _handle_encoding(
self, body: bytes, content_encoding: List[bytes], max_size: int
) -> Tuple[bytes, List[bytes]]:
self, body: bytes, content_encoding: list[bytes], max_size: int
) -> tuple[bytes, list[bytes]]:
to_decode, to_keep = self._split_encodings(content_encoding)
for encoding in to_decode:
body = self._decode(body, encoding, max_size)
return body, to_keep
def _split_encodings(
self, content_encoding: List[bytes]
) -> Tuple[List[bytes], List[bytes]]:
to_keep: List[bytes] = [
self, content_encoding: list[bytes]
) -> tuple[list[bytes], list[bytes]]:
to_keep: list[bytes] = [
encoding.strip().lower()
for encoding in chain.from_iterable(
encodings.split(b",") for encodings in content_encoding
)
]
to_decode: List[bytes] = []
to_decode: list[bytes] = []
while to_keep:
encoding = to_keep.pop()
if encoding not in ACCEPTED_ENCODINGS:
@ -1,7 +1,7 @@
from __future__ import annotations
import base64
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Optional, Union
from urllib.parse import unquote, urlunparse
from urllib.request import ( # type: ignore[attr-defined]
_parse_proxy,
@ -25,7 +25,7 @@ if TYPE_CHECKING:
class HttpProxyMiddleware:
def __init__(self, auth_encoding: Optional[str] = "latin-1"):
self.auth_encoding: Optional[str] = auth_encoding
self.proxies: Dict[str, Tuple[Optional[bytes], str]] = {}
self.proxies: dict[str, tuple[Optional[bytes], str]] = {}
for type_, url in getproxies().items():
try:
self.proxies[type_] = self._get_proxy(url, type_)
@ -47,7 +47,7 @@ class HttpProxyMiddleware:
)
return base64.b64encode(user_pass)
def _get_proxy(self, url: str, orig_type: str) -> Tuple[Optional[bytes], str]:
def _get_proxy(self, url: str, orig_type: str) -> tuple[Optional[bytes], str]:
proxy_type, user, password, hostport = _parse_proxy(url)
proxy_url = urlunparse((proxy_type or orig_type, hostport, "", "", "", ""))
@ -3,7 +3,7 @@ from __future__ import annotations
import logging
import re
import warnings
from typing import TYPE_CHECKING, Set
from typing import TYPE_CHECKING
from scrapy import Request, Spider, signals
from scrapy.exceptions import IgnoreRequest
@ -31,7 +31,7 @@ class OffsiteMiddleware:
def __init__(self, stats: StatsCollector):
self.stats = stats
self.domains_seen: Set[str] = set()
self.domains_seen: set[str] = set()
def spider_opened(self, spider: Spider) -> None:
self.host_regex: re.Pattern[str] = self.get_host_regex(spider)
@ -1,7 +1,7 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Any, List, Union, cast
from typing import TYPE_CHECKING, Any, Union, cast
from urllib.parse import urljoin
from w3lib.url import safe_url_string
@ -180,7 +180,7 @@ class MetaRefreshMiddleware(BaseRedirectMiddleware):
def __init__(self, settings: BaseSettings):
super().__init__(settings)
self._ignore_tags: List[str] = settings.getlist("METAREFRESH_IGNORE_TAGS")
self._ignore_tags: list[str] = settings.getlist("METAREFRESH_IGNORE_TAGS")
self._maxdelay: int = settings.getint("METAREFRESH_MAXDELAY")
def process_response(
@ -7,14 +7,14 @@ RETRY_TIMES - how many times to retry a failed page
RETRY_HTTP_CODES - which HTTP response codes to retry
Failed pages are collected on the scraping process and rescheduled at the end,
once the spider has finished crawling all regular (non failed) pages.
once the spider has finished crawling all regular (non-failed) pages.
"""
from __future__ import annotations
import warnings
from logging import Logger, getLogger
from typing import TYPE_CHECKING, Any, Optional, Tuple, Type, Union
from typing import TYPE_CHECKING, Any, Optional, Union
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
from scrapy.settings import BaseSettings, Settings
@ -35,7 +35,7 @@ if TYPE_CHECKING:
retry_logger = getLogger(__name__)
def backwards_compatibility_getattr(self: Any, name: str) -> Tuple[Any, ...]:
def backwards_compatibility_getattr(self: Any, name: str) -> tuple[Any, ...]:
if name == "EXCEPTIONS_TO_RETRY":
warnings.warn(
"Attribute RetryMiddleware.EXCEPTIONS_TO_RETRY is deprecated. "
@ -60,7 +60,7 @@ def get_retry_request(
request: Request,
*,
spider: Spider,
reason: Union[str, Exception, Type[Exception]] = "unspecified",
reason: Union[str, Exception, type[Exception]] = "unspecified",
max_retry_times: Optional[int] = None,
priority_adjust: Optional[int] = None,
logger: Logger = retry_logger,
@ -187,7 +187,7 @@ class RetryMiddleware(metaclass=BackwardsCompatibilityMetaclass):
def _retry(
self,
request: Request,
reason: Union[str, Exception, Type[Exception]],
reason: Union[str, Exception, type[Exception]],
spider: Spider,
) -> Optional[Request]:
max_retry_times = request.meta.get("max_retry_times", self.max_retry_times)
@ -7,7 +7,7 @@ enable this middleware and enable the ROBOTSTXT_OBEY setting.
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Dict, Optional, TypeVar, Union
from typing import TYPE_CHECKING, Optional, TypeVar, Union
from twisted.internet.defer import Deferred, maybeDeferred
@ -45,7 +45,7 @@ class RobotsTxtMiddleware:
"ROBOTSTXT_USER_AGENT", None
)
self.crawler: Crawler = crawler
self._parsers: Dict[
self._parsers: dict[
str, Union[RobotParser, Deferred[Optional[RobotParser]], None]
] = {}
self._parserimpl: RobotParser = load_object(
@ -1,6 +1,6 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Dict, List, Tuple, Union
from typing import TYPE_CHECKING, Union
from twisted.web import http
@ -19,7 +19,7 @@ if TYPE_CHECKING:
def get_header_size(
headers: Dict[str, Union[List[Union[str, bytes]], Tuple[Union[str, bytes], ...]]]
headers: dict[str, Union[list[Union[str, bytes]], tuple[Union[str, bytes], ...]]]
) -> int:
size = 0
for key, value in headers.items():
@ -2,7 +2,7 @@ from __future__ import annotations
import logging
from pathlib import Path
from typing import TYPE_CHECKING, Optional, Set
from typing import TYPE_CHECKING, Optional
from scrapy.utils.job import job_dir
from scrapy.utils.request import (
@ -56,7 +56,7 @@ class RFPDupeFilter(BaseDupeFilter):
self.fingerprinter: RequestFingerprinterProtocol = (
fingerprinter or RequestFingerprinter()
)
self.fingerprints: Set[str] = set()
self.fingerprints: set[str] = set()
self.logdupes = True
self.debug = debug
self.logger = logging.getLogger(__name__)
@ -6,9 +6,10 @@ import csv
import marshal
import pickle # nosec
import pprint
from collections.abc import Callable, Iterable, Mapping
from io import BytesIO, TextIOWrapper
from json import JSONEncoder
from typing import Any, Callable, Dict, Iterable, Mapping, Optional, Tuple, Union
from typing import Any, Optional, Union
from xml.sax.saxutils import XMLGenerator # nosec
from xml.sax.xmlreader import AttributesImpl # nosec
@ -32,10 +33,10 @@ __all__ = [
class BaseItemExporter:
def __init__(self, *, dont_fail: bool = False, **kwargs: Any):
self._kwargs: Dict[str, Any] = kwargs
self._kwargs: dict[str, Any] = kwargs
self._configure(kwargs, dont_fail=dont_fail)
def _configure(self, options: Dict[str, Any], dont_fail: bool = False) -> None:
def _configure(self, options: dict[str, Any], dont_fail: bool = False) -> None:
"""Configure the exporter by popping options from the ``options`` dict.
If dont_fail is set, it won't raise an exception on unexpected options
(useful for using with keyword arguments in subclasses ``__init__`` methods)
@ -66,7 +67,7 @@ class BaseItemExporter:
def _get_serialized_fields(
self, item: Any, default_value: Any = None, include_empty: Optional[bool] = None
) -> Iterable[Tuple[str, Any]]:
) -> Iterable[tuple[str, Any]]:
"""Return the fields to export as an iterable of tuples
(name, serialized_value)
"""
@ -339,7 +340,7 @@ class PythonItemExporter(BaseItemExporter):
.. _msgpack: https://pypi.org/project/msgpack/
"""
def _configure(self, options: Dict[str, Any], dont_fail: bool = False) -> None:
def _configure(self, options: dict[str, Any], dont_fail: bool = False) -> None:
super()._configure(options, dont_fail)
if not self.encoding:
self.encoding = "utf-8"
@ -363,10 +364,10 @@ class PythonItemExporter(BaseItemExporter):
return to_unicode(value, encoding=self.encoding)
return value
def _serialize_item(self, item: Any) -> Iterable[Tuple[Union[str, bytes], Any]]:
def _serialize_item(self, item: Any) -> Iterable[tuple[Union[str, bytes], Any]]:
for key, value in ItemAdapter(item).items():
yield key, self._serialize_value(value)
def export_item(self, item: Any) -> Dict[Union[str, bytes], Any]: # type: ignore[override]
result: Dict[Union[str, bytes], Any] = dict(self._get_serialized_fields(item))
def export_item(self, item: Any) -> dict[Union[str, bytes], Any]: # type: ignore[override]
result: dict[Union[str, bytes], Any] = dict(self._get_serialized_fields(item))
return result
@ -6,7 +6,7 @@ See documentation in docs/topics/extensions.rst
from __future__ import annotations
from typing import TYPE_CHECKING, Any, List
from typing import TYPE_CHECKING, Any
from scrapy.middleware import MiddlewareManager
from scrapy.utils.conf import build_component_list
@ -19,5 +19,5 @@ class ExtensionManager(MiddlewareManager):
component_name = "extension"
@classmethod
def _get_mwlist_from_settings(cls, settings: Settings) -> List[Any]:
def _get_mwlist_from_settings(cls, settings: Settings) -> list[Any]:
return build_component_list(settings.getwithbase("EXTENSIONS"))
@ -8,7 +8,7 @@ from __future__ import annotations
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any, DefaultDict, Dict
from typing import TYPE_CHECKING, Any
from scrapy import Request, Spider, signals
from scrapy.exceptions import NotConfigured
@ -30,7 +30,7 @@ class CloseSpider:
def __init__(self, crawler: Crawler):
self.crawler: Crawler = crawler
self.close_on: Dict[str, Any] = {
self.close_on: dict[str, Any] = {
"timeout": crawler.settings.getfloat("CLOSESPIDER_TIMEOUT"),
"itemcount": crawler.settings.getint("CLOSESPIDER_ITEMCOUNT"),
"pagecount": crawler.settings.getint("CLOSESPIDER_PAGECOUNT"),
@ -44,7 +44,7 @@ class CloseSpider:
if not any(self.close_on.values()):
raise NotConfigured
self.counter: DefaultDict[str, int] = defaultdict(int)
self.counter: defaultdict[str, int] = defaultdict(int)
if self.close_on.get("errorcount"):
crawler.signals.connect(self.error_count, signal=signals.spider_error)
@ -10,25 +10,11 @@ import logging
import re
import sys
import warnings
from collections.abc import Callable
from datetime import datetime, timezone
from pathlib import Path, PureWindowsPath
from tempfile import NamedTemporaryFile
from typing import (
IO,
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
List,
Optional,
Protocol,
Tuple,
Type,
TypeVar,
Union,
cast,
)
from typing import IO, TYPE_CHECKING, Any, Optional, Protocol, TypeVar, Union, cast
from urllib.parse import unquote, urlparse
from twisted.internet.defer import Deferred, DeferredList, maybeDeferred
@ -50,6 +36,8 @@ from scrapy.utils.misc import build_from_crawler, load_object
from scrapy.utils.python import without_none_values
if TYPE_CHECKING:
from collections.abc import Iterable
from _typeshed import OpenBinaryMode
from twisted.python.failure import Failure
@ -70,7 +58,7 @@ except ImportError:
logger = logging.getLogger(__name__)
UriParamsCallableT = Callable[[Dict[str, Any], Spider], Optional[Dict[str, Any]]]
UriParamsCallableT = Callable[[dict[str, Any], Spider], Optional[dict[str, Any]]]
_StorageT = TypeVar("_StorageT", bound="FeedStorageProtocol")
@ -79,7 +67,7 @@ def build_storage(
builder: Callable[..., _StorageT],
uri: str,
*args: Any,
feed_options: Optional[Dict[str, Any]] = None,
feed_options: Optional[dict[str, Any]] = None,
preargs: Iterable[Any] = (),
**kwargs: Any,
) -> _StorageT:
@ -96,10 +84,10 @@ class ItemFilter:
:type feed_options: dict
"""
feed_options: Optional[Dict[str, Any]]
item_classes: Tuple[type, ...]
feed_options: Optional[dict[str, Any]]
item_classes: tuple[type, ...]
def __init__(self, feed_options: Optional[Dict[str, Any]]) -> None:
def __init__(self, feed_options: Optional[dict[str, Any]]) -> None:
self.feed_options = feed_options
if feed_options is not None:
self.item_classes = tuple(
@ -141,7 +129,7 @@ class IFeedStorage(Interface):
class FeedStorageProtocol(Protocol):
"""Reimplementation of ``IFeedStorage`` that can be used in type hints."""
def __init__(self, uri: str, *, feed_options: Optional[Dict[str, Any]] = None):
def __init__(self, uri: str, *, feed_options: Optional[dict[str, Any]] = None):
"""Initialize the storage with the parameters given in the URI and the
feed-specific options (see :setting:`FEEDS`)"""
@ -176,7 +164,7 @@ class StdoutFeedStorage:
uri: str,
_stdout: Optional[IO[bytes]] = None,
*,
feed_options: Optional[Dict[str, Any]] = None,
feed_options: Optional[dict[str, Any]] = None,
):
if not _stdout:
_stdout = sys.stdout.buffer
@ -198,7 +186,7 @@ class StdoutFeedStorage:
@implementer(IFeedStorage)
class FileFeedStorage:
def __init__(self, uri: str, *, feed_options: Optional[Dict[str, Any]] = None):
def __init__(self, uri: str, *, feed_options: Optional[dict[str, Any]] = None):
self.path: str = file_uri_to_path(uri)
feed_options = feed_options or {}
self.write_mode: OpenBinaryMode = (
@ -225,7 +213,7 @@ class S3FeedStorage(BlockingFeedStorage):
acl: Optional[str] = None,
endpoint_url: Optional[str] = None,
*,
feed_options: Optional[Dict[str, Any]] = None,
feed_options: Optional[dict[str, Any]] = None,
session_token: Optional[str] = None,
region_name: Optional[str] = None,
):
@ -291,7 +279,7 @@ class S3FeedStorage(BlockingFeedStorage):
crawler: Crawler,
uri: str,
*,
feed_options: Optional[Dict[str, Any]] = None,
feed_options: Optional[dict[str, Any]] = None,
) -> Self:
return build_storage(
cls,
@ -307,7 +295,7 @@ class S3FeedStorage(BlockingFeedStorage):
def _store_in_thread(self, file: IO[bytes]) -> None:
file.seek(0)
kwargs: Dict[str, Any]
kwargs: dict[str, Any]
if IS_BOTO3_AVAILABLE:
kwargs = {"ExtraArgs": {"ACL": self.acl}} if self.acl else {}
self.s3_client.upload_fileobj(
@ -354,7 +342,7 @@ class FTPFeedStorage(BlockingFeedStorage):
uri: str,
use_active_mode: bool = False,
*,
feed_options: Optional[Dict[str, Any]] = None,
feed_options: Optional[dict[str, Any]] = None,
):
u = urlparse(uri)
if not u.hostname:
@ -373,7 +361,7 @@ class FTPFeedStorage(BlockingFeedStorage):
crawler: Crawler,
uri: str,
*,
feed_options: Optional[Dict[str, Any]] = None,
feed_options: Optional[dict[str, Any]] = None,
) -> Self:
return build_storage(
cls,
@ -405,9 +393,9 @@ class FeedSlot:
batch_id: int,
uri_template: str,
filter: ItemFilter,
feed_options: Dict[str, Any],
feed_options: dict[str, Any],
spider: Spider,
exporters: Dict[str, Type[BaseItemExporter]],
exporters: dict[str, type[BaseItemExporter]],
settings: BaseSettings,
crawler: Crawler,
):
@ -422,9 +410,9 @@ class FeedSlot:
self.uri: str = uri
self.filter: ItemFilter = filter
# exporter params
self.feed_options: Dict[str, Any] = feed_options
self.feed_options: dict[str, Any] = feed_options
self.spider: Spider = spider
self.exporters: Dict[str, Type[BaseItemExporter]] = exporters
self.exporters: dict[str, type[BaseItemExporter]] = exporters
self.settings: BaseSettings = settings
self.crawler: Crawler = crawler
# flags
@ -460,7 +448,7 @@ class FeedSlot:
self._exporting = True
def _get_instance(
self, objcls: Type[BaseItemExporter], *args: Any, **kwargs: Any
self, objcls: type[BaseItemExporter], *args: Any, **kwargs: Any
) -> BaseItemExporter:
return build_from_crawler(objcls, self.crawler, *args, **kwargs)
@ -483,7 +471,7 @@ _FeedSlot = create_deprecated_class(
class FeedExporter:
_pending_deferreds: List[Deferred[None]] = []
_pending_deferreds: list[Deferred[None]] = []
@classmethod
def from_crawler(cls, crawler: Crawler) -> Self:
@ -497,8 +485,8 @@ class FeedExporter:
self.crawler: Crawler = crawler
self.settings: Settings = crawler.settings
self.feeds = {}
self.slots: List[FeedSlot] = []
self.filters: Dict[str, ItemFilter] = {}
self.slots: list[FeedSlot] = []
self.filters: dict[str, ItemFilter] = {}
if not self.settings["FEEDS"] and not self.settings["FEED_URI"]:
raise NotConfigured
@ -530,10 +518,10 @@ class FeedExporter:
)
self.filters[uri] = self._load_filter(feed_options)
self.storages: Dict[str, Type[FeedStorageProtocol]] = self._load_components(
self.storages: dict[str, type[FeedStorageProtocol]] = self._load_components(
"FEED_STORAGES"
)
self.exporters: Dict[str, Type[BaseItemExporter]] = self._load_components(
self.exporters: dict[str, type[BaseItemExporter]] = self._load_components(
"FEED_EXPORTERS"
)
for uri, feed_options in self.feeds.items():
@ -631,7 +619,7 @@ class FeedExporter:
self,
batch_id: int,
uri: str,
feed_options: Dict[str, Any],
feed_options: dict[str, Any],
spider: Spider,
uri_template: str,
) -> FeedSlot:
@ -696,9 +684,9 @@ class FeedExporter:
slots.append(slot)
self.slots = slots
def _load_components(self, setting_prefix: str) -> Dict[str, Any]:
def _load_components(self, setting_prefix: str) -> dict[str, Any]:
conf = without_none_values(
cast(Dict[str, str], self.settings.getwithbase(setting_prefix))
cast(dict[str, str], self.settings.getwithbase(setting_prefix))
)
d = {}
for k, v in conf.items():
@ -732,7 +720,7 @@ class FeedExporter:
return False
return True
def _storage_supported(self, uri: str, feed_options: Dict[str, Any]) -> bool:
def _storage_supported(self, uri: str, feed_options: dict[str, Any]) -> bool:
scheme = urlparse(uri).scheme
if scheme in self.storages or PureWindowsPath(uri).drive:
try:
@ -748,7 +736,7 @@ class FeedExporter:
return False
def _get_storage(
self, uri: str, feed_options: Dict[str, Any]
self, uri: str, feed_options: dict[str, Any]
) -> FeedStorageProtocol:
"""Fork of create_instance specific to feed storage classes
@ -759,7 +747,7 @@ class FeedExporter:
crawler = getattr(self, "crawler", None)
def build_instance(
builder: Type[FeedStorageProtocol], *preargs: Any
builder: type[FeedStorageProtocol], *preargs: Any
) -> FeedStorageProtocol:
return build_storage(
builder, uri, feed_options=feed_options, preargs=preargs
@ -784,7 +772,7 @@ class FeedExporter:
spider: Spider,
uri_params_function: Union[str, UriParamsCallableT, None],
slot: Optional[FeedSlot] = None,
) -> Dict[str, Any]:
) -> dict[str, Any]:
params = {}
for k in dir(spider):
params[k] = getattr(spider, k)
@ -800,9 +788,9 @@ class FeedExporter:
new_params = uripar_function(params, spider)
return new_params if new_params is not None else params
def _load_filter(self, feed_options: Dict[str, Any]) -> ItemFilter:
def _load_filter(self, feed_options: dict[str, Any]) -> ItemFilter:
# load the item filter if declared else load the default filter class
item_filter_class: Type[ItemFilter] = load_object(
item_filter_class: type[ItemFilter] = load_object(
feed_options.get("item_filter", ItemFilter)
)
return item_filter_class(feed_options)
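Aside (illustrative, not part of this commit): in the feed-export diff above, Callable is imported from collections.abc at module level rather than under TYPE_CHECKING because the module builds a type alias, UriParamsCallableT, that must exist at runtime, while names used only inside annotations can stay behind TYPE_CHECKING. A small sketch of that distinction, with invented names rather than Scrapy APIs:

from __future__ import annotations

from collections.abc import Callable  # needed at runtime to build the alias below
from typing import TYPE_CHECKING, Any, Optional

if TYPE_CHECKING:
    # Used only in annotations, so it can stay out of the import-time path.
    from collections.abc import Iterable

# Evaluated when the module is imported; with Callable behind TYPE_CHECKING this
# line would raise NameError.
ParamsHook = Callable[[dict[str, Any]], Optional[dict[str, Any]]]


def apply_hooks(params: dict[str, Any], hooks: Iterable[ParamsHook]) -> dict[str, Any]:
    for hook in hooks:
        result = hook(params)
        if result is not None:
            params = result
    return params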
@ -9,7 +9,7 @@ from importlib import import_module
from pathlib import Path
from time import time
from types import ModuleType
from typing import IO, TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union, cast
from typing import IO, TYPE_CHECKING, Any, Optional, Union, cast
from weakref import WeakKeyDictionary
from w3lib.http import headers_dict_to_raw, headers_raw_to_dict
@ -22,6 +22,8 @@ from scrapy.utils.python import to_bytes, to_unicode
from scrapy.utils.request import RequestFingerprinter
if TYPE_CHECKING:
from collections.abc import Callable
# typing.Concatenate requires Python 3.10
from typing_extensions import Concatenate
@ -35,8 +37,8 @@ logger = logging.getLogger(__name__)
class DummyPolicy:
def __init__(self, settings: BaseSettings):
self.ignore_schemes: List[str] = settings.getlist("HTTPCACHE_IGNORE_SCHEMES")
self.ignore_http_codes: List[int] = [
self.ignore_schemes: list[str] = settings.getlist("HTTPCACHE_IGNORE_SCHEMES")
self.ignore_http_codes: list[int] = [
int(x) for x in settings.getlist("HTTPCACHE_IGNORE_HTTP_CODES")
]
@ -62,18 +64,18 @@ class RFC2616Policy:
def __init__(self, settings: BaseSettings):
self.always_store: bool = settings.getbool("HTTPCACHE_ALWAYS_STORE")
self.ignore_schemes: List[str] = settings.getlist("HTTPCACHE_IGNORE_SCHEMES")
self.ignore_schemes: list[str] = settings.getlist("HTTPCACHE_IGNORE_SCHEMES")
self._cc_parsed: WeakKeyDictionary[
Union[Request, Response], Dict[bytes, Optional[bytes]]
Union[Request, Response], dict[bytes, Optional[bytes]]
] = WeakKeyDictionary()
self.ignore_response_cache_controls: List[bytes] = [
self.ignore_response_cache_controls: list[bytes] = [
to_bytes(cc)
for cc in settings.getlist("HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS")
]
def _parse_cachecontrol(
self, r: Union[Request, Response]
) -> Dict[bytes, Optional[bytes]]:
) -> dict[bytes, Optional[bytes]]:
if r not in self._cc_parsed:
cch = r.headers.get(b"Cache-Control", b"")
assert cch is not None
@ -189,7 +191,7 @@ class RFC2616Policy:
if b"ETag" in cachedresponse.headers:
request.headers[b"If-None-Match"] = cachedresponse.headers[b"ETag"]
def _get_max_age(self, cc: Dict[bytes, Optional[bytes]]) -> Optional[int]:
def _get_max_age(self, cc: dict[bytes, Optional[bytes]]) -> Optional[int]:
try:
return max(0, int(cc[b"max-age"])) # type: ignore[arg-type]
except (KeyError, ValueError):
@ -298,7 +300,7 @@ class DbmCacheStorage:
self.db[f"{key}_data"] = pickle.dumps(data, protocol=4)
self.db[f"{key}_time"] = str(time())
def _read_data(self, spider: Spider, request: Request) -> Optional[Dict[str, Any]]:
def _read_data(self, spider: Spider, request: Request) -> Optional[dict[str, Any]]:
key = self._fingerprinter.fingerprint(request).hex()
db = self.db
tkey = f"{key}_time"
@ -309,7 +311,7 @@ class DbmCacheStorage:
if 0 < self.expiration_secs < time() - float(ts):
return None # expired
return cast(Dict[str, Any], pickle.loads(db[f"{key}_data"])) # nosec
return cast(dict[str, Any], pickle.loads(db[f"{key}_data"])) # nosec
class FilesystemCacheStorage:
@ -385,7 +387,7 @@ class FilesystemCacheStorage:
key = self._fingerprinter.fingerprint(request).hex()
return str(Path(self.cachedir, spider.name, key[0:2], key))
def _read_meta(self, spider: Spider, request: Request) -> Optional[Dict[str, Any]]:
def _read_meta(self, spider: Spider, request: Request) -> Optional[dict[str, Any]]:
rpath = Path(self._get_request_path(spider, request))
metapath = rpath / "pickled_meta"
if not metapath.exists():
@ -394,10 +396,10 @@ class FilesystemCacheStorage:
if 0 < self.expiration_secs < time() - mtime:
return None # expired
with self._open(metapath, "rb") as f:
return cast(Dict[str, Any], pickle.load(f)) # nosec
return cast(dict[str, Any], pickle.load(f)) # nosec
def parse_cachecontrol(header: bytes) -> Dict[bytes, Optional[bytes]]:
def parse_cachecontrol(header: bytes) -> dict[bytes, Optional[bytes]]:
"""Parse Cache-Control header
https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9
@ -1,7 +1,7 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Optional, Tuple, Union
from typing import TYPE_CHECKING, Optional, Union
from twisted.internet import task
@ -81,7 +81,7 @@ class LogStats:
def calculate_final_stats(
self, spider: Spider
) -> Union[Tuple[None, None], Tuple[float, float]]:
) -> Union[tuple[None, None], tuple[float, float]]:
start_time = self.stats.get_value("start_time")
finished_time = self.stats.get_value("finished_time")
@ -11,7 +11,7 @@ import socket
import sys
from importlib import import_module
from pprint import pformat
from typing import TYPE_CHECKING, List
from typing import TYPE_CHECKING
from twisted.internet import task
@ -42,7 +42,7 @@ class MemoryUsage:
self.crawler: Crawler = crawler
self.warned: bool = False
self.notify_mails: List[str] = crawler.settings.getlist("MEMUSAGE_NOTIFY_MAIL")
self.notify_mails: list[str] = crawler.settings.getlist("MEMUSAGE_NOTIFY_MAIL")
self.limit: int = crawler.settings.getint("MEMUSAGE_LIMIT_MB") * 1024 * 1024
self.warning: int = crawler.settings.getint("MEMUSAGE_WARNING_MB") * 1024 * 1024
self.check_interval: float = crawler.settings.getfloat(
@ -66,7 +66,7 @@ class MemoryUsage:
def engine_started(self) -> None:
assert self.crawler.stats
self.crawler.stats.set_value("memusage/startup", self.get_virtual_size())
self.tasks: List[task.LoopingCall] = []
self.tasks: list[task.LoopingCall] = []
tsk = task.LoopingCall(self.update)
self.tasks.append(tsk)
tsk.start(self.check_interval, now=True)
@ -141,7 +141,7 @@ class MemoryUsage:
self.crawler.stats.set_value("memusage/warning_notified", 1)
self.warned = True
def _send_report(self, rcpts: List[str], subject: str) -> None:
def _send_report(self, rcpts: list[str], subject: str) -> None:
"""send notification mail with some additional useful info"""
assert self.crawler.engine
assert self.crawler.stats
@ -3,7 +3,7 @@ from __future__ import annotations
import logging
from datetime import datetime, timezone
from json import JSONEncoder
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from typing import TYPE_CHECKING, Any, Optional, Union
from twisted.internet import task
@ -29,8 +29,8 @@ class PeriodicLog:
self,
stats: StatsCollector,
interval: float = 60.0,
ext_stats: Dict[str, Any] = {},
ext_delta: Dict[str, Any] = {},
ext_stats: dict[str, Any] = {},
ext_delta: dict[str, Any] = {},
ext_timing_enabled: bool = False,
):
self.stats: StatsCollector = stats
@ -39,11 +39,11 @@ class PeriodicLog:
self.task: Optional[task.LoopingCall] = None
self.encoder: JSONEncoder = ScrapyJSONEncoder(sort_keys=True, indent=4)
self.ext_stats_enabled: bool = bool(ext_stats)
self.ext_stats_include: List[str] = ext_stats.get("include", [])
self.ext_stats_exclude: List[str] = ext_stats.get("exclude", [])
self.ext_stats_include: list[str] = ext_stats.get("include", [])
self.ext_stats_exclude: list[str] = ext_stats.get("exclude", [])
self.ext_delta_enabled: bool = bool(ext_delta)
self.ext_delta_include: List[str] = ext_delta.get("include", [])
self.ext_delta_exclude: List[str] = ext_delta.get("exclude", [])
self.ext_delta_include: list[str] = ext_delta.get("include", [])
self.ext_delta_exclude: list[str] = ext_delta.get("exclude", [])
self.ext_timing_enabled: bool = ext_timing_enabled
@classmethod
@ -52,7 +52,7 @@ class PeriodicLog:
if not interval:
raise NotConfigured
try:
ext_stats: Optional[Dict[str, Any]] = crawler.settings.getdict(
ext_stats: Optional[dict[str, Any]] = crawler.settings.getdict(
"PERIODIC_LOG_STATS"
)
except (TypeError, ValueError):
@ -62,7 +62,7 @@ class PeriodicLog:
else None
)
try:
ext_delta: Optional[Dict[str, Any]] = crawler.settings.getdict(
ext_delta: Optional[dict[str, Any]] = crawler.settings.getdict(
"PERIODIC_LOG_DELTA"
)
except (TypeError, ValueError):
@ -93,14 +93,14 @@ class PeriodicLog:
def spider_opened(self, spider: Spider) -> None:
self.time_prev: datetime = datetime.now(tz=timezone.utc)
self.delta_prev: Dict[str, Union[int, float]] = {}
self.stats_prev: Dict[str, Union[int, float]] = {}
self.delta_prev: dict[str, Union[int, float]] = {}
self.stats_prev: dict[str, Union[int, float]] = {}
self.task = task.LoopingCall(self.log)
self.task.start(self.interval)
def log(self) -> None:
data: Dict[str, Any] = {}
data: dict[str, Any] = {}
if self.ext_timing_enabled:
data.update(self.log_timing())
if self.ext_delta_enabled:
@ -109,8 +109,8 @@ class PeriodicLog:
data.update(self.log_crawler_stats())
logger.info(self.encoder.encode(data))
def log_delta(self) -> Dict[str, Any]:
num_stats: Dict[str, Union[int, float]] = {
def log_delta(self) -> dict[str, Any]:
num_stats: dict[str, Union[int, float]] = {
k: v
for k, v in self.stats._stats.items()
if isinstance(v, (int, float))
@ -120,7 +120,7 @@ class PeriodicLog:
self.delta_prev = num_stats
return {"delta": delta}
def log_timing(self) -> Dict[str, Any]:
def log_timing(self) -> dict[str, Any]:
now = datetime.now(tz=timezone.utc)
time = {
"log_interval": self.interval,
@ -132,7 +132,7 @@ class PeriodicLog:
self.time_prev = now
return {"time": time}
def log_crawler_stats(self) -> Dict[str, Any]:
def log_crawler_stats(self) -> dict[str, Any]:
stats = {
k: v
for k, v in self.stats._stats.items()
@ -141,7 +141,7 @@ class PeriodicLog:
return {"stats": stats}
def param_allowed(
self, stat_name: str, include: List[str], exclude: List[str]
self, stat_name: str, include: list[str], exclude: list[str]
) -> bool:
if not include and not exclude:
return True
@ -6,7 +6,7 @@ from bz2 import BZ2File
from gzip import GzipFile
from io import IOBase
from lzma import LZMAFile
from typing import IO, Any, BinaryIO, Dict, List, cast
from typing import IO, Any, BinaryIO, cast
from scrapy.utils.misc import load_object
@ -24,7 +24,7 @@ class GzipPlugin:
See :py:class:`gzip.GzipFile` for more info about parameters.
"""
def __init__(self, file: BinaryIO, feed_options: Dict[str, Any]) -> None:
def __init__(self, file: BinaryIO, feed_options: dict[str, Any]) -> None:
self.file = file
self.feed_options = feed_options
compress_level = self.feed_options.get("gzip_compresslevel", 9)
@ -56,7 +56,7 @@ class Bz2Plugin:
See :py:class:`bz2.BZ2File` for more info about parameters.
"""
def __init__(self, file: BinaryIO, feed_options: Dict[str, Any]) -> None:
def __init__(self, file: BinaryIO, feed_options: dict[str, Any]) -> None:
self.file = file
self.feed_options = feed_options
compress_level = self.feed_options.get("bz2_compresslevel", 9)
@ -88,7 +88,7 @@ class LZMAPlugin:
See :py:class:`lzma.LZMAFile` for more info about parameters.
"""
def __init__(self, file: BinaryIO, feed_options: Dict[str, Any]) -> None:
def __init__(self, file: BinaryIO, feed_options: dict[str, Any]) -> None:
self.file = file
self.feed_options = feed_options
@ -126,7 +126,7 @@ class PostProcessingManager(IOBase):
"""
def __init__(
self, plugins: List[Any], file: IO[bytes], feed_options: Dict[str, Any]
self, plugins: list[Any], file: IO[bytes], feed_options: dict[str, Any]
) -> None:
self.plugins = self._load_plugins(plugins)
self.file = file
@ -156,7 +156,7 @@ class PostProcessingManager(IOBase):
def writable(self) -> bool:
return True
def _load_plugins(self, plugins: List[Any]) -> List[Any]:
def _load_plugins(self, plugins: list[Any]) -> list[Any]:
plugins = [load_object(plugin) for plugin in plugins]
return plugins
@ -6,7 +6,7 @@ Use STATSMAILER_RCPTS setting to enable and give the recipient mail address
from __future__ import annotations
from typing import TYPE_CHECKING, List, Optional
from typing import TYPE_CHECKING, Optional
from scrapy import Spider, signals
from scrapy.exceptions import NotConfigured
@ -23,14 +23,14 @@ if TYPE_CHECKING:
class StatsMailer:
def __init__(self, stats: StatsCollector, recipients: List[str], mail: MailSender):
def __init__(self, stats: StatsCollector, recipients: list[str], mail: MailSender):
self.stats: StatsCollector = stats
self.recipients: List[str] = recipients
self.recipients: list[str] = recipients
self.mail: MailSender = mail
@classmethod
def from_crawler(cls, crawler: Crawler) -> Self:
recipients: List[str] = crawler.settings.getlist("STATSMAILER_RCPTS")
recipients: list[str] = crawler.settings.getlist("STATSMAILER_RCPTS")
if not recipients:
raise NotConfigured
mail: MailSender = MailSender.from_settings(crawler.settings)
@ -10,7 +10,7 @@ import binascii
import logging
import os
import pprint
from typing import TYPE_CHECKING, Any, Dict, List
from typing import TYPE_CHECKING, Any
from twisted.internet import protocol
from twisted.internet.tcp import Port
@ -45,7 +45,7 @@ class TelnetConsole(protocol.ServerFactory):
self.crawler: Crawler = crawler
self.noisy: bool = False
self.portrange: List[int] = [
self.portrange: list[int] = [
int(x) for x in crawler.settings.getlist("TELNETCONSOLE_PORT")
]
self.host: str = crawler.settings["TELNETCONSOLE_HOST"]
@ -98,10 +98,10 @@ class TelnetConsole(protocol.ServerFactory):
return telnet.TelnetTransport(telnet.AuthenticatingTelnetProtocol, Portal())
def _get_telnet_vars(self) -> Dict[str, Any]:
def _get_telnet_vars(self) -> dict[str, Any]:
# Note: if you add entries here also update topics/telnetconsole.rst
assert self.crawler.engine
telnet_vars: Dict[str, Any] = {
telnet_vars: dict[str, Any] = {
"engine": self.crawler.engine,
"spider": self.crawler.engine.spider,
"slot": self.crawler.engine.slot,
@ -1,7 +1,7 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Optional, Tuple
from typing import TYPE_CHECKING, Optional
from scrapy import Request, Spider, signals
from scrapy.exceptions import NotConfigured
@ -90,7 +90,7 @@ class AutoThrottle:
def _get_slot(
self, request: Request, spider: Spider
) -> Tuple[Optional[str], Optional[Slot]]:
) -> tuple[Optional[str], Optional[Slot]]:
key: Optional[str] = request.meta.get("download_slot")
if key is None:
return None, None
@ -5,22 +5,14 @@ import time
from http.cookiejar import Cookie
from http.cookiejar import CookieJar as _CookieJar
from http.cookiejar import CookiePolicy, DefaultCookiePolicy
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterator,
List,
Optional,
Sequence,
Tuple,
cast,
)
from typing import TYPE_CHECKING, Any, Optional, cast
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_unicode
if TYPE_CHECKING:
from collections.abc import Iterator, Sequence
# typing.Self requires Python 3.11
from typing_extensions import Self
@ -83,7 +75,7 @@ class CookieJar:
self.jar.clear_expired_cookies()
@property
def _cookies(self) -> Dict[str, Dict[str, Dict[str, Cookie]]]:
def _cookies(self) -> dict[str, dict[str, dict[str, Cookie]]]:
return self.jar._cookies # type: ignore[attr-defined,no-any-return]
def clear_session_cookies(self) -> None:
@ -118,7 +110,7 @@ class CookieJar:
self.jar.set_cookie_if_ok(cookie, WrappedRequest(request)) # type: ignore[arg-type]
def potential_domain_matches(domain: str) -> List[str]:
def potential_domain_matches(domain: str) -> list[str]:
"""Potential domain matches for a cookie
>>> potential_domain_matches('www.example.com')
@ -200,7 +192,7 @@ class WrappedRequest:
value = self.request.headers.get(name, default)
return to_unicode(value, errors="replace") if value is not None else None
def header_items(self) -> List[Tuple[str, List[str]]]:
def header_items(self) -> list[tuple[str, list[str]]]:
return [
(
to_unicode(k, errors="replace"),
@ -220,7 +212,7 @@ class WrappedResponse:
def info(self) -> Self:
return self
def get_all(self, name: str, default: Any = None) -> List[str]:
def get_all(self, name: str, default: Any = None) -> list[str]:
return [
to_unicode(v, errors="replace") for v in self.response.headers.getlist(name)
]
@ -1,18 +1,7 @@
from __future__ import annotations
from collections.abc import Mapping
from typing import (
TYPE_CHECKING,
Any,
AnyStr,
Dict,
Iterable,
List,
Optional,
Tuple,
Union,
cast,
)
from typing import TYPE_CHECKING, Any, AnyStr, Optional, Union, cast
from w3lib.http import headers_dict_to_raw
@ -20,6 +9,8 @@ from scrapy.utils.datatypes import CaseInsensitiveDict, CaselessDict
from scrapy.utils.python import to_unicode
if TYPE_CHECKING:
from collections.abc import Iterable
# typing.Self requires Python 3.11
from typing_extensions import Self
@ -34,17 +25,17 @@ class Headers(CaselessDict):
def __init__(
self,
seq: Union[Mapping[AnyStr, Any], Iterable[Tuple[AnyStr, Any]], None] = None,
seq: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
encoding: str = "utf-8",
):
self.encoding: str = encoding
super().__init__(seq)
def update( # type: ignore[override]
self, seq: Union[Mapping[AnyStr, Any], Iterable[Tuple[AnyStr, Any]]]
self, seq: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]]]
) -> None:
seq = seq.items() if isinstance(seq, Mapping) else seq
iseq: Dict[bytes, List[bytes]] = {}
iseq: dict[bytes, list[bytes]] = {}
for k, v in seq:
iseq.setdefault(self.normkey(k), []).extend(self.normvalue(v))
super().update(iseq)
@ -53,7 +44,7 @@ class Headers(CaselessDict):
"""Normalize key to bytes"""
return self._tobytes(key.title())
def normvalue(self, value: Union[_RawValueT, Iterable[_RawValueT]]) -> List[bytes]:
def normvalue(self, value: Union[_RawValueT, Iterable[_RawValueT]]) -> list[bytes]:
"""Normalize values to bytes"""
_value: Iterable[_RawValueT]
if value is None:
@ -78,19 +69,19 @@ class Headers(CaselessDict):
def __getitem__(self, key: AnyStr) -> Optional[bytes]:
try:
return cast(List[bytes], super().__getitem__(key))[-1]
return cast(list[bytes], super().__getitem__(key))[-1]
except IndexError:
return None
def get(self, key: AnyStr, def_val: Any = None) -> Optional[bytes]:
try:
return cast(List[bytes], super().get(key, def_val))[-1]
return cast(list[bytes], super().get(key, def_val))[-1]
except IndexError:
return None
def getlist(self, key: AnyStr, def_val: Any = None) -> List[bytes]:
def getlist(self, key: AnyStr, def_val: Any = None) -> list[bytes]:
try:
return cast(List[bytes], super().__getitem__(key))
return cast(list[bytes], super().__getitem__(key))
except KeyError:
if def_val is not None:
return self.normvalue(def_val)
@ -109,10 +100,10 @@ class Headers(CaselessDict):
lst.extend(self.normvalue(value))
self[key] = lst
def items(self) -> Iterable[Tuple[bytes, List[bytes]]]: # type: ignore[override]
def items(self) -> Iterable[tuple[bytes, list[bytes]]]: # type: ignore[override]
return ((k, self.getlist(k)) for k in self.keys())
def values(self) -> List[Optional[bytes]]: # type: ignore[override]
def values(self) -> list[Optional[bytes]]: # type: ignore[override]
return [
self[k] for k in self.keys() # pylint: disable=consider-using-dict-items
]
@ -12,14 +12,8 @@ from typing import (
TYPE_CHECKING,
Any,
AnyStr,
Dict,
Iterable,
List,
Mapping,
NoReturn,
Optional,
Tuple,
Type,
TypedDict,
TypeVar,
Union,
@ -36,7 +30,7 @@ from scrapy.utils.trackref import object_ref
from scrapy.utils.url import escape_ajax
if TYPE_CHECKING:
from collections.abc import Callable
from collections.abc import Callable, Iterable, Mapping
from twisted.python.failure import Failure
@ -57,7 +51,7 @@ class VerboseCookie(TypedDict):
secure: NotRequired[bool]
CookiesT = Union[Dict[str, str], List[VerboseCookie]]
CookiesT = Union[dict[str, str], list[VerboseCookie]]
RequestTypeVar = TypeVar("RequestTypeVar", bound="Request")
@ -92,7 +86,7 @@ class Request(object_ref):
executed by the Downloader, thus generating a :class:`Response`.
"""
attributes: Tuple[str, ...] = (
attributes: tuple[str, ...] = (
"url",
"callback",
"method",
@ -120,16 +114,16 @@ class Request(object_ref):
url: str,
callback: Optional[CallbackT] = None,
method: str = "GET",
headers: Union[Mapping[AnyStr, Any], Iterable[Tuple[AnyStr, Any]], None] = None,
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
body: Optional[Union[bytes, str]] = None,
cookies: Optional[CookiesT] = None,
meta: Optional[Dict[str, Any]] = None,
meta: Optional[dict[str, Any]] = None,
encoding: str = "utf-8",
priority: int = 0,
dont_filter: bool = False,
errback: Optional[Callable[[Failure], Any]] = None,
flags: Optional[List[str]] = None,
cb_kwargs: Optional[Dict[str, Any]] = None,
flags: Optional[list[str]] = None,
cb_kwargs: Optional[dict[str, Any]] = None,
) -> None:
self._encoding: str = encoding # this one has to be set first
self.method: str = str(method).upper()
@ -152,20 +146,20 @@ class Request(object_ref):
self.headers: Headers = Headers(headers or {}, encoding=encoding)
self.dont_filter: bool = dont_filter
self._meta: Optional[Dict[str, Any]] = dict(meta) if meta else None
self._cb_kwargs: Optional[Dict[str, Any]] = (
self._meta: Optional[dict[str, Any]] = dict(meta) if meta else None
self._cb_kwargs: Optional[dict[str, Any]] = (
dict(cb_kwargs) if cb_kwargs else None
)
self.flags: List[str] = [] if flags is None else list(flags)
self.flags: list[str] = [] if flags is None else list(flags)
@property
def cb_kwargs(self) -> Dict[str, Any]:
def cb_kwargs(self) -> dict[str, Any]:
if self._cb_kwargs is None:
self._cb_kwargs = {}
return self._cb_kwargs
@property
def meta(self) -> Dict[str, Any]:
def meta(self) -> dict[str, Any]:
if self._meta is None:
self._meta = {}
return self._meta
@ -207,14 +201,14 @@ class Request(object_ref):
@overload
def replace(
self, *args: Any, cls: Type[RequestTypeVar], **kwargs: Any
self, *args: Any, cls: type[RequestTypeVar], **kwargs: Any
) -> RequestTypeVar: ...
@overload
def replace(self, *args: Any, cls: None = None, **kwargs: Any) -> Self: ...
def replace(
self, *args: Any, cls: Optional[Type[Request]] = None, **kwargs: Any
self, *args: Any, cls: Optional[type[Request]] = None, **kwargs: Any
) -> Request:
"""Create a new Request with the same attributes except for those given new values"""
for x in self.attributes:
@ -261,7 +255,7 @@ class Request(object_ref):
request_kwargs.update(kwargs)
return cls(**request_kwargs)
def to_dict(self, *, spider: Optional[scrapy.Spider] = None) -> Dict[str, Any]:
def to_dict(self, *, spider: Optional[scrapy.Spider] = None) -> dict[str, Any]:
"""Return a dictionary containing the Request's data.
Use :func:`~scrapy.utils.request.request_from_dict` to convert back into a :class:`~scrapy.Request` object.
@ -7,17 +7,8 @@ See documentation in docs/topics/request-response.rst
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterable,
List,
Optional,
Tuple,
Union,
cast,
)
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any, Optional, Union, cast
from urllib.parse import urlencode, urljoin, urlsplit, urlunsplit
from lxml.html import FormElement # nosec
@ -31,6 +22,7 @@ from scrapy.http.request import Request
from scrapy.utils.python import is_listlike, to_bytes
if TYPE_CHECKING:
# typing.Self requires Python 3.11
from typing_extensions import Self
@ -38,8 +30,8 @@ if TYPE_CHECKING:
FormdataVType = Union[str, Iterable[str]]
FormdataKVType = Tuple[str, FormdataVType]
FormdataType = Optional[Union[Dict[str, FormdataVType], List[FormdataKVType]]]
FormdataKVType = tuple[str, FormdataVType]
FormdataType = Optional[Union[dict[str, FormdataVType], list[FormdataKVType]]]
class FormRequest(Request):
@ -74,7 +66,7 @@ class FormRequest(Request):
formid: Optional[str] = None,
formnumber: int = 0,
formdata: FormdataType = None,
clickdata: Optional[Dict[str, Union[str, int]]] = None,
clickdata: Optional[dict[str, Union[str, int]]] = None,
dont_click: bool = False,
formxpath: Optional[str] = None,
formcss: Optional[str] = None,
@ -168,8 +160,8 @@ def _get_inputs(
form: FormElement,
formdata: FormdataType,
dont_click: bool,
clickdata: Optional[Dict[str, Union[str, int]]],
) -> List[FormdataKVType]:
clickdata: Optional[dict[str, Union[str, int]]],
) -> list[FormdataKVType]:
"""Return a list of key-value pairs for the inputs found in the given form."""
try:
formdata_keys = dict(formdata or ()).keys()
@ -187,7 +179,7 @@ def _get_inputs(
' not(re:test(., "^(?:checkbox|radio)$", "i")))]]',
namespaces={"re": "http://exslt.org/regular-expressions"},
)
values: List[FormdataKVType] = [
values: list[FormdataKVType] = [
(k, "" if v is None else v)
for k, v in (_value(e) for e in inputs)
if k and k not in formdata_keys
@ -205,7 +197,7 @@ def _get_inputs(
def _value(
ele: Union[InputElement, SelectElement, TextareaElement]
) -> Tuple[Optional[str], Union[None, str, MultipleSelectOptions]]:
) -> tuple[Optional[str], Union[None, str, MultipleSelectOptions]]:
n = ele.name
v = ele.value
if ele.tag == "select":
@ -215,7 +207,7 @@ def _value(
def _select_value(
ele: SelectElement, n: Optional[str], v: Union[None, str, MultipleSelectOptions]
) -> Tuple[Optional[str], Union[None, str, MultipleSelectOptions]]:
) -> tuple[Optional[str], Union[None, str, MultipleSelectOptions]]:
multiple = ele.multiple
if v is None and not multiple:
# Match browser behaviour on simple select tag without options selected
@ -226,8 +218,8 @@ def _select_value(
def _get_clickable(
clickdata: Optional[Dict[str, Union[str, int]]], form: FormElement
) -> Optional[Tuple[str, str]]:
clickdata: Optional[dict[str, Union[str, int]]], form: FormElement
) -> Optional[tuple[str, str]]:
"""
Returns the clickable element specified in clickdata,
if the latter is given. If not, it returns the first
@ -10,7 +10,7 @@ from __future__ import annotations
import copy
import json
import warnings
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Type, overload
from typing import TYPE_CHECKING, Any, Optional, overload
from scrapy.http.request import Request, RequestTypeVar
@ -20,14 +20,14 @@ if TYPE_CHECKING:
class JsonRequest(Request):
attributes: Tuple[str, ...] = Request.attributes + ("dumps_kwargs",)
attributes: tuple[str, ...] = Request.attributes + ("dumps_kwargs",)
def __init__(
self, *args: Any, dumps_kwargs: Optional[Dict[str, Any]] = None, **kwargs: Any
self, *args: Any, dumps_kwargs: Optional[dict[str, Any]] = None, **kwargs: Any
) -> None:
dumps_kwargs = copy.deepcopy(dumps_kwargs) if dumps_kwargs is not None else {}
dumps_kwargs.setdefault("sort_keys", True)
self._dumps_kwargs: Dict[str, Any] = dumps_kwargs
self._dumps_kwargs: dict[str, Any] = dumps_kwargs
body_passed = kwargs.get("body", None) is not None
data: Any = kwargs.pop("data", None)
@ -47,19 +47,19 @@ class JsonRequest(Request):
)
@property
def dumps_kwargs(self) -> Dict[str, Any]:
def dumps_kwargs(self) -> dict[str, Any]:
return self._dumps_kwargs
@overload
def replace(
self, *args: Any, cls: Type[RequestTypeVar], **kwargs: Any
self, *args: Any, cls: type[RequestTypeVar], **kwargs: Any
) -> RequestTypeVar: ...
@overload
def replace(self, *args: Any, cls: None = None, **kwargs: Any) -> Self: ...
def replace(
self, *args: Any, cls: Optional[Type[Request]] = None, **kwargs: Any
self, *args: Any, cls: Optional[type[Request]] = None, **kwargs: Any
) -> Request:
body_passed = kwargs.get("body", None) is not None
data: Any = kwargs.pop("data", None)
@ -7,22 +7,7 @@ See documentation in docs/topics/request-response.rst
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
AnyStr,
Callable,
Dict,
Iterable,
List,
Mapping,
Optional,
Tuple,
Type,
TypeVar,
Union,
overload,
)
from typing import TYPE_CHECKING, Any, AnyStr, Optional, TypeVar, Union, overload
from urllib.parse import urljoin
from scrapy.exceptions import NotSupported
@ -32,6 +17,7 @@ from scrapy.link import Link
from scrapy.utils.trackref import object_ref
if TYPE_CHECKING:
from collections.abc import Callable, Iterable, Mapping
from ipaddress import IPv4Address, IPv6Address
from twisted.internet.ssl import Certificate
@ -52,7 +38,7 @@ class Response(object_ref):
downloaded (by the Downloader) and fed to the Spiders for processing.
"""
attributes: Tuple[str, ...] = (
attributes: tuple[str, ...] = (
"url",
"status",
"headers",
@ -74,9 +60,9 @@ class Response(object_ref):
self,
url: str,
status: int = 200,
headers: Union[Mapping[AnyStr, Any], Iterable[Tuple[AnyStr, Any]], None] = None,
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
body: bytes = b"",
flags: Optional[List[str]] = None,
flags: Optional[list[str]] = None,
request: Optional[Request] = None,
certificate: Optional[Certificate] = None,
ip_address: Union[IPv4Address, IPv6Address, None] = None,
@ -87,13 +73,13 @@ class Response(object_ref):
self._set_body(body)
self._set_url(url)
self.request: Optional[Request] = request
self.flags: List[str] = [] if flags is None else list(flags)
self.flags: list[str] = [] if flags is None else list(flags)
self.certificate: Optional[Certificate] = certificate
self.ip_address: Union[IPv4Address, IPv6Address, None] = ip_address
self.protocol: Optional[str] = protocol
@property
def cb_kwargs(self) -> Dict[str, Any]:
def cb_kwargs(self) -> dict[str, Any]:
try:
return self.request.cb_kwargs # type: ignore[union-attr]
except AttributeError:
@ -103,7 +89,7 @@ class Response(object_ref):
)
@property
def meta(self) -> Dict[str, Any]:
def meta(self) -> dict[str, Any]:
try:
return self.request.meta # type: ignore[union-attr]
except AttributeError:
@ -149,14 +135,14 @@ class Response(object_ref):
@overload
def replace(
self, *args: Any, cls: Type[ResponseTypeVar], **kwargs: Any
self, *args: Any, cls: type[ResponseTypeVar], **kwargs: Any
) -> ResponseTypeVar: ...
@overload
def replace(self, *args: Any, cls: None = None, **kwargs: Any) -> Self: ...
def replace(
self, *args: Any, cls: Optional[Type[Response]] = None, **kwargs: Any
self, *args: Any, cls: Optional[type[Response]] = None, **kwargs: Any
) -> Response:
"""Create a new Response with the same attributes except for those given new values"""
for x in self.attributes:
@ -200,16 +186,16 @@ class Response(object_ref):
url: Union[str, Link],
callback: Optional[CallbackT] = None,
method: str = "GET",
headers: Union[Mapping[AnyStr, Any], Iterable[Tuple[AnyStr, Any]], None] = None,
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
body: Optional[Union[bytes, str]] = None,
cookies: Optional[CookiesT] = None,
meta: Optional[Dict[str, Any]] = None,
meta: Optional[dict[str, Any]] = None,
encoding: Optional[str] = "utf-8",
priority: int = 0,
dont_filter: bool = False,
errback: Optional[Callable[[Failure], Any]] = None,
cb_kwargs: Optional[Dict[str, Any]] = None,
flags: Optional[List[str]] = None,
cb_kwargs: Optional[dict[str, Any]] = None,
flags: Optional[list[str]] = None,
) -> Request:
"""
Return a :class:`~.Request` instance to follow a link ``url``.
@ -253,16 +239,16 @@ class Response(object_ref):
urls: Iterable[Union[str, Link]],
callback: Optional[CallbackT] = None,
method: str = "GET",
headers: Union[Mapping[AnyStr, Any], Iterable[Tuple[AnyStr, Any]], None] = None,
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
body: Optional[Union[bytes, str]] = None,
cookies: Optional[CookiesT] = None,
meta: Optional[Dict[str, Any]] = None,
meta: Optional[dict[str, Any]] = None,
encoding: Optional[str] = "utf-8",
priority: int = 0,
dont_filter: bool = False,
errback: Optional[Callable[[Failure], Any]] = None,
cb_kwargs: Optional[Dict[str, Any]] = None,
flags: Optional[List[str]] = None,
cb_kwargs: Optional[dict[str, Any]] = None,
flags: Optional[list[str]] = None,
) -> Iterable[Request]:
"""
.. versionadded:: 2.0
@ -8,21 +8,9 @@ See documentation in docs/topics/request-response.rst
from __future__ import annotations
import json
from collections.abc import Iterable
from contextlib import suppress
from typing import (
TYPE_CHECKING,
Any,
AnyStr,
Callable,
Dict,
Iterable,
List,
Mapping,
Optional,
Tuple,
Union,
cast,
)
from typing import TYPE_CHECKING, Any, AnyStr, Optional, Union, cast
from urllib.parse import urljoin
import parsel
@ -41,6 +29,8 @@ from scrapy.utils.python import memoizemethod_noargs, to_unicode
from scrapy.utils.response import get_base_url
if TYPE_CHECKING:
from collections.abc import Callable, Mapping
from twisted.python.failure import Failure
from scrapy.http.request import CallbackT, CookiesT, Request
@ -54,7 +44,7 @@ class TextResponse(Response):
_DEFAULT_ENCODING = "ascii"
_cached_decoded_json = _NONE
attributes: Tuple[str, ...] = Response.attributes + ("encoding",)
attributes: tuple[str, ...] = Response.attributes + ("encoding",)
def __init__(self, *args: Any, **kwargs: Any):
self._encoding: Optional[str] = kwargs.pop("encoding", None)
@ -183,16 +173,16 @@ class TextResponse(Response):
url: Union[str, Link, parsel.Selector],
callback: Optional[CallbackT] = None,
method: str = "GET",
headers: Union[Mapping[AnyStr, Any], Iterable[Tuple[AnyStr, Any]], None] = None,
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
body: Optional[Union[bytes, str]] = None,
cookies: Optional[CookiesT] = None,
meta: Optional[Dict[str, Any]] = None,
meta: Optional[dict[str, Any]] = None,
encoding: Optional[str] = None,
priority: int = 0,
dont_filter: bool = False,
errback: Optional[Callable[[Failure], Any]] = None,
cb_kwargs: Optional[Dict[str, Any]] = None,
flags: Optional[List[str]] = None,
cb_kwargs: Optional[dict[str, Any]] = None,
flags: Optional[list[str]] = None,
) -> Request:
"""
Return a :class:`~.Request` instance to follow a link ``url``.
@ -236,16 +226,16 @@ class TextResponse(Response):
urls: Union[Iterable[Union[str, Link]], parsel.SelectorList, None] = None,
callback: Optional[CallbackT] = None,
method: str = "GET",
headers: Union[Mapping[AnyStr, Any], Iterable[Tuple[AnyStr, Any]], None] = None,
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
body: Optional[Union[bytes, str]] = None,
cookies: Optional[CookiesT] = None,
meta: Optional[Dict[str, Any]] = None,
meta: Optional[dict[str, Any]] = None,
encoding: Optional[str] = None,
priority: int = 0,
dont_filter: bool = False,
errback: Optional[Callable[[Failure], Any]] = None,
cb_kwargs: Optional[Dict[str, Any]] = None,
flags: Optional[List[str]] = None,
cb_kwargs: Optional[dict[str, Any]] = None,
flags: Optional[list[str]] = None,
css: Optional[str] = None,
xpath: Optional[str] = None,
) -> Iterable[Request]:

from __future__ import annotations
from abc import ABCMeta
from collections.abc import MutableMapping
from copy import deepcopy
from pprint import pformat
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterator,
KeysView,
MutableMapping,
NoReturn,
Tuple,
)
from typing import TYPE_CHECKING, Any, NoReturn
from scrapy.utils.trackref import object_ref
if TYPE_CHECKING:
from collections.abc import Iterator, KeysView
# typing.Self requires Python 3.11
from typing_extensions import Self
class Field(Dict[str, Any]):
class Field(dict[str, Any]):
"""Container of field metadata"""
@ -38,7 +32,7 @@ class ItemMeta(ABCMeta):
"""
def __new__(
mcs, class_name: str, bases: Tuple[type, ...], attrs: Dict[str, Any]
mcs, class_name: str, bases: tuple[type, ...], attrs: dict[str, Any]
) -> ItemMeta:
classcell = attrs.pop("__classcell__", None)
new_bases = tuple(base._class for base in bases if hasattr(base, "_class"))
@ -83,10 +77,10 @@ class Item(MutableMapping[str, Any], object_ref, metaclass=ItemMeta):
:ref:`tracked <topics-leaks-trackrefs>` to debug memory leaks.
"""
fields: Dict[str, Field]
fields: dict[str, Field]
def __init__(self, *args: Any, **kwargs: Any):
self._values: Dict[str, Any] = {}
self._values: dict[str, Any] = {}
if args or kwargs: # avoid creating dict for most common case
for k, v in dict(*args, **kwargs).items():
self[k] = v
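Aside (illustrative, not part of this commit): the Field change above — class Field(dict[str, Any]) instead of Field(Dict[str, Any]) — also depends on 3.9+, because subscripting the built-in dict in a base-class list raises TypeError on 3.8. A tiny sketch with a made-up class, not Scrapy code:

from typing import Any


class Metadata(dict[str, Any]):
    # Parametrizing a built-in type as a base class needs Python 3.9+ (PEP 585);
    # on 3.8 this line raises TypeError: 'type' object is not subscriptable.
    def serializer(self, default: Any = str) -> Any:
        return self.get("serializer", default)


meta = Metadata(serializer=int)
assert meta.serializer() is int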
@ -6,8 +6,13 @@ This package contains a collection of Link Extractors.
For more info see docs/topics/link-extractors.rst
"""
import re
from typing import Iterable, Pattern
from __future__ import annotations
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from collections.abc import Iterable
from re import Pattern
# common file extensions that are not followed if they occur in links
IGNORED_EXTENSIONS = [
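This hunk shows the pattern applied throughout the commit: `from __future__ import annotations` is added, runtime `typing` imports are trimmed, and names used only in annotations move under `TYPE_CHECKING`. A minimal sketch of the same pattern, with an illustrative helper that is not part of Scrapy:

from __future__ import annotations

import re
from typing import TYPE_CHECKING, Union

if TYPE_CHECKING:
    # Imported only for type checking; annotations stay lazy at runtime.
    from collections.abc import Iterable

_RegexT = Union[str, re.Pattern[str]]


def compile_patterns(values: Iterable[_RegexT]) -> list[re.Pattern[str]]:
    # Built-in generics such as list[...] require Python 3.9+, hence the dropped 3.8 support.
    return [v if isinstance(v, re.Pattern) else re.compile(v) for v in values]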

View File

@ -6,20 +6,10 @@ from __future__ import annotations
import logging
import operator
import re
from collections.abc import Callable, Iterable
from functools import partial
from typing import (
TYPE_CHECKING,
Any,
Callable,
Iterable,
List,
Optional,
Pattern,
Set,
Tuple,
Union,
cast,
)
from typing import TYPE_CHECKING, Any, Optional, Union, cast
from urllib.parse import urljoin, urlparse
from lxml import etree # nosec
@ -28,13 +18,14 @@ from w3lib.html import strip_html5_whitespace
from w3lib.url import canonicalize_url, safe_url_string
from scrapy.link import Link
from scrapy.linkextractors import IGNORED_EXTENSIONS, _is_valid_url, _matches, re
from scrapy.linkextractors import IGNORED_EXTENSIONS, _is_valid_url, _matches
from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
from scrapy.utils.python import unique as unique_list
from scrapy.utils.response import get_base_url
from scrapy.utils.url import url_has_any_extension, url_is_from_any_domain
if TYPE_CHECKING:
from lxml.html import HtmlElement # nosec
from scrapy import Selector
@ -98,7 +89,7 @@ class LxmlParserLinkExtractor:
def _iter_links(
self, document: HtmlElement
) -> Iterable[Tuple[HtmlElement, str, str]]:
) -> Iterable[tuple[HtmlElement, str, str]]:
for el in document.iter(etree.Element):
if not self.scan_tag(_nons(el.tag)):
continue
@ -114,8 +105,8 @@ class LxmlParserLinkExtractor:
response_url: str,
response_encoding: str,
base_url: str,
) -> List[Link]:
links: List[Link] = []
) -> list[Link]:
links: list[Link] = []
# hacky way to get the underlying lxml parsed document
for el, attr, attr_val in self._iter_links(selector.root):
# pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
@ -145,26 +136,26 @@ class LxmlParserLinkExtractor:
links.append(link)
return self._deduplicate_if_needed(links)
def extract_links(self, response: TextResponse) -> List[Link]:
def extract_links(self, response: TextResponse) -> list[Link]:
base_url = get_base_url(response)
return self._extract_links(
response.selector, response.url, response.encoding, base_url
)
def _process_links(self, links: List[Link]) -> List[Link]:
def _process_links(self, links: list[Link]) -> list[Link]:
"""Normalize and filter extracted links
The subclass should override it if necessary
"""
return self._deduplicate_if_needed(links)
def _deduplicate_if_needed(self, links: List[Link]) -> List[Link]:
def _deduplicate_if_needed(self, links: list[Link]) -> list[Link]:
if self.unique:
return unique_list(links, key=self.link_key)
return links
_RegexT = Union[str, Pattern[str]]
_RegexT = Union[str, re.Pattern[str]]
_RegexOrSeveralT = Union[_RegexT, Iterable[_RegexT]]
@ -197,13 +188,13 @@ class LxmlLinkExtractor:
strip=strip,
canonicalized=not canonicalize,
)
self.allow_res: List[Pattern[str]] = self._compile_regexes(allow)
self.deny_res: List[Pattern[str]] = self._compile_regexes(deny)
self.allow_res: list[re.Pattern[str]] = self._compile_regexes(allow)
self.deny_res: list[re.Pattern[str]] = self._compile_regexes(deny)
self.allow_domains: Set[str] = set(arg_to_iter(allow_domains))
self.deny_domains: Set[str] = set(arg_to_iter(deny_domains))
self.allow_domains: set[str] = set(arg_to_iter(allow_domains))
self.deny_domains: set[str] = set(arg_to_iter(deny_domains))
self.restrict_xpaths: Tuple[str, ...] = tuple(arg_to_iter(restrict_xpaths))
self.restrict_xpaths: tuple[str, ...] = tuple(arg_to_iter(restrict_xpaths))
self.restrict_xpaths += tuple(
map(self._csstranslator.css_to_xpath, arg_to_iter(restrict_css))
)
@ -211,11 +202,11 @@ class LxmlLinkExtractor:
if deny_extensions is None:
deny_extensions = IGNORED_EXTENSIONS
self.canonicalize: bool = canonicalize
self.deny_extensions: Set[str] = {"." + e for e in arg_to_iter(deny_extensions)}
self.restrict_text: List[Pattern[str]] = self._compile_regexes(restrict_text)
self.deny_extensions: set[str] = {"." + e for e in arg_to_iter(deny_extensions)}
self.restrict_text: list[re.Pattern[str]] = self._compile_regexes(restrict_text)
@staticmethod
def _compile_regexes(value: Optional[_RegexOrSeveralT]) -> List[Pattern[str]]:
def _compile_regexes(value: Optional[_RegexOrSeveralT]) -> list[re.Pattern[str]]:
return [
x if isinstance(x, re.Pattern) else re.compile(x)
for x in arg_to_iter(value)
@ -257,7 +248,7 @@ class LxmlLinkExtractor:
denied = (regex.search(url) for regex in self.deny_res) if self.deny_res else []
return any(allowed) and not any(denied)
def _process_links(self, links: List[Link]) -> List[Link]:
def _process_links(self, links: list[Link]) -> list[Link]:
links = [x for x in links if self._link_allowed(x)]
if self.canonicalize:
for link in links:
@ -265,10 +256,10 @@ class LxmlLinkExtractor:
links = self.link_extractor._process_links(links)
return links
def _extract_links(self, *args: Any, **kwargs: Any) -> List[Link]:
def _extract_links(self, *args: Any, **kwargs: Any) -> list[Link]:
return self.link_extractor._extract_links(*args, **kwargs)
def extract_links(self, response: TextResponse) -> List[Link]:
def extract_links(self, response: TextResponse) -> list[Link]:
"""Returns a list of :class:`~scrapy.link.Link` objects from the
specified :class:`response <scrapy.http.Response>`.
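`LinkExtractor` compiles `allow`/`deny` into `list[re.Pattern[str]]` as shown above; usage is unchanged. A sketch with illustrative patterns and selectors:

from scrapy.linkextractors import LinkExtractor

extractor = LinkExtractor(
    allow=r"/category/",      # kept only if an allow pattern matches
    deny=(r"\?sort=",),       # dropped if any deny pattern matches
    restrict_css=("div.listing",),
    unique=True,
)


def links_from(response):
    # extract_links() returns list[Link]; each Link exposes .url, .text and .nofollow.
    return [link.url for link in extractor.extract_links(response)]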

View File

@ -2,7 +2,7 @@ from __future__ import annotations
import logging
import os
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, TypedDict, Union
from typing import TYPE_CHECKING, Any, Optional, TypedDict, Union
from twisted.python.failure import Failure
@ -31,7 +31,7 @@ DOWNLOADERRORMSG_LONG = "Error downloading %(request)s: %(errmsg)s"
class LogFormatterResult(TypedDict):
level: int
msg: str
args: Union[Dict[str, Any], Tuple[Any, ...]]
args: Union[dict[str, Any], tuple[Any, ...]]
class LogFormatter:
@ -181,7 +181,7 @@ class LogFormatter:
.. versionadded:: 2.0
"""
args: Dict[str, Any] = {"request": request}
args: dict[str, Any] = {"request": request}
if errmsg:
msg = DOWNLOADERRORMSG_LONG
args["errmsg"] = errmsg

View File

@ -14,18 +14,7 @@ from email.mime.nonmultipart import MIMENonMultipart
from email.mime.text import MIMEText
from email.utils import formatdate
from io import BytesIO
from typing import (
IO,
TYPE_CHECKING,
Any,
Callable,
Dict,
List,
Optional,
Sequence,
Tuple,
Union,
)
from typing import IO, TYPE_CHECKING, Any, Optional, Union
from twisted import version as twisted_version
from twisted.internet import ssl
@ -36,6 +25,8 @@ from scrapy.utils.misc import arg_to_iter
from scrapy.utils.python import to_bytes
if TYPE_CHECKING:
from collections.abc import Callable, Sequence
# imports twisted.internet.reactor
from twisted.mail.smtp import ESMTPSenderFactory
from twisted.python.failure import Failure
@ -95,11 +86,11 @@ class MailSender:
def send(
self,
to: Union[str, List[str]],
to: Union[str, list[str]],
subject: str,
body: str,
cc: Union[str, List[str], None] = None,
attachs: Sequence[Tuple[str, str, IO[Any]]] = (),
cc: Union[str, list[str], None] = None,
attachs: Sequence[tuple[str, str, IO[Any]]] = (),
mimetype: str = "text/plain",
charset: Optional[str] = None,
_callback: Optional[Callable[..., None]] = None,
@ -164,7 +155,7 @@ class MailSender:
return dfd
def _sent_ok(
self, result: Any, to: List[str], cc: List[str], subject: str, nattachs: int
self, result: Any, to: list[str], cc: list[str], subject: str, nattachs: int
) -> None:
logger.info(
"Mail sent OK: To=%(mailto)s Cc=%(mailcc)s "
@ -180,8 +171,8 @@ class MailSender:
def _sent_failed(
self,
failure: Failure,
to: List[str],
cc: List[str],
to: list[str],
cc: list[str],
subject: str,
nattachs: int,
) -> Failure:
@ -200,7 +191,7 @@ class MailSender:
)
return failure
def _sendmail(self, to_addrs: List[str], msg: bytes) -> Deferred[Any]:
def _sendmail(self, to_addrs: list[str], msg: bytes) -> Deferred[Any]:
from twisted.internet import reactor
msg_io = BytesIO(msg)
@ -218,11 +209,11 @@ class MailSender:
return d
def _create_sender_factory(
self, to_addrs: List[str], msg: IO[bytes], d: Deferred[Any]
self, to_addrs: list[str], msg: IO[bytes], d: Deferred[Any]
) -> ESMTPSenderFactory:
from twisted.mail.smtp import ESMTPSenderFactory
factory_keywords: Dict[str, Any] = {
factory_keywords: dict[str, Any] = {
"heloFallback": True,
"requireAuthentication": False,
"requireTransportSecurity": self.smtptls,

View File

@ -3,26 +3,15 @@ from __future__ import annotations
import logging
import pprint
from collections import defaultdict, deque
from typing import (
TYPE_CHECKING,
Any,
Callable,
Deque,
Dict,
Iterable,
List,
Optional,
Tuple,
TypeVar,
Union,
cast,
)
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
from scrapy.exceptions import NotConfigured
from scrapy.utils.defer import process_chain, process_parallel
from scrapy.utils.misc import build_from_crawler, build_from_settings, load_object
if TYPE_CHECKING:
from collections.abc import Callable, Iterable
from twisted.internet.defer import Deferred
# typing.Concatenate and typing.ParamSpec require Python 3.10
@ -51,14 +40,14 @@ class MiddlewareManager:
self.middlewares = middlewares
# Only process_spider_output and process_spider_exception can be None.
# Only process_spider_output can be a tuple, and only until _async compatibility methods are removed.
self.methods: Dict[
str, Deque[Union[None, Callable, Tuple[Callable, Callable]]]
self.methods: dict[
str, deque[Union[None, Callable, tuple[Callable, Callable]]]
] = defaultdict(deque)
for mw in middlewares:
self._add_middleware(mw)
@classmethod
def _get_mwlist_from_settings(cls, settings: Settings) -> List[Any]:
def _get_mwlist_from_settings(cls, settings: Settings) -> list[Any]:
raise NotImplementedError
@classmethod
@ -107,7 +96,7 @@ class MiddlewareManager:
def _process_parallel(
self, methodname: str, obj: _T, *args: Any
) -> Deferred[List[_T2]]:
) -> Deferred[list[_T2]]:
methods = cast(
"Iterable[Callable[Concatenate[_T, _P], _T2]]", self.methods[methodname]
)
@ -119,8 +108,8 @@ class MiddlewareManager:
)
return process_chain(methods, obj, *args)
def open_spider(self, spider: Spider) -> Deferred[List[None]]:
def open_spider(self, spider: Spider) -> Deferred[list[None]]:
return self._process_parallel("open_spider", spider)
def close_spider(self, spider: Spider) -> Deferred[List[None]]:
def close_spider(self, spider: Spider) -> Deferred[list[None]]:
return self._process_parallel("close_spider", spider)

View File

@ -6,7 +6,7 @@ See documentation in docs/item-pipeline.rst
from __future__ import annotations
from typing import TYPE_CHECKING, Any, List
from typing import TYPE_CHECKING, Any
from scrapy.middleware import MiddlewareManager
from scrapy.utils.conf import build_component_list
@ -23,7 +23,7 @@ class ItemPipelineManager(MiddlewareManager):
component_name = "item pipeline"
@classmethod
def _get_mwlist_from_settings(cls, settings: Settings) -> List[Any]:
def _get_mwlist_from_settings(cls, settings: Settings) -> list[Any]:
return build_component_list(settings.getwithbase("ITEM_PIPELINES"))
def _add_middleware(self, pipe: Any) -> None:

View File

@ -21,15 +21,9 @@ from typing import (
IO,
TYPE_CHECKING,
Any,
Callable,
DefaultDict,
Dict,
List,
NoReturn,
Optional,
Protocol,
Set,
Type,
TypedDict,
Union,
cast,
@ -53,6 +47,7 @@ from scrapy.utils.python import to_bytes
from scrapy.utils.request import referer_str
if TYPE_CHECKING:
from collections.abc import Callable
from os import PathLike
from twisted.python.failure import Failure
@ -104,8 +99,8 @@ class FilesStoreProtocol(Protocol):
path: str,
buf: BytesIO,
info: MediaPipeline.SpiderInfo,
meta: Optional[Dict[str, Any]] = None,
headers: Optional[Dict[str, str]] = None,
meta: Optional[dict[str, Any]] = None,
headers: Optional[dict[str, str]] = None,
) -> Optional[Deferred[Any]]: ...
def stat_file(
@ -120,7 +115,7 @@ class FSFilesStore:
basedir = basedir.split("://", 1)[1]
self.basedir: str = basedir
self._mkdir(Path(self.basedir))
self.created_directories: DefaultDict[MediaPipeline.SpiderInfo, Set[str]] = (
self.created_directories: defaultdict[MediaPipeline.SpiderInfo, set[str]] = (
defaultdict(set)
)
@ -129,8 +124,8 @@ class FSFilesStore:
path: Union[str, PathLike[str]],
buf: BytesIO,
info: MediaPipeline.SpiderInfo,
meta: Optional[Dict[str, Any]] = None,
headers: Optional[Dict[str, str]] = None,
meta: Optional[dict[str, Any]] = None,
headers: Optional[dict[str, str]] = None,
) -> None:
absolute_path = self._get_filesystem_path(path)
self._mkdir(absolute_path.parent, info)
@ -157,7 +152,7 @@ class FSFilesStore:
def _mkdir(
self, dirname: Path, domain: Optional[MediaPipeline.SpiderInfo] = None
) -> None:
seen: Set[str] = self.created_directories[domain] if domain else set()
seen: set[str] = self.created_directories[domain] if domain else set()
if str(dirname) not in seen:
if not dirname.exists():
dirname.mkdir(parents=True)
@ -201,7 +196,7 @@ class S3FilesStore:
def stat_file(
self, path: str, info: MediaPipeline.SpiderInfo
) -> Deferred[StatInfo]:
def _onsuccess(boto_key: Dict[str, Any]) -> StatInfo:
def _onsuccess(boto_key: dict[str, Any]) -> StatInfo:
checksum = boto_key["ETag"].strip('"')
last_modified = boto_key["LastModified"]
modified_stamp = time.mktime(last_modified.timetuple())
@ -209,10 +204,10 @@ class S3FilesStore:
return self._get_boto_key(path).addCallback(_onsuccess)
def _get_boto_key(self, path: str) -> Deferred[Dict[str, Any]]:
def _get_boto_key(self, path: str) -> Deferred[dict[str, Any]]:
key_name = f"{self.prefix}{path}"
return cast(
"Deferred[Dict[str, Any]]",
"Deferred[dict[str, Any]]",
deferToThread(
self.s3_client.head_object, Bucket=self.bucket, Key=key_name # type: ignore[attr-defined]
),
@ -223,8 +218,8 @@ class S3FilesStore:
path: str,
buf: BytesIO,
info: MediaPipeline.SpiderInfo,
meta: Optional[Dict[str, Any]] = None,
headers: Optional[Dict[str, str]] = None,
meta: Optional[dict[str, Any]] = None,
headers: Optional[dict[str, str]] = None,
) -> Deferred[Any]:
"""Upload file to S3 storage"""
key_name = f"{self.prefix}{path}"
@ -242,7 +237,7 @@ class S3FilesStore:
**extra,
)
def _headers_to_botocore_kwargs(self, headers: Dict[str, Any]) -> Dict[str, Any]:
def _headers_to_botocore_kwargs(self, headers: dict[str, Any]) -> dict[str, Any]:
"""Convert headers to botocore keyword arguments."""
# This is required while we need to support both boto and botocore.
mapping = CaseInsensitiveDict(
@ -274,7 +269,7 @@ class S3FilesStore:
"X-Amz-Website-Redirect-Location": "WebsiteRedirectLocation",
}
)
extra: Dict[str, Any] = {}
extra: dict[str, Any] = {}
for key, value in headers.items():
try:
kwarg = mapping[key]
@ -332,7 +327,7 @@ class GCSFilesStore:
deferToThread(self.bucket.get_blob, blob_path).addCallback(_onsuccess),
)
def _get_content_type(self, headers: Optional[Dict[str, str]]) -> str:
def _get_content_type(self, headers: Optional[dict[str, str]]) -> str:
if headers and "Content-Type" in headers:
return headers["Content-Type"]
return "application/octet-stream"
@ -345,8 +340,8 @@ class GCSFilesStore:
path: str,
buf: BytesIO,
info: MediaPipeline.SpiderInfo,
meta: Optional[Dict[str, Any]] = None,
headers: Optional[Dict[str, str]] = None,
meta: Optional[dict[str, Any]] = None,
headers: Optional[dict[str, str]] = None,
) -> Deferred[Any]:
blob_path = self._get_blob_path(path)
blob = self.bucket.blob(blob_path)
@ -385,8 +380,8 @@ class FTPFilesStore:
path: str,
buf: BytesIO,
info: MediaPipeline.SpiderInfo,
meta: Optional[Dict[str, Any]] = None,
headers: Optional[Dict[str, str]] = None,
meta: Optional[dict[str, Any]] = None,
headers: Optional[dict[str, str]] = None,
) -> Deferred[Any]:
path = f"{self.basedir}/{path}"
return deferToThread(
@ -443,7 +438,7 @@ class FilesPipeline(MediaPipeline):
MEDIA_NAME: str = "file"
EXPIRES: int = 90
STORE_SCHEMES: Dict[str, Type[FilesStoreProtocol]] = {
STORE_SCHEMES: dict[str, type[FilesStoreProtocol]] = {
"": FSFilesStore,
"file": FSFilesStore,
"s3": S3FilesStore,
@ -457,7 +452,7 @@ class FilesPipeline(MediaPipeline):
self,
store_uri: Union[str, PathLike[str]],
download_func: Optional[Callable[[Request, Spider], Response]] = None,
settings: Union[Settings, Dict[str, Any], None] = None,
settings: Union[Settings, dict[str, Any], None] = None,
):
store_uri = _to_string(store_uri)
if not store_uri:
@ -486,7 +481,7 @@ class FilesPipeline(MediaPipeline):
@classmethod
def from_settings(cls, settings: Settings) -> Self:
s3store: Type[S3FilesStore] = cast(Type[S3FilesStore], cls.STORE_SCHEMES["s3"])
s3store: type[S3FilesStore] = cast(type[S3FilesStore], cls.STORE_SCHEMES["s3"])
s3store.AWS_ACCESS_KEY_ID = settings["AWS_ACCESS_KEY_ID"]
s3store.AWS_SECRET_ACCESS_KEY = settings["AWS_SECRET_ACCESS_KEY"]
s3store.AWS_SESSION_TOKEN = settings["AWS_SESSION_TOKEN"]
@ -496,14 +491,14 @@ class FilesPipeline(MediaPipeline):
s3store.AWS_VERIFY = settings["AWS_VERIFY"]
s3store.POLICY = settings["FILES_STORE_S3_ACL"]
gcs_store: Type[GCSFilesStore] = cast(
Type[GCSFilesStore], cls.STORE_SCHEMES["gs"]
gcs_store: type[GCSFilesStore] = cast(
type[GCSFilesStore], cls.STORE_SCHEMES["gs"]
)
gcs_store.GCS_PROJECT_ID = settings["GCS_PROJECT_ID"]
gcs_store.POLICY = settings["FILES_STORE_GCS_ACL"] or None
ftp_store: Type[FTPFilesStore] = cast(
Type[FTPFilesStore], cls.STORE_SCHEMES["ftp"]
ftp_store: type[FTPFilesStore] = cast(
type[FTPFilesStore], cls.STORE_SCHEMES["ftp"]
)
ftp_store.FTP_USERNAME = settings["FTP_USER"]
ftp_store.FTP_PASSWORD = settings["FTP_PASSWORD"]
@ -660,7 +655,7 @@ class FilesPipeline(MediaPipeline):
# Overridable Interface
def get_media_requests(
self, item: Any, info: MediaPipeline.SpiderInfo
) -> List[Request]:
) -> list[Request]:
urls = ItemAdapter(item).get(self.files_urls_field, [])
return [Request(u, callback=NO_CALLBACK) for u in urls]
@ -680,7 +675,7 @@ class FilesPipeline(MediaPipeline):
return checksum
def item_completed(
self, results: List[FileInfoOrError], item: Any, info: MediaPipeline.SpiderInfo
self, results: list[FileInfoOrError], item: Any, info: MediaPipeline.SpiderInfo
) -> Any:
with suppress(KeyError):
ItemAdapter(item)[self.files_result_field] = [x for ok, x in results if ok]
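`FilesPipeline` reads URLs from the `file_urls` item field and writes results to `files` by default. A settings.py-style sketch with illustrative paths:

# settings.py (illustrative values)
ITEM_PIPELINES = {
    "scrapy.pipelines.files.FilesPipeline": 1,
}
FILES_STORE = "/data/crawl-files"  # also accepts s3://, gs:// or ftp:// URIs per STORE_SCHEMES
FILES_EXPIRES = 90                 # days before a stored file is considered stale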

View File

@ -11,19 +11,7 @@ import hashlib
import warnings
from contextlib import suppress
from io import BytesIO
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
List,
Optional,
Tuple,
Type,
Union,
cast,
)
from typing import TYPE_CHECKING, Any, Optional, Union, cast
from itemadapter import ItemAdapter
@ -42,6 +30,7 @@ from scrapy.settings import Settings
from scrapy.utils.python import get_func_args, to_bytes
if TYPE_CHECKING:
from collections.abc import Callable, Iterable
from os import PathLike
from PIL import Image
@ -79,7 +68,7 @@ class ImagesPipeline(FilesPipeline):
MIN_WIDTH: int = 0
MIN_HEIGHT: int = 0
EXPIRES: int = 90
THUMBS: Dict[str, Tuple[int, int]] = {}
THUMBS: dict[str, tuple[int, int]] = {}
DEFAULT_IMAGES_URLS_FIELD = "image_urls"
DEFAULT_IMAGES_RESULT_FIELD = "images"
@ -87,7 +76,7 @@ class ImagesPipeline(FilesPipeline):
self,
store_uri: Union[str, PathLike[str]],
download_func: Optional[Callable[[Request, Spider], Response]] = None,
settings: Union[Settings, Dict[str, Any], None] = None,
settings: Union[Settings, dict[str, Any], None] = None,
):
try:
from PIL import Image
@ -127,7 +116,7 @@ class ImagesPipeline(FilesPipeline):
self.min_height: int = settings.getint(
resolve("IMAGES_MIN_HEIGHT"), self.MIN_HEIGHT
)
self.thumbs: Dict[str, Tuple[int, int]] = settings.get(
self.thumbs: dict[str, tuple[int, int]] = settings.get(
resolve("IMAGES_THUMBS"), self.THUMBS
)
@ -135,7 +124,7 @@ class ImagesPipeline(FilesPipeline):
@classmethod
def from_settings(cls, settings: Settings) -> Self:
s3store: Type[S3FilesStore] = cast(Type[S3FilesStore], cls.STORE_SCHEMES["s3"])
s3store: type[S3FilesStore] = cast(type[S3FilesStore], cls.STORE_SCHEMES["s3"])
s3store.AWS_ACCESS_KEY_ID = settings["AWS_ACCESS_KEY_ID"]
s3store.AWS_SECRET_ACCESS_KEY = settings["AWS_SECRET_ACCESS_KEY"]
s3store.AWS_SESSION_TOKEN = settings["AWS_SESSION_TOKEN"]
@ -145,14 +134,14 @@ class ImagesPipeline(FilesPipeline):
s3store.AWS_VERIFY = settings["AWS_VERIFY"]
s3store.POLICY = settings["IMAGES_STORE_S3_ACL"]
gcs_store: Type[GCSFilesStore] = cast(
Type[GCSFilesStore], cls.STORE_SCHEMES["gs"]
gcs_store: type[GCSFilesStore] = cast(
type[GCSFilesStore], cls.STORE_SCHEMES["gs"]
)
gcs_store.GCS_PROJECT_ID = settings["GCS_PROJECT_ID"]
gcs_store.POLICY = settings["IMAGES_STORE_GCS_ACL"] or None
ftp_store: Type[FTPFilesStore] = cast(
Type[FTPFilesStore], cls.STORE_SCHEMES["ftp"]
ftp_store: type[FTPFilesStore] = cast(
type[FTPFilesStore], cls.STORE_SCHEMES["ftp"]
)
ftp_store.FTP_USERNAME = settings["FTP_USER"]
ftp_store.FTP_PASSWORD = settings["FTP_PASSWORD"]
@ -202,7 +191,7 @@ class ImagesPipeline(FilesPipeline):
info: MediaPipeline.SpiderInfo,
*,
item: Any = None,
) -> Iterable[Tuple[str, Image.Image, BytesIO]]:
) -> Iterable[tuple[str, Image.Image, BytesIO]]:
path = self.file_path(request, response=response, info=info, item=item)
orig_image = self._Image.open(BytesIO(response.body))
@ -246,9 +235,9 @@ class ImagesPipeline(FilesPipeline):
def convert_image(
self,
image: Image.Image,
size: Optional[Tuple[int, int]] = None,
size: Optional[tuple[int, int]] = None,
response_body: Optional[BytesIO] = None,
) -> Tuple[Image.Image, BytesIO]:
) -> tuple[Image.Image, BytesIO]:
if response_body is None:
warnings.warn(
f"{self.__class__.__name__}.convert_image() method called in a deprecated way, "
@ -288,12 +277,12 @@ class ImagesPipeline(FilesPipeline):
def get_media_requests(
self, item: Any, info: MediaPipeline.SpiderInfo
) -> List[Request]:
) -> list[Request]:
urls = ItemAdapter(item).get(self.images_urls_field, [])
return [Request(u, callback=NO_CALLBACK) for u in urls]
def item_completed(
self, results: List[FileInfoOrError], item: Any, info: MediaPipeline.SpiderInfo
self, results: list[FileInfoOrError], item: Any, info: MediaPipeline.SpiderInfo
) -> Any:
with suppress(KeyError):
ItemAdapter(item)[self.images_result_field] = [x for ok, x in results if ok]
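`ImagesPipeline.THUMBS` is now typed `dict[str, tuple[int, int]]`; configuration is unchanged. An illustrative settings.py sketch:

# settings.py (illustrative values)
ITEM_PIPELINES = {
    "scrapy.pipelines.images.ImagesPipeline": 1,
}
IMAGES_STORE = "/data/crawl-images"
IMAGES_THUMBS = {          # name -> (width, height), matching the dict[str, tuple[int, int]] type above
    "small": (50, 50),
    "big": (270, 270),
}
IMAGES_MIN_WIDTH = 110     # skip images smaller than these bounds
IMAGES_MIN_HEIGHT = 110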

View File

@ -7,15 +7,9 @@ from collections import defaultdict
from typing import (
TYPE_CHECKING,
Any,
Callable,
DefaultDict,
Dict,
List,
Literal,
NoReturn,
Optional,
Set,
Tuple,
TypedDict,
TypeVar,
Union,
@ -33,6 +27,8 @@ from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import arg_to_iter
if TYPE_CHECKING:
from collections.abc import Callable
# typing.Self requires Python 3.11
from typing_extensions import Self
@ -52,7 +48,7 @@ class FileInfo(TypedDict):
status: str
FileInfoOrError = Union[Tuple[Literal[True], FileInfo], Tuple[Literal[False], Failure]]
FileInfoOrError = Union[tuple[Literal[True], FileInfo], tuple[Literal[False], Failure]]
logger = logging.getLogger(__name__)
@ -67,16 +63,16 @@ class MediaPipeline(ABC):
class SpiderInfo:
def __init__(self, spider: Spider):
self.spider: Spider = spider
self.downloading: Set[bytes] = set()
self.downloaded: Dict[bytes, Union[FileInfo, Failure]] = {}
self.waiting: DefaultDict[bytes, List[Deferred[FileInfo]]] = defaultdict(
self.downloading: set[bytes] = set()
self.downloaded: dict[bytes, Union[FileInfo, Failure]] = {}
self.waiting: defaultdict[bytes, list[Deferred[FileInfo]]] = defaultdict(
list
)
def __init__(
self,
download_func: Optional[Callable[[Request, Spider], Response]] = None,
settings: Union[Settings, Dict[str, Any], None] = None,
settings: Union[Settings, dict[str, Any], None] = None,
):
self.download_func = download_func
@ -129,12 +125,12 @@ class MediaPipeline(ABC):
def process_item(
self, item: Any, spider: Spider
) -> Deferred[List[FileInfoOrError]]:
) -> Deferred[list[FileInfoOrError]]:
info = self.spiderinfo
requests = arg_to_iter(self.get_media_requests(item, info))
dlist = [self._process_request(r, info, item) for r in requests]
dfd = cast(
"Deferred[List[FileInfoOrError]]", DeferredList(dlist, consumeErrors=True)
"Deferred[list[FileInfoOrError]]", DeferredList(dlist, consumeErrors=True)
)
return dfd.addCallback(self.item_completed, item, info)
@ -252,7 +248,7 @@ class MediaPipeline(ABC):
raise NotImplementedError()
@abstractmethod
def get_media_requests(self, item: Any, info: SpiderInfo) -> List[Request]:
def get_media_requests(self, item: Any, info: SpiderInfo) -> list[Request]:
"""Returns the media requests to download"""
raise NotImplementedError()
@ -276,7 +272,7 @@ class MediaPipeline(ABC):
raise NotImplementedError()
def item_completed(
self, results: List[FileInfoOrError], item: Any, info: SpiderInfo
self, results: list[FileInfoOrError], item: Any, info: SpiderInfo
) -> Any:
"""Called per item when all media requests has been processed"""
if self.LOG_FAILED_RESULTS:
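`get_media_requests()` and `item_completed()` remain the overridable hooks, now annotated with `list[...]`. A sketch of a `FilesPipeline` subclass using hypothetical `pdf_urls`/`pdfs` item fields:

from itemadapter import ItemAdapter
from scrapy import Request
from scrapy.http.request import NO_CALLBACK
from scrapy.pipelines.files import FilesPipeline


class PdfPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # One Request per URL in the (illustrative) "pdf_urls" field.
        for url in ItemAdapter(item).get("pdf_urls", []):
            yield Request(url, callback=NO_CALLBACK)

    def item_completed(self, results, item, info):
        # results is list[FileInfoOrError]: (ok, FileInfo | Failure) tuples.
        ItemAdapter(item)["pdfs"] = [file_info for ok, file_info in results if ok]
        return item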

View File

@ -2,23 +2,15 @@ from __future__ import annotations
import hashlib
import logging
from typing import (
TYPE_CHECKING,
Dict,
Iterable,
List,
Optional,
Protocol,
Tuple,
Type,
cast,
)
from typing import TYPE_CHECKING, Optional, Protocol, cast
from scrapy import Request
from scrapy.core.downloader import Downloader
from scrapy.utils.misc import build_from_crawler
if TYPE_CHECKING:
from collections.abc import Iterable
# typing.Self requires Python 3.11
from typing_extensions import Self
@ -87,7 +79,7 @@ class ScrapyPriorityQueue:
def from_crawler(
cls,
crawler: Crawler,
downstream_queue_cls: Type[QueueProtocol],
downstream_queue_cls: type[QueueProtocol],
key: str,
startprios: Iterable[int] = (),
) -> Self:
@ -96,14 +88,14 @@ class ScrapyPriorityQueue:
def __init__(
self,
crawler: Crawler,
downstream_queue_cls: Type[QueueProtocol],
downstream_queue_cls: type[QueueProtocol],
key: str,
startprios: Iterable[int] = (),
):
self.crawler: Crawler = crawler
self.downstream_queue_cls: Type[QueueProtocol] = downstream_queue_cls
self.downstream_queue_cls: type[QueueProtocol] = downstream_queue_cls
self.key: str = key
self.queues: Dict[int, QueueProtocol] = {}
self.queues: dict[int, QueueProtocol] = {}
self.curprio: Optional[int] = None
self.init_prios(startprios)
@ -160,8 +152,8 @@ class ScrapyPriorityQueue:
# Protocols can't declare optional members
return cast(Request, queue.peek()) # type: ignore[attr-defined]
def close(self) -> List[int]:
active: List[int] = []
def close(self) -> list[int]:
active: list[int] = []
for p, q in self.queues.items():
active.append(p)
q.close()
@ -176,7 +168,7 @@ class DownloaderInterface:
assert crawler.engine
self.downloader: Downloader = crawler.engine.downloader
def stats(self, possible_slots: Iterable[str]) -> List[Tuple[int, str]]:
def stats(self, possible_slots: Iterable[str]) -> list[tuple[int, str]]:
return [(self._active_downloads(slot), slot) for slot in possible_slots]
def get_slot_key(self, request: Request) -> str:
@ -199,18 +191,18 @@ class DownloaderAwarePriorityQueue:
def from_crawler(
cls,
crawler: Crawler,
downstream_queue_cls: Type[QueueProtocol],
downstream_queue_cls: type[QueueProtocol],
key: str,
startprios: Optional[Dict[str, Iterable[int]]] = None,
startprios: Optional[dict[str, Iterable[int]]] = None,
) -> Self:
return cls(crawler, downstream_queue_cls, key, startprios)
def __init__(
self,
crawler: Crawler,
downstream_queue_cls: Type[QueueProtocol],
downstream_queue_cls: type[QueueProtocol],
key: str,
slot_startprios: Optional[Dict[str, Iterable[int]]] = None,
slot_startprios: Optional[dict[str, Iterable[int]]] = None,
):
if crawler.settings.getint("CONCURRENT_REQUESTS_PER_IP") != 0:
raise ValueError(
@ -229,11 +221,11 @@ class DownloaderAwarePriorityQueue:
)
self._downloader_interface: DownloaderInterface = DownloaderInterface(crawler)
self.downstream_queue_cls: Type[QueueProtocol] = downstream_queue_cls
self.downstream_queue_cls: type[QueueProtocol] = downstream_queue_cls
self.key: str = key
self.crawler: Crawler = crawler
self.pqueues: Dict[str, ScrapyPriorityQueue] = {} # slot -> priority queue
self.pqueues: dict[str, ScrapyPriorityQueue] = {} # slot -> priority queue
for slot, startprios in (slot_startprios or {}).items():
self.pqueues[slot] = self.pqfactory(slot, startprios)
@ -281,7 +273,7 @@ class DownloaderAwarePriorityQueue:
queue = self.pqueues[slot]
return queue.peek()
def close(self) -> Dict[str, List[int]]:
def close(self) -> dict[str, list[int]]:
active = {slot: queue.close() for slot, queue in self.pqueues.items()}
self.pqueues.clear()
return active
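`DownloaderAwarePriorityQueue` keeps one `ScrapyPriorityQueue` per downloader slot and, as the constructor above enforces, cannot be combined with per-IP concurrency. It is selected through a setting:

# settings.py
SCHEDULER_PRIORITY_QUEUE = "scrapy.pqueues.DownloaderAwarePriorityQueue"
CONCURRENT_REQUESTS_PER_IP = 0  # must stay 0 with this queue, per the check in __init__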

View File

@ -1,6 +1,6 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Type
from typing import TYPE_CHECKING, Any, Optional
from twisted.internet import defer
from twisted.internet.base import ReactorBase, ThreadedResolver
@ -16,6 +16,8 @@ from zope.interface.declarations import implementer, provider
from scrapy.utils.datatypes import LocalCache
if TYPE_CHECKING:
from collections.abc import Sequence
from twisted.internet.defer import Deferred
# typing.Self requires Python 3.11
@ -82,7 +84,7 @@ class _CachingResolutionReceiver:
def __init__(self, resolutionReceiver: IResolutionReceiver, hostName: str):
self.resolutionReceiver: IResolutionReceiver = resolutionReceiver
self.hostName: str = hostName
self.addresses: List[IAddress] = []
self.addresses: list[IAddress] = []
def resolutionBegan(self, resolution: IHostResolution) -> None:
self.resolutionReceiver.resolutionBegan(resolution)
@ -126,7 +128,7 @@ class CachingHostnameResolver:
resolutionReceiver: IResolutionReceiver,
hostName: str,
portNumber: int = 0,
addressTypes: Optional[Sequence[Type[IAddress]]] = None,
addressTypes: Optional[Sequence[type[IAddress]]] = None,
transportSemantics: str = "TCP",
) -> IHostResolution:
try:

View File

@ -3,15 +3,20 @@ This module implements a class which returns the appropriate Response class
based on different criteria.
"""
from __future__ import annotations
from io import StringIO
from mimetypes import MimeTypes
from pkgutil import get_data
from typing import Dict, Mapping, Optional, Type, Union
from typing import TYPE_CHECKING, Optional, Union
from scrapy.http import Response
from scrapy.utils.misc import load_object
from scrapy.utils.python import binary_is_text, to_bytes, to_unicode
if TYPE_CHECKING:
from collections.abc import Mapping
class ResponseTypes:
CLASSES = {
@ -32,7 +37,7 @@ class ResponseTypes:
}
def __init__(self) -> None:
self.classes: Dict[str, Type[Response]] = {}
self.classes: dict[str, type[Response]] = {}
self.mimetypes: MimeTypes = MimeTypes()
mimedata = get_data("scrapy", "mime.types")
if not mimedata:
@ -43,7 +48,7 @@ class ResponseTypes:
for mimetype, cls in self.CLASSES.items():
self.classes[mimetype] = load_object(cls)
def from_mimetype(self, mimetype: str) -> Type[Response]:
def from_mimetype(self, mimetype: str) -> type[Response]:
"""Return the most appropriate Response class for the given mimetype"""
if mimetype is None:
return Response
@ -54,7 +59,7 @@ class ResponseTypes:
def from_content_type(
self, content_type: Union[str, bytes], content_encoding: Optional[bytes] = None
) -> Type[Response]:
) -> type[Response]:
"""Return the most appropriate Response class from an HTTP Content-Type
header"""
if content_encoding:
@ -66,7 +71,7 @@ class ResponseTypes:
def from_content_disposition(
self, content_disposition: Union[str, bytes]
) -> Type[Response]:
) -> type[Response]:
try:
filename = (
to_unicode(content_disposition, encoding="latin-1", errors="replace")
@ -78,7 +83,7 @@ class ResponseTypes:
except IndexError:
return Response
def from_headers(self, headers: Mapping[bytes, bytes]) -> Type[Response]:
def from_headers(self, headers: Mapping[bytes, bytes]) -> type[Response]:
"""Return the most appropriate Response class by looking at the HTTP
headers"""
cls = Response
@ -91,14 +96,14 @@ class ResponseTypes:
cls = self.from_content_disposition(headers[b"Content-Disposition"])
return cls
def from_filename(self, filename: str) -> Type[Response]:
def from_filename(self, filename: str) -> type[Response]:
"""Return the most appropriate Response class from a file name"""
mimetype, encoding = self.mimetypes.guess_type(filename)
if mimetype and not encoding:
return self.from_mimetype(mimetype)
return Response
def from_body(self, body: bytes) -> Type[Response]:
def from_body(self, body: bytes) -> type[Response]:
"""Try to guess the appropriate response based on the body content.
This method is a bit magic and could be improved in the future, but
it's not meant to be used except for special cases where response types
@ -122,7 +127,7 @@ class ResponseTypes:
url: Optional[str] = None,
filename: Optional[str] = None,
body: Optional[bytes] = None,
) -> Type[Response]:
) -> type[Response]:
"""Guess the most appropriate Response class based on
the given arguments."""
cls = Response
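The class-guessing helpers now return `type[Response]`. A small sketch using the module-level `responsetypes` singleton; the exact class returned depends on the built-in CLASSES mapping:

from scrapy.responsetypes import responsetypes

print(responsetypes.from_mimetype("text/html"))                            # HtmlResponse
print(responsetypes.from_content_type("application/xml; charset=utf-8"))   # XmlResponse
print(responsetypes.from_filename("report.pdf"))                           # Response (binary fallback)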

View File

@ -2,7 +2,7 @@
XPath selectors based on lxml
"""
from typing import Any, Optional, Type, Union
from typing import Any, Optional, Union
from parsel import Selector as _ParselSelector
@ -23,7 +23,7 @@ def _st(response: Optional[TextResponse], st: Optional[str]) -> str:
def _response_from_text(text: Union[str, bytes], st: Optional[str]) -> TextResponse:
rt: Type[TextResponse] = XmlResponse if st == "xml" else HtmlResponse
rt: type[TextResponse] = XmlResponse if st == "xml" else HtmlResponse
return rt(url="about:blank", encoding="utf-8", body=to_bytes(text, "utf-8"))
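`_response_from_text()` wraps bare text in an `HtmlResponse`, or an `XmlResponse` when `type="xml"` is requested; this is what makes standalone selectors work. A short sketch:

from scrapy.selector import Selector

sel = Selector(text="<html><body><a href='/next'>next</a></body></html>")
print(sel.css("a::attr(href)").get())  # '/next'

xml = Selector(text="<root><item>1</item></root>", type="xml")
print(xml.xpath("//item/text()").get())  # '1'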

View File

@ -2,22 +2,10 @@ from __future__ import annotations
import copy
import json
from collections.abc import Iterable, Iterator, Mapping, MutableMapping
from importlib import import_module
from pprint import pformat
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterable,
Iterator,
List,
Mapping,
MutableMapping,
Optional,
Tuple,
Union,
cast,
)
from typing import TYPE_CHECKING, Any, Optional, Union, cast
from scrapy.settings import default_settings
@ -37,7 +25,7 @@ if TYPE_CHECKING:
_SettingsInputT = Union[SupportsItems[_SettingsKeyT, Any], str, None]
SETTINGS_PRIORITIES: Dict[str, int] = {
SETTINGS_PRIORITIES: dict[str, int] = {
"default": 0,
"command": 10,
"addon": 15,
@ -192,8 +180,8 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
return float(self.get(name, default))
def getlist(
self, name: _SettingsKeyT, default: Optional[List[Any]] = None
) -> List[Any]:
self, name: _SettingsKeyT, default: Optional[list[Any]] = None
) -> list[Any]:
"""
Get a setting value as a list. If the setting original type is a list, a
copy of it will be returned. If it's a string it will be split by ",".
@ -213,8 +201,8 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
return list(value)
def getdict(
self, name: _SettingsKeyT, default: Optional[Dict[Any, Any]] = None
) -> Dict[Any, Any]:
self, name: _SettingsKeyT, default: Optional[dict[Any, Any]] = None
) -> dict[Any, Any]:
"""
Get a setting value as a dictionary. If the setting original type is a
dictionary, a copy of it will be returned. If it is a string it will be
@ -238,8 +226,8 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
def getdictorlist(
self,
name: _SettingsKeyT,
default: Union[Dict[Any, Any], List[Any], Tuple[Any], None] = None,
) -> Union[Dict[Any, Any], List[Any]]:
default: Union[dict[Any, Any], list[Any], tuple[Any], None] = None,
) -> Union[dict[Any, Any], list[Any]]:
"""Get a setting value as either a :class:`dict` or a :class:`list`.
If the setting is already a dict or a list, a copy of it will be
@ -412,7 +400,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
"""
self._assert_mutability()
if isinstance(values, str):
values = cast(Dict[_SettingsKeyT, Any], json.loads(values))
values = cast(dict[_SettingsKeyT, Any], json.loads(values))
if values is not None:
if isinstance(values, BaseSettings):
for name, value in values.items():
@ -477,7 +465,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
def __len__(self) -> int:
return len(self.attributes)
def _to_dict(self) -> Dict[_SettingsKeyT, Any]:
def _to_dict(self) -> dict[_SettingsKeyT, Any]:
return {
self._get_key(k): (v._to_dict() if isinstance(v, BaseSettings) else v)
for k, v in self.items()
@ -490,7 +478,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
else str(key_value)
)
def copy_to_dict(self) -> Dict[_SettingsKeyT, Any]:
def copy_to_dict(self) -> dict[_SettingsKeyT, Any]:
"""
Make a copy of current settings and convert to a dict.
@ -553,7 +541,7 @@ class Settings(BaseSettings):
self.update(values, priority)
def iter_default_settings() -> Iterable[Tuple[str, Any]]:
def iter_default_settings() -> Iterable[tuple[str, Any]]:
"""Return the default settings as an iterator of (name, value) tuples"""
for name in dir(default_settings):
if name.isupper():
@ -562,7 +550,7 @@ def iter_default_settings() -> Iterable[Tuple[str, Any]]:
def overridden_settings(
settings: Mapping[_SettingsKeyT, Any]
) -> Iterable[Tuple[str, Any]]:
) -> Iterable[tuple[str, Any]]:
"""Return an iterable of the settings that have been overridden"""
for name, defvalue in iter_default_settings():
value = settings[name]
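`getlist()`, `getdict()` and `getdictorlist()` keep their parsing behavior with built-in generic return types. A short sketch; the setting names are illustrative:

from scrapy.settings import Settings

settings = Settings()
settings.set("MY_LIST", "a,b,c")      # comma-separated strings are split
settings.set("MY_DICT", '{"k": 1}')   # strings are parsed as JSON
settings.set("MY_MIXED", '["x", "y"]')

print(settings.getlist("MY_LIST"))         # ['a', 'b', 'c']
print(settings.getdict("MY_DICT"))         # {'k': 1}
print(settings.getdictorlist("MY_MIXED"))  # ['x', 'y']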

View File

@ -8,7 +8,7 @@ from __future__ import annotations
import os
import signal
from typing import Any, Callable, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Optional, Union
from itemadapter import is_item
from twisted.internet import defer, threads
@ -27,25 +27,28 @@ from scrapy.utils.misc import load_object
from scrapy.utils.reactor import is_asyncio_reactor_installed, set_asyncio_event_loop
from scrapy.utils.response import open_in_browser
if TYPE_CHECKING:
from collections.abc import Callable
class Shell:
relevant_classes: Tuple[type, ...] = (Crawler, Spider, Request, Response, Settings)
relevant_classes: tuple[type, ...] = (Crawler, Spider, Request, Response, Settings)
def __init__(
self,
crawler: Crawler,
update_vars: Optional[Callable[[Dict[str, Any]], None]] = None,
update_vars: Optional[Callable[[dict[str, Any]], None]] = None,
code: Optional[str] = None,
):
self.crawler: Crawler = crawler
self.update_vars: Callable[[Dict[str, Any]], None] = update_vars or (
self.update_vars: Callable[[dict[str, Any]], None] = update_vars or (
lambda x: None
)
self.item_class: type = load_object(crawler.settings["DEFAULT_ITEM_CLASS"])
self.spider: Optional[Spider] = None
self.inthread: bool = not threadable.isInIOThread()
self.code: Optional[str] = code
self.vars: Dict[str, Any] = {}
self.vars: dict[str, Any] = {}
def start(
self,

View File

@ -1,6 +1,6 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Any, List, Tuple
from typing import TYPE_CHECKING, Any
from pydispatch import dispatcher
@ -40,7 +40,7 @@ class SignalManager:
kwargs.setdefault("sender", self.sender)
dispatcher.disconnect(receiver, signal, **kwargs)
def send_catch_log(self, signal: Any, **kwargs: Any) -> List[Tuple[Any, Any]]:
def send_catch_log(self, signal: Any, **kwargs: Any) -> list[tuple[Any, Any]]:
"""
Send a signal, catch exceptions and log them.
@ -52,7 +52,7 @@ class SignalManager:
def send_catch_log_deferred(
self, signal: Any, **kwargs: Any
) -> Deferred[List[Tuple[Any, Any]]]:
) -> Deferred[list[tuple[Any, Any]]]:
"""
Like :meth:`send_catch_log` but supports returning
:class:`~twisted.internet.defer.Deferred` objects from signal handlers.
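`send_catch_log()` now returns `list[tuple[Any, Any]]` of (receiver, result) pairs; connecting receivers is unchanged. A sketch of an extension hooking `spider_closed` (class name illustrative):

from scrapy import signals


class ClosedLogger:
    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def spider_closed(self, spider, reason):
        spider.logger.info("closed (%s)", reason)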

View File

@ -3,7 +3,7 @@ from __future__ import annotations
import traceback
import warnings
from collections import defaultdict
from typing import TYPE_CHECKING, DefaultDict, Dict, List, Tuple, Type
from typing import TYPE_CHECKING
from zope.interface import implementer
@ -29,10 +29,10 @@ class SpiderLoader:
"""
def __init__(self, settings: BaseSettings):
self.spider_modules: List[str] = settings.getlist("SPIDER_MODULES")
self.spider_modules: list[str] = settings.getlist("SPIDER_MODULES")
self.warn_only: bool = settings.getbool("SPIDER_LOADER_WARN_ONLY")
self._spiders: Dict[str, Type[Spider]] = {}
self._found: DefaultDict[str, List[Tuple[str, str]]] = defaultdict(list)
self._spiders: dict[str, type[Spider]] = {}
self._found: defaultdict[str, list[tuple[str, str]]] = defaultdict(list)
self._load_all_spiders()
def _check_name_duplicates(self) -> None:
@ -80,7 +80,7 @@ class SpiderLoader:
def from_settings(cls, settings: BaseSettings) -> Self:
return cls(settings)
def load(self, spider_name: str) -> Type[Spider]:
def load(self, spider_name: str) -> type[Spider]:
"""
Return the Spider class for the given spider name. If the spider
name is not found, raise a KeyError.
@ -90,7 +90,7 @@ class SpiderLoader:
except KeyError:
raise KeyError(f"Spider not found: {spider_name}")
def find_by_request(self, request: Request) -> List[str]:
def find_by_request(self, request: Request) -> list[str]:
"""
Return the list of spider names that can handle the given request.
"""
@ -98,7 +98,7 @@ class SpiderLoader:
name for name, cls in self._spiders.items() if cls.handles_request(request)
]
def list(self) -> List[str]:
def list(self) -> list[str]:
"""
Return a list with the names of all spiders available in the project.
"""

View File

@ -7,11 +7,13 @@ See documentation in docs/topics/spider-middleware.rst
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Any, AsyncIterable, Iterable
from typing import TYPE_CHECKING, Any
from scrapy.http import Request, Response
if TYPE_CHECKING:
from collections.abc import AsyncIterable, Iterable
# typing.Self requires Python 3.11
from typing_extensions import Self

View File

@ -7,11 +7,13 @@ See documentation in docs/topics/spider-middleware.rst
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Any, Iterable, List, Optional
from typing import TYPE_CHECKING, Any, Optional
from scrapy.exceptions import IgnoreRequest
if TYPE_CHECKING:
from collections.abc import Iterable
# typing.Self requires Python 3.11
from typing_extensions import Self
@ -39,7 +41,7 @@ class HttpErrorMiddleware:
def __init__(self, settings: BaseSettings):
self.handle_httpstatus_all: bool = settings.getbool("HTTPERROR_ALLOW_ALL")
self.handle_httpstatus_list: List[int] = settings.getlist(
self.handle_httpstatus_list: list[int] = settings.getlist(
"HTTPERROR_ALLOWED_CODES"
)
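`handle_httpstatus_list` is populated from the HTTPERROR_ALLOWED_CODES setting; matching non-2xx responses reach spider callbacks instead of being dropped. An illustrative configuration sketch:

import scrapy

# settings.py equivalents:
HTTPERROR_ALLOWED_CODES = [404]   # project-wide
# HTTPERROR_ALLOW_ALL = True      # or disable filtering entirely


class ArchiveSpider(scrapy.Spider):
    # per-spider alternative to the setting
    name = "archive"
    handle_httpstatus_list = [404, 410]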

View File

@ -9,7 +9,7 @@ from __future__ import annotations
import logging
import re
import warnings
from typing import TYPE_CHECKING, Any, AsyncIterable, Iterable, Set
from typing import TYPE_CHECKING, Any
from scrapy import Spider, signals
from scrapy.exceptions import ScrapyDeprecationWarning
@ -23,6 +23,8 @@ warnings.warn(
)
if TYPE_CHECKING:
from collections.abc import AsyncIterable, Iterable
# typing.Self requires Python 3.11
from typing_extensions import Self
@ -109,7 +111,7 @@ class OffsiteMiddleware:
def spider_opened(self, spider: Spider) -> None:
self.host_regex: re.Pattern[str] = self.get_host_regex(spider)
self.domains_seen: Set[str] = set()
self.domains_seen: set[str] = set()
class URLWarning(Warning):

View File

@ -6,18 +6,7 @@ originated it.
from __future__ import annotations
import warnings
from typing import (
TYPE_CHECKING,
Any,
AsyncIterable,
Dict,
Iterable,
Optional,
Tuple,
Type,
Union,
cast,
)
from typing import TYPE_CHECKING, Any, Optional, Union, cast
from urllib.parse import urlparse
from w3lib.url import safe_url_string
@ -30,6 +19,8 @@ from scrapy.utils.python import to_unicode
from scrapy.utils.url import strip_url
if TYPE_CHECKING:
from collections.abc import AsyncIterable, Iterable
# typing.Self requires Python 3.11
from typing_extensions import Self
@ -37,7 +28,7 @@ if TYPE_CHECKING:
from scrapy.settings import BaseSettings
LOCAL_SCHEMES: Tuple[str, ...] = (
LOCAL_SCHEMES: tuple[str, ...] = (
"about",
"blob",
"data",
@ -56,7 +47,7 @@ POLICY_SCRAPY_DEFAULT = "scrapy-default"
class ReferrerPolicy:
NOREFERRER_SCHEMES: Tuple[str, ...] = LOCAL_SCHEMES
NOREFERRER_SCHEMES: tuple[str, ...] = LOCAL_SCHEMES
name: str
def referrer(self, response_url: str, request_url: str) -> Optional[str]:
@ -291,11 +282,11 @@ class DefaultReferrerPolicy(NoReferrerWhenDowngradePolicy):
using ``file://`` or ``s3://`` scheme.
"""
NOREFERRER_SCHEMES: Tuple[str, ...] = LOCAL_SCHEMES + ("file", "s3")
NOREFERRER_SCHEMES: tuple[str, ...] = LOCAL_SCHEMES + ("file", "s3")
name: str = POLICY_SCRAPY_DEFAULT
_policy_classes: Dict[str, Type[ReferrerPolicy]] = {
_policy_classes: dict[str, type[ReferrerPolicy]] = {
p.name: p
for p in (
NoReferrerPolicy,
@ -316,14 +307,14 @@ _policy_classes[""] = NoReferrerWhenDowngradePolicy
def _load_policy_class(
policy: str, warning_only: bool = False
) -> Optional[Type[ReferrerPolicy]]:
) -> Optional[type[ReferrerPolicy]]:
"""
Expect a string for the path to the policy class,
otherwise try to interpret the string as a standard value
from https://www.w3.org/TR/referrer-policy/#referrer-policies
"""
try:
return cast(Type[ReferrerPolicy], load_object(policy))
return cast(type[ReferrerPolicy], load_object(policy))
except ValueError:
tokens = [token.strip() for token in policy.lower().split(",")]
# https://www.w3.org/TR/referrer-policy/#parse-referrer-policy-from-header
@ -341,7 +332,7 @@ def _load_policy_class(
class RefererMiddleware:
def __init__(self, settings: Optional[BaseSettings] = None):
self.default_policy: Type[ReferrerPolicy] = DefaultReferrerPolicy
self.default_policy: type[ReferrerPolicy] = DefaultReferrerPolicy
if settings is not None:
settings_policy = _load_policy_class(settings.get("REFERRER_POLICY"))
assert settings_policy
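`_load_policy_class()` accepts either an import path to a `ReferrerPolicy` subclass or a standard policy token from the Referrer-Policy spec. An illustrative settings sketch:

# settings.py
REFERRER_POLICY = "same-origin"
# or an import path, e.g. the default policy class:
# REFERRER_POLICY = "scrapy.spidermiddlewares.referer.DefaultReferrerPolicy"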

View File

@ -7,12 +7,14 @@ See documentation in docs/topics/spider-middleware.rst
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Any, AsyncIterable, Iterable
from typing import TYPE_CHECKING, Any
from scrapy.exceptions import NotConfigured
from scrapy.http import Request, Response
if TYPE_CHECKING:
from collections.abc import AsyncIterable, Iterable
# typing.Self requires Python 3.11
from typing_extensions import Self

View File

@ -7,7 +7,7 @@ See documentation in docs/topics/spiders.rst
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, cast
from typing import TYPE_CHECKING, Any, Optional, cast
from scrapy import signals
from scrapy.http import Request, Response
@ -15,6 +15,8 @@ from scrapy.utils.trackref import object_ref
from scrapy.utils.url import url_is_from_spider
if TYPE_CHECKING:
from collections.abc import Iterable
from twisted.internet.defer import Deferred
# typing.Self requires Python 3.11
@ -32,7 +34,7 @@ class Spider(object_ref):
"""
name: str
custom_settings: Optional[Dict[_SettingsKeyT, Any]] = None
custom_settings: Optional[dict[_SettingsKeyT, Any]] = None
def __init__(self, name: Optional[str] = None, **kwargs: Any):
if name is not None:
@ -41,7 +43,7 @@ class Spider(object_ref):
raise ValueError(f"{type(self).__name__} must have a name")
self.__dict__.update(kwargs)
if not hasattr(self, "start_urls"):
self.start_urls: List[str] = []
self.start_urls: list[str] = []
@property
def logger(self) -> SpiderLoggerAdapter:
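`custom_settings` is now `Optional[dict[_SettingsKeyT, Any]]` and `start_urls` defaults to an empty `list[str]`. A minimal spider sketch with placeholder values:

import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"                                # required, enforced in __init__
    start_urls = ["https://quotes.toscrape.com/"]
    custom_settings = {"DOWNLOAD_DELAY": 1.0}      # merged over project settings

    def parse(self, response):
        for quote in response.css("div.quote"):
            yield {"text": quote.css("span.text::text").get()}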

View File

@ -1,6 +1,6 @@
"""
This modules implements the CrawlSpider which is the recommended spider to use
for scraping typical web sites that requires crawling pages.
for scraping typical websites that requires crawling pages.
See documentation in docs/topics/spiders.rst
"""
@ -8,22 +8,8 @@ See documentation in docs/topics/spiders.rst
from __future__ import annotations
import copy
from typing import (
TYPE_CHECKING,
Any,
AsyncIterable,
Awaitable,
Callable,
Dict,
Iterable,
List,
Optional,
Sequence,
Set,
TypeVar,
Union,
cast,
)
from collections.abc import AsyncIterable, Awaitable, Callable
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
from twisted.python.failure import Failure
@ -35,6 +21,8 @@ from scrapy.utils.asyncgen import collect_asyncgen
from scrapy.utils.spider import iterate_spider_output
if TYPE_CHECKING:
from collections.abc import Iterable, Sequence
# typing.Self requires Python 3.11
from typing_extensions import Self
@ -43,7 +31,7 @@ if TYPE_CHECKING:
_T = TypeVar("_T")
ProcessLinksT = Callable[[List[Link]], List[Link]]
ProcessLinksT = Callable[[list[Link]], list[Link]]
ProcessRequestT = Callable[[Request, Response], Optional[Request]]
@ -75,7 +63,7 @@ class Rule:
self,
link_extractor: Optional[LinkExtractor] = None,
callback: Union[CallbackT, str, None] = None,
cb_kwargs: Optional[Dict[str, Any]] = None,
cb_kwargs: Optional[dict[str, Any]] = None,
follow: Optional[bool] = None,
process_links: Union[ProcessLinksT, str, None] = None,
process_request: Union[ProcessRequestT, str, None] = None,
@ -84,7 +72,7 @@ class Rule:
self.link_extractor: LinkExtractor = link_extractor or _default_link_extractor
self.callback: Union[CallbackT, str, None] = callback
self.errback: Union[Callable[[Failure], Any], str, None] = errback
self.cb_kwargs: Dict[str, Any] = cb_kwargs or {}
self.cb_kwargs: dict[str, Any] = cb_kwargs or {}
self.process_links: Union[ProcessLinksT, str] = process_links or _identity
self.process_request: Union[ProcessRequestT, str] = (
process_request or _identity_process_request
@ -105,7 +93,7 @@ class Rule:
class CrawlSpider(Spider):
rules: Sequence[Rule] = ()
_rules: List[Rule]
_rules: list[Rule]
_follow_links: bool
def __init__(self, *a: Any, **kw: Any):
@ -139,9 +127,9 @@ class CrawlSpider(Spider):
def _requests_to_follow(self, response: Response) -> Iterable[Optional[Request]]:
if not isinstance(response, HtmlResponse):
return
seen: Set[Link] = set()
seen: set[Link] = set()
for rule_index, rule in enumerate(self._rules):
links: List[Link] = [
links: list[Link] = [
lnk
for lnk in rule.link_extractor.extract_links(response)
if lnk not in seen
@ -170,7 +158,7 @@ class CrawlSpider(Spider):
self,
response: Response,
callback: Optional[CallbackT],
cb_kwargs: Dict[str, Any],
cb_kwargs: dict[str, Any],
follow: bool = True,
) -> AsyncIterable[Any]:
if callback:
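`Rule` keeps its constructor arguments, with `cb_kwargs` now `dict[str, Any]`. A typical CrawlSpider sketch with illustrative URL patterns:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ShopSpider(CrawlSpider):
    name = "shop"
    start_urls = ["https://example.com/"]
    rules = (
        # follow category pages without a callback; parse product pages
        Rule(LinkExtractor(allow=r"/category/"), follow=True),
        Rule(
            LinkExtractor(allow=r"/product/"),
            callback="parse_product",
            cb_kwargs={"source": "crawl"},
        ),
    )

    def parse_product(self, response, source):
        yield {"url": response.url, "source": source}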

View File

@ -5,7 +5,9 @@ for scraping from an XML feed.
See documentation in docs/topics/spiders.rst
"""
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Optional
from scrapy.exceptions import NotConfigured, NotSupported
from scrapy.http import Response, TextResponse
@ -14,6 +16,9 @@ from scrapy.spiders import Spider
from scrapy.utils.iterators import csviter, xmliter_lxml
from scrapy.utils.spider import iterate_spider_output
if TYPE_CHECKING:
from collections.abc import Iterable, Sequence
class XMLFeedSpider(Spider):
"""
@ -27,7 +32,7 @@ class XMLFeedSpider(Spider):
iterator: str = "iternodes"
itertag: str = "item"
namespaces: Sequence[Tuple[str, str]] = ()
namespaces: Sequence[tuple[str, str]] = ()
def process_results(
self, response: Response, results: Iterable[Any]
@ -118,7 +123,7 @@ class CSVFeedSpider(Spider):
quotechar: Optional[str] = (
None # When this is None, python's csv module's default quotechar is used
)
headers: Optional[List[str]] = None
headers: Optional[list[str]] = None
def process_results(
self, response: Response, results: Iterable[Any]
@ -130,7 +135,7 @@ class CSVFeedSpider(Spider):
"""This method has the same purpose as the one in XMLFeedSpider"""
return response
def parse_row(self, response: Response, row: Dict[str, str]) -> Any:
def parse_row(self, response: Response, row: dict[str, str]) -> Any:
"""This method must be overridden with your custom spider functionality"""
raise NotImplementedError
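`CSVFeedSpider.headers` is typed `Optional[list[str]]` and `parse_row()` receives each row as `dict[str, str]`. A sketch with a placeholder URL and columns:

from scrapy.spiders import CSVFeedSpider


class ProductsCSVSpider(CSVFeedSpider):
    name = "products_csv"
    start_urls = ["https://example.com/products.csv"]
    delimiter = ";"
    quotechar = '"'
    headers = ["id", "name", "price"]

    def parse_row(self, response, row):
        # row maps the headers above to string values
        return {"id": row["id"], "name": row["name"], "price": row["price"]}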

View File

@ -1,6 +1,7 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Iterable, Optional, cast
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any, Optional, cast
from scrapy import Request
from scrapy.spiders import Spider

Some files were not shown because too many files have changed in this diff.