mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-06 09:07:32 +00:00)

Drop Python 3.8 Support (#6472)

parent 9736e49b52
commit 5391663072
.github/workflows/checks.yml (4 changes)

@@ -15,10 +15,10 @@ jobs:
        - python-version: "3.12"
          env:
            TOXENV: pylint
-       - python-version: 3.8
+       - python-version: "3.9"
          env:
            TOXENV: typing
-       - python-version: 3.8
+       - python-version: "3.9"
          env:
            TOXENV: typing-tests
        - python-version: "3.12"  # Keep in sync with .readthedocs.yml
.github/workflows/tests-macos.yml (2 changes)

@@ -11,7 +11,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-       python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+       python-version: ["3.9", "3.10", "3.11", "3.12"]

    steps:
    - uses: actions/checkout@v4
.github/workflows/tests-ubuntu.yml (12 changes)

@@ -12,7 +12,7 @@ jobs:
      fail-fast: false
      matrix:
        include:
-       - python-version: 3.9
+       - python-version: "3.9"
          env:
            TOXENV: py
        - python-version: "3.10"
@@ -35,19 +35,19 @@ jobs:
            TOXENV: pypy3

        # pinned deps
-       - python-version: 3.8.17
+       - python-version: 3.9.19
          env:
            TOXENV: pinned
-       - python-version: 3.8.17
+       - python-version: 3.9.19
          env:
            TOXENV: asyncio-pinned
-       - python-version: pypy3.8
+       - python-version: pypy3.9
          env:
            TOXENV: pypy3-pinned
-       - python-version: 3.8.17
+       - python-version: 3.9.19
          env:
            TOXENV: extra-deps-pinned
-       - python-version: 3.8.17
+       - python-version: 3.9.19
          env:
            TOXENV: botocore-pinned

.github/workflows/tests-windows.yml (5 changes)

@@ -12,12 +12,9 @@ jobs:
      fail-fast: false
      matrix:
        include:
-       - python-version: 3.8
+       - python-version: "3.9"
          env:
            TOXENV: windows-pinned
-       - python-version: 3.9
-         env:
-           TOXENV: py
        - python-version: "3.10"
          env:
            TOXENV: py
@@ -33,4 +33,4 @@ repos:
    rev: v3.16.0
    hooks:
    - id: pyupgrade
-     args: [--py38-plus, --keep-runtime-typing]
+     args: [--py39-plus, --keep-runtime-typing]
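Note: the pyupgrade bump from --py38-plus to --py39-plus above drives most of the mechanical changes in the rest of this commit. With Python 3.9 as the floor, the PEP 585 built-in generics replace the deprecated typing aliases. A minimal before/after sketch (hypothetical module, not taken from the Scrapy codebase):

# before: Python 3.8-compatible spelling
from typing import Dict, List, Optional

def group(names: List[str]) -> Dict[str, Optional[int]]:
    return {name: None for name in names}

# after pyupgrade --py39-plus: PEP 585 built-in generics
from typing import Optional

def group(names: list[str]) -> dict[str, Optional[int]]:
    return {name: None for name in names}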
@@ -59,7 +59,7 @@ including a list of features.
Requirements
============

-* Python 3.8+
+* Python 3.9+
* Works on Linux, Windows, macOS, BSD

Install
@@ -9,7 +9,7 @@ Installation guide
Supported Python versions
=========================

-Scrapy requires Python 3.8+, either the CPython implementation (default) or
+Scrapy requires Python 3.9+, either the CPython implementation (default) or
the PyPy implementation (see :ref:`python:implementations`).

.. _intro-install-scrapy:
@@ -1,7 +1,7 @@
from __future__ import annotations

import logging
-from typing import TYPE_CHECKING, Any, List
+from typing import TYPE_CHECKING, Any

from scrapy.exceptions import NotConfigured
from scrapy.utils.conf import build_component_list
@@ -20,7 +20,7 @@ class AddonManager:

    def __init__(self, crawler: Crawler) -> None:
        self.crawler: Crawler = crawler
-       self.addons: List[Any] = []
+       self.addons: list[Any] = []

    def load_settings(self, settings: Settings) -> None:
        """Load add-ons and configurations from a settings object and apply them.
@@ -6,7 +6,7 @@ import inspect
import os
import sys
from importlib.metadata import entry_points
-from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Optional, Tuple, Type
+from typing import TYPE_CHECKING, Optional

import scrapy
from scrapy.commands import BaseRunSpiderCommand, ScrapyCommand, ScrapyHelpFormatter
@@ -17,6 +17,8 @@ from scrapy.utils.project import get_project_settings, inside_project
from scrapy.utils.python import garbage_collect

if TYPE_CHECKING:
+   from collections.abc import Callable, Iterable
+
    # typing.ParamSpec requires Python 3.10
    from typing_extensions import ParamSpec

@@ -28,7 +30,7 @@ if TYPE_CHECKING:
class ScrapyArgumentParser(argparse.ArgumentParser):
    def _parse_optional(
        self, arg_string: str
-   ) -> Optional[Tuple[Optional[argparse.Action], str, Optional[str]]]:
+   ) -> Optional[tuple[Optional[argparse.Action], str, Optional[str]]]:
        # if starts with -: it means that is a parameter not a argument
        if arg_string[:2] == "-:":
            return None
@@ -36,7 +38,7 @@ class ScrapyArgumentParser(argparse.ArgumentParser):
        return super()._parse_optional(arg_string)


-def _iter_command_classes(module_name: str) -> Iterable[Type[ScrapyCommand]]:
+def _iter_command_classes(module_name: str) -> Iterable[type[ScrapyCommand]]:
    # TODO: add `name` attribute to commands and merge this function with
    # scrapy.utils.spider.iter_spider_classes
    for module in walk_modules(module_name):
@@ -50,8 +52,8 @@ def _iter_command_classes(module_name: str) -> Iterable[Type[ScrapyCommand]]:
            yield obj


-def _get_commands_from_module(module: str, inproject: bool) -> Dict[str, ScrapyCommand]:
-   d: Dict[str, ScrapyCommand] = {}
+def _get_commands_from_module(module: str, inproject: bool) -> dict[str, ScrapyCommand]:
+   d: dict[str, ScrapyCommand] = {}
    for cmd in _iter_command_classes(module):
        if inproject or not cmd.requires_project:
            cmdname = cmd.__module__.split(".")[-1]
@@ -61,8 +63,8 @@ def _get_commands_from_module(module: str, inproject: bool) -> Dict[str, ScrapyC

def _get_commands_from_entry_points(
    inproject: bool, group: str = "scrapy.commands"
-) -> Dict[str, ScrapyCommand]:
-   cmds: Dict[str, ScrapyCommand] = {}
+) -> dict[str, ScrapyCommand]:
+   cmds: dict[str, ScrapyCommand] = {}
    if sys.version_info >= (3, 10):
        eps = entry_points(group=group)
    else:
@@ -78,7 +80,7 @@ def _get_commands_from_entry_points(

def _get_commands_dict(
    settings: BaseSettings, inproject: bool
-) -> Dict[str, ScrapyCommand]:
+) -> dict[str, ScrapyCommand]:
    cmds = _get_commands_from_module("scrapy.commands", inproject)
    cmds.update(_get_commands_from_entry_points(inproject))
    cmds_module = settings["COMMANDS_MODULE"]
@@ -87,7 +89,7 @@ def _get_commands_dict(
    return cmds


-def _pop_command_name(argv: List[str]) -> Optional[str]:
+def _pop_command_name(argv: list[str]) -> Optional[str]:
    i = 0
    for arg in argv[1:]:
        if not arg.startswith("-"):
@@ -146,7 +148,7 @@ def _run_print_help(


def execute(
-   argv: Optional[List[str]] = None, settings: Optional[Settings] = None
+   argv: Optional[list[str]] = None, settings: Optional[Settings] = None
) -> None:
    if argv is None:
        argv = sys.argv
@@ -189,7 +191,7 @@ def execute(
    sys.exit(cmd.exitcode)


-def _run_command(cmd: ScrapyCommand, args: List[str], opts: argparse.Namespace) -> None:
+def _run_command(cmd: ScrapyCommand, args: list[str], opts: argparse.Namespace) -> None:
    if opts.profile:
        _run_command_profiled(cmd, args, opts)
    else:
@@ -197,7 +199,7 @@ def _run_command(cmd: ScrapyCommand, args: List[str], opts: argparse.Namespace)


def _run_command_profiled(
-   cmd: ScrapyCommand, args: List[str], opts: argparse.Namespace
+   cmd: ScrapyCommand, args: list[str], opts: argparse.Namespace
) -> None:
    if opts.profile:
        sys.stderr.write(f"scrapy: writing cProfile stats to {opts.profile!r}\n")
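The sys.version_info >= (3, 10) branch kept as context in the hunk above is the compatibility guard around importlib.metadata.entry_points(): selecting by the group= keyword exists only from Python 3.10, while on Python 3.9 the function returns a mapping keyed by group name. A hedged sketch of that pattern (helper name is illustrative, not Scrapy API):

import sys
from importlib.metadata import entry_points

def iter_entry_points(group: str = "scrapy.commands"):
    """Yield entry points of one group on any supported Python version."""
    if sys.version_info >= (3, 10):
        # Python 3.10+: entry_points() accepts selection keywords.
        eps = entry_points(group=group)
    else:
        # Python 3.9: entry_points() returns a dict-like mapping of
        # group name to a list of EntryPoint objects.
        eps = entry_points().get(group, [])
    yield from eps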
@@ -8,7 +8,7 @@ import argparse
import builtins
import os
from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional
+from typing import TYPE_CHECKING, Any, Optional

from twisted.python import failure

@@ -16,6 +16,8 @@ from scrapy.exceptions import UsageError
from scrapy.utils.conf import arglist_to_dict, feed_process_params_from_cli

if TYPE_CHECKING:
+   from collections.abc import Iterable
+
    from scrapy.crawler import Crawler, CrawlerProcess


@@ -24,7 +26,7 @@ class ScrapyCommand:
    crawler_process: Optional[CrawlerProcess] = None

    # default settings to be used for this command instead of global defaults
-   default_settings: Dict[str, Any] = {}
+   default_settings: dict[str, Any] = {}

    exitcode: int = 0

@@ -97,7 +99,7 @@ class ScrapyCommand:
        )
        group.add_argument("--pdb", action="store_true", help="enable pdb on failure")

-   def process_options(self, args: List[str], opts: argparse.Namespace) -> None:
+   def process_options(self, args: list[str], opts: argparse.Namespace) -> None:
        try:
            self.settings.setdict(arglist_to_dict(opts.set), priority="cmdline")
        except ValueError:
@@ -122,7 +124,7 @@ class ScrapyCommand:
        if opts.pdb:
            failure.startDebugMode()

-   def run(self, args: List[str], opts: argparse.Namespace) -> None:
+   def run(self, args: list[str], opts: argparse.Namespace) -> None:
        """
        Entry point for running commands
        """
@@ -167,7 +169,7 @@ class BaseRunSpiderCommand(ScrapyCommand):
            help="format to use for dumping items",
        )

-   def process_options(self, args: List[str], opts: argparse.Namespace) -> None:
+   def process_options(self, args: list[str], opts: argparse.Namespace) -> None:
        super().process_options(args, opts)
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
@@ -207,7 +209,7 @@ class ScrapyHelpFormatter(argparse.HelpFormatter):
        parts = self.format_part_strings(builtins.list(part_strings))
        return super()._join_parts(parts)

-   def format_part_strings(self, part_strings: List[str]) -> List[str]:
+   def format_part_strings(self, part_strings: list[str]) -> list[str]:
        """
        Underline and title case command line help message headers.
        """
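These hunks also show the second recurring pattern of the commit: names such as Callable and Iterable now come from collections.abc and are imported only under if TYPE_CHECKING:, so they cost nothing at runtime. A small sketch of the idea, under the usual assumption that annotations are not evaluated at runtime (via from __future__ import annotations or string annotations):

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by static type checkers; never imported at runtime.
    from collections.abc import Iterable

def longest(names: Iterable[str]) -> str:
    # With PEP 563 semantics the annotation above stays a string at runtime,
    # so collections.abc.Iterable does not need to be importable here.
    return max(names, key=len)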
@@ -4,7 +4,7 @@ import argparse
import subprocess  # nosec
import sys
import time
-from typing import TYPE_CHECKING, Any, Iterable, List
+from typing import TYPE_CHECKING, Any
from urllib.parse import urlencode

import scrapy
@@ -13,6 +13,8 @@ from scrapy.http import Response, TextResponse
from scrapy.linkextractors import LinkExtractor

if TYPE_CHECKING:
+   from collections.abc import Iterable
+
    from scrapy import Request


@@ -26,7 +28,7 @@ class Command(ScrapyCommand):
    def short_desc(self) -> str:
        return "Run quick benchmark test"

-   def run(self, args: List[str], opts: argparse.Namespace) -> None:
+   def run(self, args: list[str], opts: argparse.Namespace) -> None:
        with _BenchServer():
            assert self.crawler_process
            self.crawler_process.crawl(_BenchSpider, total=100000)
@@ -1,7 +1,6 @@
import argparse
import time
from collections import defaultdict
-from typing import List
from unittest import TextTestResult as _TextTestResult
from unittest import TextTestRunner

@@ -69,7 +68,7 @@ class Command(ScrapyCommand):
            help="print contract tests for all spiders",
        )

-   def run(self, args: List[str], opts: argparse.Namespace) -> None:
+   def run(self, args: list[str], opts: argparse.Namespace) -> None:
        # load contracts
        contracts = build_component_list(self.settings.getwithbase("SPIDER_CONTRACTS"))
        conman = ContractsManager(load_object(c) for c in contracts)
@@ -1,6 +1,6 @@
from __future__ import annotations

-from typing import TYPE_CHECKING, List, cast
+from typing import TYPE_CHECKING, cast

from twisted.python.failure import Failure

@@ -20,7 +20,7 @@ class Command(BaseRunSpiderCommand):
    def short_desc(self) -> str:
        return "Run a spider"

-   def run(self, args: List[str], opts: argparse.Namespace) -> None:
+   def run(self, args: list[str], opts: argparse.Namespace) -> None:
        if len(args) < 1:
            raise UsageError()
        elif len(args) > 1:
@@ -1,7 +1,6 @@
import argparse
import os
import sys
-from typing import List

from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
@@ -27,7 +26,7 @@ class Command(ScrapyCommand):
        sys.stderr.write(msg + os.linesep)
        self.exitcode = 1

-   def run(self, args: List[str], opts: argparse.Namespace) -> None:
+   def run(self, args: list[str], opts: argparse.Namespace) -> None:
        if len(args) != 1:
            raise UsageError()

@@ -1,7 +1,7 @@
from __future__ import annotations

import sys
-from typing import TYPE_CHECKING, Dict, List, Type
+from typing import TYPE_CHECKING

from w3lib.url import is_url

@@ -48,7 +48,7 @@ class Command(ScrapyCommand):
            help="do not handle HTTP 3xx status codes and print response as-is",
        )

-   def _print_headers(self, headers: Dict[bytes, List[bytes]], prefix: bytes) -> None:
+   def _print_headers(self, headers: dict[bytes, list[bytes]], prefix: bytes) -> None:
        for key, values in headers.items():
            for value in values:
                self._print_bytes(prefix + b" " + key + b": " + value)
@@ -65,7 +65,7 @@ class Command(ScrapyCommand):
    def _print_bytes(self, bytes_: bytes) -> None:
        sys.stdout.buffer.write(bytes_ + b"\n")

-   def run(self, args: List[str], opts: Namespace) -> None:
+   def run(self, args: list[str], opts: Namespace) -> None:
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        request = Request(
@@ -81,7 +81,7 @@ class Command(ScrapyCommand):
        else:
            request.meta["handle_httpstatus_all"] = True

-       spidercls: Type[Spider] = DefaultSpider
+       spidercls: type[Spider] = DefaultSpider
        assert self.crawler_process
        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
@@ -4,7 +4,7 @@ import shutil
import string
from importlib import import_module
from pathlib import Path
-from typing import List, Optional, Union, cast
+from typing import Optional, Union, cast
from urllib.parse import urlparse

import scrapy
@@ -87,7 +87,7 @@ class Command(ScrapyCommand):
            help="If the spider already exists, overwrite it with the template",
        )

-   def run(self, args: List[str], opts: argparse.Namespace) -> None:
+   def run(self, args: list[str], opts: argparse.Namespace) -> None:
        if opts.list:
            self._list_templates()
            return
@@ -1,6 +1,6 @@
from __future__ import annotations

-from typing import TYPE_CHECKING, List
+from typing import TYPE_CHECKING

from scrapy.commands import ScrapyCommand

@@ -15,7 +15,7 @@ class Command(ScrapyCommand):
    def short_desc(self) -> str:
        return "List available spiders"

-   def run(self, args: List[str], opts: argparse.Namespace) -> None:
+   def run(self, args: list[str], opts: argparse.Namespace) -> None:
        assert self.crawler_process
        for s in sorted(self.crawler_process.spider_loader.list()):
            print(s)
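All of the command classes above end up with the same modernized signature, def run(self, args: list[str], opts: argparse.Namespace) -> None. For a downstream project, a custom command written against the new baseline might look like the following sketch (the command itself is hypothetical; the base class and class attributes are the ones visible in this diff):

import argparse
from typing import Any

from scrapy.commands import ScrapyCommand

class Command(ScrapyCommand):
    """Hypothetical project command using the Python 3.9+ annotation style."""

    requires_project = False
    default_settings: dict[str, Any] = {}  # built-in generic instead of typing.Dict

    def short_desc(self) -> str:
        return "Print the arguments passed on the command line"

    def run(self, args: list[str], opts: argparse.Namespace) -> None:
        for arg in sorted(args):
            print(arg)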
@@ -5,20 +5,7 @@ import functools
import inspect
import json
import logging
-from typing import (
-   TYPE_CHECKING,
-   Any,
-   AsyncGenerator,
-   Coroutine,
-   Dict,
-   Iterable,
-   List,
-   Optional,
-   Tuple,
-   TypeVar,
-   Union,
-   overload,
-)
+from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, overload

from itemadapter import ItemAdapter, is_item
from twisted.internet.defer import Deferred, maybeDeferred
@@ -35,6 +22,8 @@ from scrapy.utils.misc import arg_to_iter
from scrapy.utils.spider import spidercls_for_request

if TYPE_CHECKING:
+   from collections.abc import AsyncGenerator, Coroutine, Iterable
+
    from twisted.python.failure import Failure

    from scrapy.http.request import CallbackT
@@ -50,8 +39,8 @@ class Command(BaseRunSpiderCommand):
    requires_project = True

    spider = None
-   items: Dict[int, List[Any]] = {}
-   requests: Dict[int, List[Request]] = {}
+   items: dict[int, list[Any]] = {}
+   requests: dict[int, list[Request]] = {}

    first_response = None

@@ -166,11 +155,11 @@ class Command(BaseRunSpiderCommand):
            return d
        return arg_to_iter(deferred_from_coro(result))

-   def add_items(self, lvl: int, new_items: List[Any]) -> None:
+   def add_items(self, lvl: int, new_items: list[Any]) -> None:
        old_items = self.items.get(lvl, [])
        self.items[lvl] = old_items + new_items

-   def add_requests(self, lvl: int, new_reqs: List[Request]) -> None:
+   def add_requests(self, lvl: int, new_reqs: list[Request]) -> None:
        old_reqs = self.requests.get(lvl, [])
        self.requests[lvl] = old_reqs + new_reqs

@@ -219,7 +208,7 @@ class Command(BaseRunSpiderCommand):
        depth: int,
        spider: Spider,
        callback: CallbackT,
-   ) -> Tuple[List[Any], List[Request], argparse.Namespace, int, Spider, CallbackT]:
+   ) -> tuple[list[Any], list[Request], argparse.Namespace, int, Spider, CallbackT]:
        items, requests = [], []
        for x in spider_output:
            if is_item(x):
@@ -232,7 +221,7 @@ class Command(BaseRunSpiderCommand):
        self,
        response: Response,
        callback: CallbackT,
-       cb_kwargs: Optional[Dict[str, Any]] = None,
+       cb_kwargs: Optional[dict[str, Any]] = None,
    ) -> Deferred[Any]:
        cb_kwargs = cb_kwargs or {}
        d = maybeDeferred(self.iterate_spider_output, callback(response, **cb_kwargs))
@@ -285,10 +274,10 @@ class Command(BaseRunSpiderCommand):

    def scraped_data(
        self,
-       args: Tuple[
-           List[Any], List[Request], argparse.Namespace, int, Spider, CallbackT
+       args: tuple[
+           list[Any], list[Request], argparse.Namespace, int, Spider, CallbackT
        ],
-   ) -> List[Any]:
+   ) -> list[Any]:
        items, requests, opts, depth, spider, callback = args
        if opts.pipelines:
            itemproc = self.pcrawler.engine.scraper.itemproc
@@ -345,7 +334,7 @@ class Command(BaseRunSpiderCommand):
    def prepare_request(
        self, spider: Spider, request: Request, opts: argparse.Namespace
    ) -> Request:
-       def callback(response: Response, **cb_kwargs: Any) -> Deferred[List[Any]]:
+       def callback(response: Response, **cb_kwargs: Any) -> Deferred[list[Any]]:
            # memorize first request
            if not self.first_response:
                self.first_response = response
@@ -376,7 +365,7 @@ class Command(BaseRunSpiderCommand):
        request.callback = callback
        return request

-   def process_options(self, args: List[str], opts: argparse.Namespace) -> None:
+   def process_options(self, args: list[str], opts: argparse.Namespace) -> None:
        super().process_options(args, opts)

        self.process_request_meta(opts)
@@ -404,7 +393,7 @@ class Command(BaseRunSpiderCommand):
            print_help=False,
        )

-   def run(self, args: List[str], opts: argparse.Namespace) -> None:
+   def run(self, args: list[str], opts: argparse.Namespace) -> None:
        # parse arguments
        if not len(args) == 1 or not is_url(args[0]):
            raise UsageError()
@@ -4,7 +4,7 @@ import argparse
import sys
from importlib import import_module
from pathlib import Path
-from typing import TYPE_CHECKING, List, Union
+from typing import TYPE_CHECKING, Union

from scrapy.commands import BaseRunSpiderCommand
from scrapy.exceptions import UsageError
@@ -41,7 +41,7 @@ class Command(BaseRunSpiderCommand):
    def long_desc(self) -> str:
        return "Run the spider defined in the given file"

-   def run(self, args: List[str], opts: argparse.Namespace) -> None:
+   def run(self, args: list[str], opts: argparse.Namespace) -> None:
        if len(args) != 1:
            raise UsageError()
        filename = Path(args[0])
@@ -1,6 +1,5 @@
import argparse
import json
-from typing import List

from scrapy.commands import ScrapyCommand
from scrapy.settings import BaseSettings
@@ -46,7 +45,7 @@ class Command(ScrapyCommand):
            help="print setting value, interpreted as a list",
        )

-   def run(self, args: List[str], opts: argparse.Namespace) -> None:
+   def run(self, args: list[str], opts: argparse.Namespace) -> None:
        assert self.crawler_process
        settings = self.crawler_process.settings
        if opts.get:
@@ -7,7 +7,7 @@ See documentation in docs/topics/shell.rst
from __future__ import annotations

from threading import Thread
-from typing import TYPE_CHECKING, Any, Dict, List, Type
+from typing import TYPE_CHECKING, Any

from scrapy import Spider
from scrapy.commands import ScrapyCommand
@@ -56,13 +56,13 @@ class Command(ScrapyCommand):
            help="do not handle HTTP 3xx status codes and print response as-is",
        )

-   def update_vars(self, vars: Dict[str, Any]) -> None:
+   def update_vars(self, vars: dict[str, Any]) -> None:
        """You can use this function to update the Scrapy objects that will be
        available in the shell
        """
        pass

-   def run(self, args: List[str], opts: Namespace) -> None:
+   def run(self, args: list[str], opts: Namespace) -> None:
        url = args[0] if args else None
        if url:
            # first argument may be a local file
@@ -71,7 +71,7 @@ class Command(ScrapyCommand):
        assert self.crawler_process
        spider_loader = self.crawler_process.spider_loader

-       spidercls: Type[Spider] = DefaultSpider
+       spidercls: type[Spider] = DefaultSpider
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        elif url:
@@ -6,14 +6,14 @@ from importlib.util import find_spec
from pathlib import Path
from shutil import copy2, copystat, ignore_patterns, move
from stat import S_IWUSR as OWNER_WRITE_PERMISSION
-from typing import List, Tuple, Union
+from typing import Union

import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
from scrapy.utils.template import render_templatefile, string_camelcase

-TEMPLATES_TO_RENDER: Tuple[Tuple[str, ...], ...] = (
+TEMPLATES_TO_RENDER: tuple[tuple[str, ...], ...] = (
    ("scrapy.cfg",),
    ("${project_name}", "settings.py.tmpl"),
    ("${project_name}", "items.py.tmpl"),
@@ -86,7 +86,7 @@ class Command(ScrapyCommand):
        copystat(src, dst)
        _make_writable(dst)

-   def run(self, args: List[str], opts: argparse.Namespace) -> None:
+   def run(self, args: list[str], opts: argparse.Namespace) -> None:
        if len(args) not in (1, 2):
            raise UsageError()

@@ -107,9 +107,7 @@ class Command(ScrapyCommand):
            return

        self._copytree(Path(self.templates_dir), project_dir.resolve())
-       # On 3.8 shutil.move doesn't fully support Path args, but it supports our use case
-       # See https://bugs.python.org/issue32689
-       move(project_dir / "module", project_dir / project_name)  # type: ignore[arg-type]
+       move(project_dir / "module", project_dir / project_name)
        for paths in TEMPLATES_TO_RENDER:
            tplfile = Path(
                project_dir,
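The last hunk above also drops a pure Python 3.8 workaround: per the removed comment, shutil.move handles pathlib.Path arguments correctly from Python 3.9 on (the bug tracked at https://bugs.python.org/issue32689), so the explanatory comment and the type: ignore[arg-type] are no longer needed. Roughly, the now-unremarkable call reduces to the following (paths are placeholders):

from pathlib import Path
from shutil import move

src = Path("project_dir", "module")
dst = Path("project_dir", "myproject")
if src.exists():
    # On Python 3.9+ shutil.move accepts Path objects directly,
    # with no type: ignore needed.
    move(src, dst)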
@@ -1,5 +1,4 @@
import argparse
-from typing import List

import scrapy
from scrapy.commands import ScrapyCommand
@@ -25,7 +24,7 @@ class Command(ScrapyCommand):
            help="also display twisted/python/platform info (useful for bug reports)",
        )

-   def run(self, args: List[str], opts: argparse.Namespace) -> None:
+   def run(self, args: list[str], opts: argparse.Namespace) -> None:
        if opts.verbose:
            versions = scrapy_components_versions()
            width = max(len(n) for (n, _) in versions)
@ -2,22 +2,11 @@ from __future__ import annotations
|
||||
|
||||
import re
|
||||
import sys
|
||||
from collections.abc import AsyncGenerator, Iterable
|
||||
from functools import wraps
|
||||
from inspect import getmembers
|
||||
from types import CoroutineType
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
AsyncGenerator,
|
||||
Callable,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
Type,
|
||||
cast,
|
||||
)
|
||||
from typing import TYPE_CHECKING, Any, Optional, cast
|
||||
from unittest import TestCase, TestResult
|
||||
|
||||
from scrapy.http import Request, Response
|
||||
@ -25,6 +14,8 @@ from scrapy.utils.python import get_spec
|
||||
from scrapy.utils.spider import iterate_spider_output
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable
|
||||
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from scrapy import Spider
|
||||
@ -33,13 +24,13 @@ if TYPE_CHECKING:
|
||||
class Contract:
|
||||
"""Abstract class for contracts"""
|
||||
|
||||
request_cls: Optional[Type[Request]] = None
|
||||
request_cls: Optional[type[Request]] = None
|
||||
name: str
|
||||
|
||||
def __init__(self, method: Callable, *args: Any):
|
||||
self.testcase_pre = _create_testcase(method, f"@{self.name} pre-hook")
|
||||
self.testcase_post = _create_testcase(method, f"@{self.name} post-hook")
|
||||
self.args: Tuple[Any, ...] = args
|
||||
self.args: tuple[Any, ...] = args
|
||||
|
||||
def add_pre_hook(self, request: Request, results: TestResult) -> Request:
|
||||
if hasattr(self, "pre_process"):
|
||||
@ -47,7 +38,7 @@ class Contract:
|
||||
assert cb is not None
|
||||
|
||||
@wraps(cb)
|
||||
def wrapper(response: Response, **cb_kwargs: Any) -> List[Any]:
|
||||
def wrapper(response: Response, **cb_kwargs: Any) -> list[Any]:
|
||||
try:
|
||||
results.startTest(self.testcase_pre)
|
||||
self.pre_process(response)
|
||||
@ -76,7 +67,7 @@ class Contract:
|
||||
assert cb is not None
|
||||
|
||||
@wraps(cb)
|
||||
def wrapper(response: Response, **cb_kwargs: Any) -> List[Any]:
|
||||
def wrapper(response: Response, **cb_kwargs: Any) -> list[Any]:
|
||||
cb_result = cb(response, **cb_kwargs)
|
||||
if isinstance(cb_result, (AsyncGenerator, CoroutineType)):
|
||||
raise TypeError("Contracts don't support async callbacks")
|
||||
@ -98,18 +89,18 @@ class Contract:
|
||||
|
||||
return request
|
||||
|
||||
def adjust_request_args(self, args: Dict[str, Any]) -> Dict[str, Any]:
|
||||
def adjust_request_args(self, args: dict[str, Any]) -> dict[str, Any]:
|
||||
return args
|
||||
|
||||
|
||||
class ContractsManager:
|
||||
contracts: Dict[str, Type[Contract]] = {}
|
||||
contracts: dict[str, type[Contract]] = {}
|
||||
|
||||
def __init__(self, contracts: Iterable[Type[Contract]]):
|
||||
def __init__(self, contracts: Iterable[type[Contract]]):
|
||||
for contract in contracts:
|
||||
self.contracts[contract.name] = contract
|
||||
|
||||
def tested_methods_from_spidercls(self, spidercls: Type[Spider]) -> List[str]:
|
||||
def tested_methods_from_spidercls(self, spidercls: type[Spider]) -> list[str]:
|
||||
is_method = re.compile(r"^\s*@", re.MULTILINE).search
|
||||
methods = []
|
||||
for key, value in getmembers(spidercls):
|
||||
@ -118,8 +109,8 @@ class ContractsManager:
|
||||
|
||||
return methods
|
||||
|
||||
def extract_contracts(self, method: Callable) -> List[Contract]:
|
||||
contracts: List[Contract] = []
|
||||
def extract_contracts(self, method: Callable) -> list[Contract]:
|
||||
contracts: list[Contract] = []
|
||||
assert method.__doc__ is not None
|
||||
for line in method.__doc__.split("\n"):
|
||||
line = line.strip()
|
||||
@ -137,8 +128,8 @@ class ContractsManager:
|
||||
|
||||
def from_spider(
|
||||
self, spider: Spider, results: TestResult
|
||||
) -> List[Optional[Request]]:
|
||||
requests: List[Optional[Request]] = []
|
||||
) -> list[Optional[Request]]:
|
||||
requests: list[Optional[Request]] = []
|
||||
for method in self.tested_methods_from_spidercls(type(spider)):
|
||||
bound_method = spider.__getattribute__(method)
|
||||
try:
|
||||
|
@ -1,5 +1,5 @@
|
||||
import json
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
from itemadapter import ItemAdapter, is_item
|
||||
|
||||
@ -16,7 +16,7 @@ class UrlContract(Contract):
|
||||
|
||||
name = "url"
|
||||
|
||||
def adjust_request_args(self, args: Dict[str, Any]) -> Dict[str, Any]:
|
||||
def adjust_request_args(self, args: dict[str, Any]) -> dict[str, Any]:
|
||||
args["url"] = self.args[0]
|
||||
return args
|
||||
|
||||
@ -30,7 +30,7 @@ class CallbackKeywordArgumentsContract(Contract):
|
||||
|
||||
name = "cb_kwargs"
|
||||
|
||||
def adjust_request_args(self, args: Dict[str, Any]) -> Dict[str, Any]:
|
||||
def adjust_request_args(self, args: dict[str, Any]) -> dict[str, Any]:
|
||||
args["cb_kwargs"] = json.loads(" ".join(self.args))
|
||||
return args
|
||||
|
||||
@ -44,7 +44,7 @@ class MetadataContract(Contract):
|
||||
|
||||
name = "meta"
|
||||
|
||||
def adjust_request_args(self, args: Dict[str, Any]) -> Dict[str, Any]:
|
||||
def adjust_request_args(self, args: dict[str, Any]) -> dict[str, Any]:
|
||||
args["meta"] = json.loads(" ".join(self.args))
|
||||
return args
|
||||
|
||||
@ -63,7 +63,7 @@ class ReturnsContract(Contract):
|
||||
"""
|
||||
|
||||
name = "returns"
|
||||
object_type_verifiers: Dict[Optional[str], Callable[[Any], bool]] = {
|
||||
object_type_verifiers: dict[Optional[str], Callable[[Any], bool]] = {
|
||||
"request": lambda x: isinstance(x, Request),
|
||||
"requests": lambda x: isinstance(x, Request),
|
||||
"item": is_item,
|
||||
@ -90,7 +90,7 @@ class ReturnsContract(Contract):
|
||||
except IndexError:
|
||||
self.max_bound = float("inf")
|
||||
|
||||
def post_process(self, output: List[Any]) -> None:
|
||||
def post_process(self, output: list[Any]) -> None:
|
||||
occurrences = 0
|
||||
for x in output:
|
||||
if self.obj_type_verifier(x):
|
||||
@ -116,7 +116,7 @@ class ScrapesContract(Contract):
|
||||
|
||||
name = "scrapes"
|
||||
|
||||
def post_process(self, output: List[Any]) -> None:
|
||||
def post_process(self, output: list[Any]) -> None:
|
||||
for x in output:
|
||||
if is_item(x):
|
||||
missing = [arg for arg in self.args if arg not in ItemAdapter(x)]
|
||||
|
@ -5,18 +5,7 @@ import warnings
|
||||
from collections import deque
|
||||
from datetime import datetime
|
||||
from time import time
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Deque,
|
||||
Dict,
|
||||
Optional,
|
||||
Set,
|
||||
Tuple,
|
||||
TypeVar,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
|
||||
|
||||
from twisted.internet import task
|
||||
from twisted.internet.defer import Deferred
|
||||
@ -55,9 +44,9 @@ class Slot:
|
||||
self.randomize_delay: bool = randomize_delay
|
||||
self.throttle = throttle
|
||||
|
||||
self.active: Set[Request] = set()
|
||||
self.queue: Deque[Tuple[Request, Deferred[Response]]] = deque()
|
||||
self.transferring: Set[Request] = set()
|
||||
self.active: set[Request] = set()
|
||||
self.queue: deque[tuple[Request, Deferred[Response]]] = deque()
|
||||
self.transferring: set[Request] = set()
|
||||
self.lastseen: float = 0
|
||||
self.latercall = None
|
||||
|
||||
@ -95,7 +84,7 @@ class Slot:
|
||||
|
||||
def _get_concurrency_delay(
|
||||
concurrency: int, spider: Spider, settings: BaseSettings
|
||||
) -> Tuple[int, float]:
|
||||
) -> tuple[int, float]:
|
||||
delay: float = settings.getfloat("DOWNLOAD_DELAY")
|
||||
if hasattr(spider, "download_delay"):
|
||||
delay = spider.download_delay
|
||||
@ -112,8 +101,8 @@ class Downloader:
|
||||
def __init__(self, crawler: Crawler):
|
||||
self.settings: BaseSettings = crawler.settings
|
||||
self.signals: SignalManager = crawler.signals
|
||||
self.slots: Dict[str, Slot] = {}
|
||||
self.active: Set[Request] = set()
|
||||
self.slots: dict[str, Slot] = {}
|
||||
self.active: set[Request] = set()
|
||||
self.handlers: DownloadHandlers = DownloadHandlers(crawler)
|
||||
self.total_concurrency: int = self.settings.getint("CONCURRENT_REQUESTS")
|
||||
self.domain_concurrency: int = self.settings.getint(
|
||||
@ -126,7 +115,7 @@ class Downloader:
|
||||
)
|
||||
self._slot_gc_loop: task.LoopingCall = task.LoopingCall(self._slot_gc)
|
||||
self._slot_gc_loop.start(60)
|
||||
self.per_slot_settings: Dict[str, Dict[str, Any]] = self.settings.getdict(
|
||||
self.per_slot_settings: dict[str, dict[str, Any]] = self.settings.getdict(
|
||||
"DOWNLOAD_SLOTS", {}
|
||||
)
|
||||
|
||||
@ -146,7 +135,7 @@ class Downloader:
|
||||
def needs_backout(self) -> bool:
|
||||
return len(self.active) >= self.total_concurrency
|
||||
|
||||
def _get_slot(self, request: Request, spider: Spider) -> Tuple[str, Slot]:
|
||||
def _get_slot(self, request: Request, spider: Spider) -> tuple[str, Slot]:
|
||||
key = self.get_slot_key(request)
|
||||
if key not in self.slots:
|
||||
slot_settings = self.per_slot_settings.get(key, {})
|
||||
|
@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import warnings
|
||||
from typing import TYPE_CHECKING, Any, List, Optional
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
|
||||
from OpenSSL import SSL
|
||||
from twisted.internet._sslverify import _setAcceptableProtocols
|
||||
@ -154,10 +154,10 @@ class AcceptableProtocolsContextFactory:
|
||||
negotiation.
|
||||
"""
|
||||
|
||||
def __init__(self, context_factory: Any, acceptable_protocols: List[bytes]):
|
||||
def __init__(self, context_factory: Any, acceptable_protocols: list[bytes]):
|
||||
verifyObject(IPolicyForHTTPS, context_factory)
|
||||
self._wrapped_context_factory: Any = context_factory
|
||||
self._acceptable_protocols: List[bytes] = acceptable_protocols
|
||||
self._acceptable_protocols: list[bytes] = acceptable_protocols
|
||||
|
||||
def creatorForNetloc(self, hostname: bytes, port: int) -> ClientTLSOptions:
|
||||
options: ClientTLSOptions = self._wrapped_context_factory.creatorForNetloc(
|
||||
|
@ -3,18 +3,8 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
Generator,
|
||||
Optional,
|
||||
Protocol,
|
||||
Type,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
from collections.abc import Callable
|
||||
from typing import TYPE_CHECKING, Any, Optional, Protocol, Union, cast
|
||||
|
||||
from twisted.internet import defer
|
||||
|
||||
@ -25,6 +15,8 @@ from scrapy.utils.misc import build_from_crawler, load_object
|
||||
from scrapy.utils.python import without_none_values
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Generator
|
||||
|
||||
from twisted.internet.defer import Deferred
|
||||
|
||||
from scrapy.crawler import Crawler
|
||||
@ -43,16 +35,16 @@ class DownloadHandlerProtocol(Protocol):
|
||||
class DownloadHandlers:
|
||||
def __init__(self, crawler: Crawler):
|
||||
self._crawler: Crawler = crawler
|
||||
self._schemes: Dict[str, Union[str, Callable[..., Any]]] = (
|
||||
self._schemes: dict[str, Union[str, Callable[..., Any]]] = (
|
||||
{}
|
||||
) # stores acceptable schemes on instancing
|
||||
self._handlers: Dict[str, DownloadHandlerProtocol] = (
|
||||
self._handlers: dict[str, DownloadHandlerProtocol] = (
|
||||
{}
|
||||
) # stores instanced handlers for schemes
|
||||
self._notconfigured: Dict[str, str] = {} # remembers failed handlers
|
||||
handlers: Dict[str, Union[str, Callable[..., Any]]] = without_none_values(
|
||||
self._notconfigured: dict[str, str] = {} # remembers failed handlers
|
||||
handlers: dict[str, Union[str, Callable[..., Any]]] = without_none_values(
|
||||
cast(
|
||||
Dict[str, Union[str, Callable[..., Any]]],
|
||||
dict[str, Union[str, Callable[..., Any]]],
|
||||
crawler.settings.getwithbase("DOWNLOAD_HANDLERS"),
|
||||
)
|
||||
)
|
||||
@ -81,7 +73,7 @@ class DownloadHandlers:
|
||||
) -> Optional[DownloadHandlerProtocol]:
|
||||
path = self._schemes[scheme]
|
||||
try:
|
||||
dhcls: Type[DownloadHandlerProtocol] = load_object(path)
|
||||
dhcls: type[DownloadHandlerProtocol] = load_object(path)
|
||||
if skip_lazy and getattr(dhcls, "lazy", True):
|
||||
return None
|
||||
dh = build_from_crawler(
|
||||
|
@ -1,6 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Any, Dict
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from w3lib.url import parse_data_uri
|
||||
|
||||
@ -20,7 +20,7 @@ class DataURIDownloadHandler:
|
||||
uri = parse_data_uri(request.url)
|
||||
respcls = responsetypes.from_mimetype(uri.media_type)
|
||||
|
||||
resp_kwargs: Dict[str, Any] = {}
|
||||
resp_kwargs: dict[str, Any] = {}
|
||||
if issubclass(respcls, TextResponse) and uri.media_type.split("/")[0] == "text":
|
||||
charset = uri.media_type_parameters.get("charset")
|
||||
resp_kwargs["encoding"] = charset
|
||||
|
@ -32,7 +32,7 @@ from __future__ import annotations
|
||||
|
||||
import re
|
||||
from io import BytesIO
|
||||
from typing import TYPE_CHECKING, Any, BinaryIO, Dict, Optional
|
||||
from typing import TYPE_CHECKING, Any, BinaryIO, Optional
|
||||
from urllib.parse import unquote
|
||||
|
||||
from twisted.internet.protocol import ClientCreator, Protocol
|
||||
@ -79,7 +79,7 @@ _CODE_RE = re.compile(r"\d+")
|
||||
class FTPDownloadHandler:
|
||||
lazy = False
|
||||
|
||||
CODE_MAPPING: Dict[str, int] = {
|
||||
CODE_MAPPING: dict[str, int] = {
|
||||
"550": 404,
|
||||
"default": 503,
|
||||
}
|
||||
|
@ -1,9 +1,8 @@
|
||||
"""Download handlers for http and https schemes
|
||||
"""
|
||||
"""Download handlers for http and https schemes"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Type
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from scrapy.utils.misc import build_from_crawler, load_object
|
||||
from scrapy.utils.python import to_unicode
|
||||
@ -27,10 +26,10 @@ class HTTP10DownloadHandler:
|
||||
lazy = False
|
||||
|
||||
def __init__(self, settings: BaseSettings, crawler: Crawler):
|
||||
self.HTTPClientFactory: Type[ScrapyHTTPClientFactory] = load_object(
|
||||
self.HTTPClientFactory: type[ScrapyHTTPClientFactory] = load_object(
|
||||
settings["DOWNLOADER_HTTPCLIENTFACTORY"]
|
||||
)
|
||||
self.ClientContextFactory: Type[ScrapyClientContextFactory] = load_object(
|
||||
self.ClientContextFactory: type[ScrapyClientContextFactory] = load_object(
|
||||
settings["DOWNLOADER_CLIENTCONTEXTFACTORY"]
|
||||
)
|
||||
self._settings: BaseSettings = settings
|
||||
|
@ -8,7 +8,7 @@ import re
|
||||
from contextlib import suppress
|
||||
from io import BytesIO
|
||||
from time import time
|
||||
from typing import TYPE_CHECKING, Any, List, Optional, Tuple, TypedDict, TypeVar, Union
|
||||
from typing import TYPE_CHECKING, Any, Optional, TypedDict, TypeVar, Union
|
||||
from urllib.parse import urldefrag, urlunparse
|
||||
|
||||
from twisted.internet import ssl
|
||||
@ -52,7 +52,7 @@ _T = TypeVar("_T")
|
||||
class _ResultT(TypedDict):
|
||||
txresponse: TxResponse
|
||||
body: bytes
|
||||
flags: Optional[List[str]]
|
||||
flags: Optional[list[str]]
|
||||
certificate: Optional[ssl.Certificate]
|
||||
ip_address: Union[ipaddress.IPv4Address, ipaddress.IPv6Address, None]
|
||||
failure: NotRequired[Optional[Failure]]
|
||||
@ -143,10 +143,10 @@ class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
|
||||
reactor: ReactorBase,
|
||||
host: str,
|
||||
port: int,
|
||||
proxyConf: Tuple[str, int, Optional[bytes]],
|
||||
proxyConf: tuple[str, int, Optional[bytes]],
|
||||
contextFactory: IPolicyForHTTPS,
|
||||
timeout: float = 30,
|
||||
bindAddress: Optional[Tuple[str, int]] = None,
|
||||
bindAddress: Optional[tuple[str, int]] = None,
|
||||
):
|
||||
proxyHost, proxyPort, self._proxyAuthHeader = proxyConf
|
||||
super().__init__(reactor, proxyHost, proxyPort, timeout, bindAddress)
|
||||
@ -254,14 +254,14 @@ class TunnelingAgent(Agent):
|
||||
self,
|
||||
*,
|
||||
reactor: ReactorBase,
|
||||
proxyConf: Tuple[str, int, Optional[bytes]],
|
||||
proxyConf: tuple[str, int, Optional[bytes]],
|
||||
contextFactory: IPolicyForHTTPS,
|
||||
connectTimeout: Optional[float] = None,
|
||||
bindAddress: Optional[bytes] = None,
|
||||
pool: Optional[HTTPConnectionPool] = None,
|
||||
):
|
||||
super().__init__(reactor, contextFactory, connectTimeout, bindAddress, pool)
|
||||
self._proxyConf: Tuple[str, int, Optional[bytes]] = proxyConf
|
||||
self._proxyConf: tuple[str, int, Optional[bytes]] = proxyConf
|
||||
self._contextFactory: IPolicyForHTTPS = contextFactory
|
||||
|
||||
def _getEndpoint(self, uri: URI) -> TunnelingTCP4ClientEndpoint:
|
||||
@ -621,7 +621,7 @@ class _ResponseReader(Protocol):
|
||||
self._crawler: Crawler = crawler
|
||||
|
||||
def _finish_response(
|
||||
self, flags: Optional[List[str]] = None, failure: Optional[Failure] = None
|
||||
self, flags: Optional[list[str]] = None, failure: Optional[Failure] = None
|
||||
) -> None:
|
||||
self._finished.callback(
|
||||
{
|
||||
|
@ -1,6 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Any, Optional, Type
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
|
||||
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
|
||||
from scrapy.exceptions import NotConfigured
|
||||
@ -29,7 +29,7 @@ class S3DownloadHandler:
|
||||
aws_access_key_id: Optional[str] = None,
|
||||
aws_secret_access_key: Optional[str] = None,
|
||||
aws_session_token: Optional[str] = None,
|
||||
httpdownloadhandler: Type[HTTPDownloadHandler] = HTTPDownloadHandler,
|
||||
httpdownloadhandler: type[HTTPDownloadHandler] = HTTPDownloadHandler,
|
||||
**kw: Any,
|
||||
):
|
||||
if not is_botocore_available():
|
||||
|
@ -6,7 +6,8 @@ See documentation in docs/topics/downloader-middleware.rst
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Any, Callable, Generator, List, Union, cast
|
||||
from collections.abc import Callable
|
||||
from typing import TYPE_CHECKING, Any, Union, cast
|
||||
|
||||
from twisted.internet.defer import Deferred, inlineCallbacks
|
||||
|
||||
@ -17,6 +18,8 @@ from scrapy.utils.conf import build_component_list
|
||||
from scrapy.utils.defer import deferred_from_coro, mustbe_deferred
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Generator
|
||||
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from scrapy import Spider
|
||||
@ -27,7 +30,7 @@ class DownloaderMiddlewareManager(MiddlewareManager):
|
||||
component_name = "downloader middleware"
|
||||
|
||||
@classmethod
|
||||
def _get_mwlist_from_settings(cls, settings: BaseSettings) -> List[Any]:
|
||||
def _get_mwlist_from_settings(cls, settings: BaseSettings) -> list[Any]:
|
||||
return build_component_list(settings.getwithbase("DOWNLOADER_MIDDLEWARES"))
|
||||
|
||||
def _add_middleware(self, mw: Any) -> None:
|
||||
|
@ -1,5 +1,5 @@
|
||||
import logging
|
||||
from typing import Any, Dict
|
||||
from typing import Any
|
||||
|
||||
from OpenSSL import SSL
|
||||
from service_identity.exceptions import CertificateError
|
||||
@ -21,7 +21,7 @@ METHOD_TLSv11 = "TLSv1.1"
|
||||
METHOD_TLSv12 = "TLSv1.2"
|
||||
|
||||
|
||||
openssl_methods: Dict[str, int] = {
|
||||
openssl_methods: dict[str, int] = {
|
||||
METHOD_TLS: SSL.SSLv23_METHOD, # protocol negotiation (recommended)
|
||||
METHOD_TLSv10: SSL.TLSv1_METHOD, # TLS 1.0 only
|
||||
METHOD_TLSv11: SSL.TLSv1_1_METHOD, # TLS 1.1 only
|
||||
|
@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import re
|
||||
from time import time
|
||||
from typing import TYPE_CHECKING, Optional, Tuple
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse
|
||||
|
||||
from twisted.internet import defer
|
||||
@ -18,7 +18,7 @@ if TYPE_CHECKING:
|
||||
from scrapy import Request
|
||||
|
||||
|
||||
def _parsed_url_args(parsed: ParseResult) -> Tuple[bytes, bytes, bytes, int, bytes]:
|
||||
def _parsed_url_args(parsed: ParseResult) -> tuple[bytes, bytes, bytes, int, bytes]:
|
||||
# Assume parsed is urlparse-d from Request.url,
|
||||
# which was passed via safe_url_string and is ascii-only.
|
||||
path_str = urlunparse(("", "", parsed.path or "/", parsed.params, parsed.query, ""))
|
||||
@ -33,7 +33,7 @@ def _parsed_url_args(parsed: ParseResult) -> Tuple[bytes, bytes, bytes, int, byt
|
||||
return scheme, netloc, host, port, path
|
||||
|
||||
|
||||
def _parse(url: str) -> Tuple[bytes, bytes, bytes, int, bytes]:
|
||||
def _parse(url: str) -> tuple[bytes, bytes, bytes, int, bytes]:
|
||||
"""Return tuple of (scheme, netloc, host, port, path),
|
||||
all in bytes except for port which is int.
|
||||
Assume url is from Request.url, which was passed via safe_url_string
|
||||
|
@ -9,20 +9,7 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from time import time
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
Generator,
|
||||
Iterable,
|
||||
Iterator,
|
||||
Optional,
|
||||
Set,
|
||||
Type,
|
||||
TypeVar,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
|
||||
|
||||
from itemadapter import is_item
|
||||
from twisted.internet.defer import Deferred, inlineCallbacks, succeed
|
||||
@ -42,6 +29,8 @@ from scrapy.utils.misc import build_from_crawler, load_object
|
||||
from scrapy.utils.reactor import CallLaterOnce
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable, Generator, Iterable, Iterator
|
||||
|
||||
from scrapy.core.scheduler import BaseScheduler
|
||||
from scrapy.core.scraper import _HandleOutputDeferred
|
||||
from scrapy.crawler import Crawler
|
||||
@ -63,7 +52,7 @@ class Slot:
|
||||
scheduler: BaseScheduler,
|
||||
) -> None:
|
||||
self.closing: Optional[Deferred[None]] = None
|
||||
self.inprogress: Set[Request] = set()
|
||||
self.inprogress: set[Request] = set()
|
||||
self.start_requests: Optional[Iterator[Request]] = iter(start_requests)
|
||||
self.close_if_idle: bool = close_if_idle
|
||||
self.nextcall: CallLaterOnce[None] = nextcall
|
||||
@ -106,10 +95,10 @@ class ExecutionEngine:
|
||||
self.spider: Optional[Spider] = None
|
||||
self.running: bool = False
|
||||
self.paused: bool = False
|
||||
self.scheduler_cls: Type[BaseScheduler] = self._get_scheduler_class(
|
||||
self.scheduler_cls: type[BaseScheduler] = self._get_scheduler_class(
|
||||
crawler.settings
|
||||
)
|
||||
downloader_cls: Type[Downloader] = load_object(self.settings["DOWNLOADER"])
|
||||
downloader_cls: type[Downloader] = load_object(self.settings["DOWNLOADER"])
|
||||
self.downloader: Downloader = downloader_cls(crawler)
|
||||
self.scraper = Scraper(crawler)
|
||||
self._spider_closed_callback: Callable[[Spider], Optional[Deferred[None]]] = (
|
||||
@ -117,10 +106,10 @@ class ExecutionEngine:
|
||||
)
|
||||
self.start_time: Optional[float] = None
|
||||
|
||||
def _get_scheduler_class(self, settings: BaseSettings) -> Type[BaseScheduler]:
|
||||
def _get_scheduler_class(self, settings: BaseSettings) -> type[BaseScheduler]:
|
||||
from scrapy.core.scheduler import BaseScheduler
|
||||
|
||||
scheduler_cls: Type[BaseScheduler] = load_object(settings["SCHEDULER"])
|
||||
scheduler_cls: type[BaseScheduler] = load_object(settings["SCHEDULER"])
|
||||
if not issubclass(scheduler_cls, BaseScheduler):
|
||||
raise TypeError(
|
||||
f"The provided scheduler class ({settings['SCHEDULER']})"
|
||||
|
@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import deque
|
||||
from typing import TYPE_CHECKING, Deque, Dict, List, Optional, Tuple
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
from twisted.internet import defer
|
||||
from twisted.internet.defer import Deferred
|
||||
@ -26,7 +26,7 @@ if TYPE_CHECKING:
|
||||
from scrapy.spiders import Spider
|
||||
|
||||
|
||||
ConnectionKeyT = Tuple[bytes, bytes, int]
|
||||
ConnectionKeyT = tuple[bytes, bytes, int]
|
||||
|
||||
|
||||
class H2ConnectionPool:
|
||||
@ -36,11 +36,11 @@ class H2ConnectionPool:
|
||||
|
||||
# Store a dictionary which is used to get the respective
|
||||
# H2ClientProtocolInstance using the key as Tuple(scheme, hostname, port)
|
||||
self._connections: Dict[ConnectionKeyT, H2ClientProtocol] = {}
|
||||
self._connections: dict[ConnectionKeyT, H2ClientProtocol] = {}
|
||||
|
||||
# Save all requests that arrive before the connection is established
|
||||
self._pending_requests: Dict[
|
||||
ConnectionKeyT, Deque[Deferred[H2ClientProtocol]]
|
||||
self._pending_requests: dict[
|
||||
ConnectionKeyT, deque[Deferred[H2ClientProtocol]]
|
||||
] = {}
|
||||
|
||||
def get_connection(
|
||||
@ -68,7 +68,7 @@ class H2ConnectionPool:
|
||||
) -> Deferred[H2ClientProtocol]:
|
||||
self._pending_requests[key] = deque()
|
||||
|
||||
conn_lost_deferred: Deferred[List[BaseException]] = Deferred()
|
||||
conn_lost_deferred: Deferred[list[BaseException]] = Deferred()
|
||||
conn_lost_deferred.addCallback(self._remove_connection, key)
|
||||
|
||||
factory = H2ClientFactory(uri, self.settings, conn_lost_deferred)
|
||||
@ -94,7 +94,7 @@ class H2ConnectionPool:
|
||||
return conn
|
||||
|
||||
def _remove_connection(
|
||||
self, errors: List[BaseException], key: ConnectionKeyT
|
||||
self, errors: list[BaseException], key: ConnectionKeyT
|
||||
) -> None:
|
||||
self._connections.pop(key)
|
||||
|
||||
|
@ -4,7 +4,7 @@ import ipaddress
|
||||
import itertools
|
||||
import logging
|
||||
from collections import deque
|
||||
from typing import TYPE_CHECKING, Any, Deque, Dict, List, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
|
||||
from h2.config import H2Configuration
|
||||
from h2.connection import H2Connection
|
||||
@ -91,7 +91,7 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
self,
|
||||
uri: URI,
|
||||
settings: Settings,
|
||||
conn_lost_deferred: Deferred[List[BaseException]],
|
||||
conn_lost_deferred: Deferred[list[BaseException]],
|
||||
) -> None:
|
||||
"""
|
||||
Arguments:
|
||||
@ -102,7 +102,7 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
conn_lost_deferred -- Deferred fires with the reason: Failure to notify
|
||||
that connection was lost
|
||||
"""
|
||||
self._conn_lost_deferred: Deferred[List[BaseException]] = conn_lost_deferred
|
||||
self._conn_lost_deferred: Deferred[list[BaseException]] = conn_lost_deferred
|
||||
|
||||
config = H2Configuration(client_side=True, header_encoding="utf-8")
|
||||
self.conn = H2Connection(config=config)
|
||||
@ -113,19 +113,19 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
self._stream_id_generator = itertools.count(start=1, step=2)
|
||||
|
||||
# Streams are stored in a dictionary keyed off their stream IDs
|
||||
self.streams: Dict[int, Stream] = {}
|
||||
self.streams: dict[int, Stream] = {}
|
||||
|
||||
# If requests are received before connection is made we keep
|
||||
# all requests in a pool and send them as the connection is made
|
||||
self._pending_request_stream_pool: Deque[Stream] = deque()
|
||||
self._pending_request_stream_pool: deque[Stream] = deque()
|
||||
|
||||
# Save an instance of errors raised which lead to losing the connection
|
||||
# We pass these instances to the streams ResponseFailed() failure
|
||||
self._conn_lost_errors: List[BaseException] = []
|
||||
self._conn_lost_errors: list[BaseException] = []
|
||||
|
||||
# Some meta data of this connection
|
||||
# initialized when connection is successfully made
|
||||
self.metadata: Dict[str, Any] = {
|
||||
self.metadata: dict[str, Any] = {
|
||||
# Peer certificate instance
|
||||
"certificate": None,
|
||||
# Address of the server we are connected to which
|
||||
@ -250,7 +250,7 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
self.conn.initiate_connection()
|
||||
self._write_to_transport()
|
||||
|
||||
def _lose_connection_with_error(self, errors: List[BaseException]) -> None:
|
||||
def _lose_connection_with_error(self, errors: list[BaseException]) -> None:
|
||||
"""Helper function to lose the connection with the error sent as a
|
||||
reason"""
|
||||
self._conn_lost_errors += errors
|
||||
@ -353,7 +353,7 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
self._pending_request_stream_pool.clear()
|
||||
self.conn.close_connection()
|
||||
|
||||
def _handle_events(self, events: List[Event]) -> None:
|
||||
def _handle_events(self, events: list[Event]) -> None:
|
||||
"""Private method which acts as a bridge between the events
|
||||
received from the HTTP/2 data and IH2EventsHandler
|
||||
|
||||
@ -442,7 +442,7 @@ class H2ClientFactory(Factory):
|
||||
self,
|
||||
uri: URI,
|
||||
settings: Settings,
|
||||
conn_lost_deferred: Deferred[List[BaseException]],
|
||||
conn_lost_deferred: Deferred[list[BaseException]],
|
||||
) -> None:
|
||||
self.uri = uri
|
||||
self.settings = settings
|
||||
@ -451,5 +451,5 @@ class H2ClientFactory(Factory):
|
||||
def buildProtocol(self, addr: IAddress) -> H2ClientProtocol:
|
||||
return H2ClientProtocol(self.uri, self.settings, self.conn_lost_deferred)
|
||||
|
||||
def acceptableProtocols(self) -> List[bytes]:
|
||||
def acceptableProtocols(self) -> list[bytes]:
|
||||
return [PROTOCOL_NAME]
|
||||
|
@ -3,7 +3,7 @@ from __future__ import annotations
|
||||
import logging
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
|
||||
from h2.errors import ErrorCodes
|
||||
from h2.exceptions import H2Error, ProtocolError, StreamClosedError
|
||||
@ -113,7 +113,7 @@ class Stream:
|
||||
|
||||
# Metadata of an HTTP/2 connection stream
|
||||
# initialized when stream is instantiated
|
||||
self.metadata: Dict[str, Any] = {
|
||||
self.metadata: dict[str, Any] = {
|
||||
"request_content_length": (
|
||||
0 if self._request.body is None else len(self._request.body)
|
||||
),
|
||||
@ -134,7 +134,7 @@ class Stream:
|
||||
# Private variable used to build the response
|
||||
# this response is then converted to appropriate Response class
|
||||
# passed to the response deferred callback
|
||||
self._response: Dict[str, Any] = {
|
||||
self._response: dict[str, Any] = {
|
||||
# Data received frame by frame from the server is appended
|
||||
# and passed to the response Deferred when completely received.
|
||||
"body": BytesIO(),
|
||||
@ -196,7 +196,7 @@ class Stream:
|
||||
== f'{self._protocol.metadata["ip_address"]}:{self._protocol.metadata["uri"].port}'
|
||||
)
|
||||
|
||||
def _get_request_headers(self) -> List[Tuple[str, str]]:
|
||||
def _get_request_headers(self) -> list[tuple[str, str]]:
|
||||
url = urlparse_cached(self._request)
|
||||
|
||||
path = url.path
|
||||
@ -349,7 +349,7 @@ class Stream:
|
||||
self._response["flow_controlled_size"], self.stream_id
|
||||
)
|
||||
|
||||
def receive_headers(self, headers: List[HeaderTuple]) -> None:
|
||||
def receive_headers(self, headers: list[HeaderTuple]) -> None:
|
||||
for name, value in headers:
|
||||
self._response["headers"].appendlist(name, value)
|
||||
|
||||
@ -382,7 +382,7 @@ class Stream:
|
||||
def close(
|
||||
self,
|
||||
reason: StreamCloseReason,
|
||||
errors: Optional[List[BaseException]] = None,
|
||||
errors: Optional[list[BaseException]] = None,
|
||||
from_protocol: bool = False,
|
||||
) -> None:
|
||||
"""Based on the reason sent we will handle each case."""
|
||||
|
@ -4,7 +4,7 @@ import json
import logging
from abc import abstractmethod
from pathlib import Path
from typing import TYPE_CHECKING, Any, List, Optional, Type, cast
from typing import TYPE_CHECKING, Any, Optional, cast

# working around https://github.com/sphinx-doc/sphinx/issues/10400
from twisted.internet.defer import Deferred  # noqa: TC002
@ -182,18 +182,18 @@ class Scheduler(BaseScheduler):
self,
dupefilter: BaseDupeFilter,
jobdir: Optional[str] = None,
dqclass: Optional[Type[BaseQueue]] = None,
mqclass: Optional[Type[BaseQueue]] = None,
dqclass: Optional[type[BaseQueue]] = None,
mqclass: Optional[type[BaseQueue]] = None,
logunser: bool = False,
stats: Optional[StatsCollector] = None,
pqclass: Optional[Type[ScrapyPriorityQueue]] = None,
pqclass: Optional[type[ScrapyPriorityQueue]] = None,
crawler: Optional[Crawler] = None,
):
self.df: BaseDupeFilter = dupefilter
self.dqdir: Optional[str] = self._dqdir(jobdir)
self.pqclass: Optional[Type[ScrapyPriorityQueue]] = pqclass
self.dqclass: Optional[Type[BaseQueue]] = dqclass
self.mqclass: Optional[Type[BaseQueue]] = mqclass
self.pqclass: Optional[type[ScrapyPriorityQueue]] = pqclass
self.dqclass: Optional[type[BaseQueue]] = dqclass
self.mqclass: Optional[type[BaseQueue]] = mqclass
self.logunser: bool = logunser
self.stats: Optional[StatsCollector] = stats
self.crawler: Optional[Crawler] = crawler
@ -364,13 +364,13 @@ class Scheduler(BaseScheduler):
return str(dqdir)
return None

def _read_dqs_state(self, dqdir: str) -> List[int]:
def _read_dqs_state(self, dqdir: str) -> list[int]:
path = Path(dqdir, "active.json")
if not path.exists():
return []
with path.open(encoding="utf-8") as f:
return cast(List[int], json.load(f))
return cast(list[int], json.load(f))

def _write_dqs_state(self, dqdir: str, state: List[int]) -> None:
def _write_dqs_state(self, dqdir: str, state: list[int]) -> None:
with Path(dqdir, "active.json").open("w", encoding="utf-8") as f:
json.dump(state, f)
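
In the scheduler hunks above, typing.Type becomes the built-in type[...] while Optional is left untouched; throughout the commit, container and class annotations are modernized but Optional/Union spellings stay as they are. A hedged sketch of the same shape, with a made-up queue class rather than Scrapy's own:

from __future__ import annotations

from typing import Optional


class MemoryQueueDemo:
    # Hypothetical stand-in for a queue class, for illustration only.
    def __init__(self) -> None:
        self.items: list[int] = []


def build_queue(qclass: Optional[type[MemoryQueueDemo]] = None) -> MemoryQueueDemo:
    # type[...] (PEP 585) replaces typing.Type for "a class object" annotations.
    return (qclass or MemoryQueueDemo)()


print(type(build_queue()).__name__)
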
@ -5,23 +5,8 @@ from __future__ import annotations

import logging
from collections import deque
from typing import (
TYPE_CHECKING,
Any,
AsyncIterable,
Deque,
Generator,
Iterable,
Iterator,
List,
Optional,
Set,
Tuple,
Type,
TypeVar,
Union,
cast,
)
from collections.abc import AsyncIterable, Iterator
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast

from itemadapter import is_item
from twisted.internet.defer import Deferred, inlineCallbacks
@ -47,6 +32,8 @@ from scrapy.utils.misc import load_object, warn_on_generator_with_return_value
from scrapy.utils.spider import iterate_spider_output

if TYPE_CHECKING:
from collections.abc import Generator, Iterable

from scrapy.crawler import Crawler

@ -54,12 +41,12 @@ logger = logging.getLogger(__name__)

_T = TypeVar("_T")
_ParallelResult = List[Tuple[bool, Iterator[Any]]]
_ParallelResult = list[tuple[bool, Iterator[Any]]]

if TYPE_CHECKING:
# parameterized Deferreds require Twisted 21.7.0
_HandleOutputDeferred = Deferred[Union[_ParallelResult, None]]
QueueTuple = Tuple[Union[Response, Failure], Request, _HandleOutputDeferred]
QueueTuple = tuple[Union[Response, Failure], Request, _HandleOutputDeferred]


class Slot:
@ -69,8 +56,8 @@ class Slot:

def __init__(self, max_active_size: int = 5000000):
self.max_active_size = max_active_size
self.queue: Deque[QueueTuple] = deque()
self.active: Set[Request] = set()
self.queue: deque[QueueTuple] = deque()
self.active: set[Request] = set()
self.active_size: int = 0
self.itemproc_size: int = 0
self.closing: Optional[Deferred[Spider]] = None
@ -113,7 +100,7 @@ class Scraper:
self.spidermw: SpiderMiddlewareManager = SpiderMiddlewareManager.from_crawler(
crawler
)
itemproc_cls: Type[ItemPipelineManager] = load_object(
itemproc_cls: type[ItemPipelineManager] = load_object(
crawler.settings["ITEM_PROCESSOR"]
)
self.itemproc: ItemPipelineManager = itemproc_cls.from_crawler(crawler)
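
The shrunken typing import above shows the other half of the pattern: ABCs such as AsyncIterable and Iterator now come from collections.abc, and names needed only for annotations move under TYPE_CHECKING. A small sketch of that layout, with made-up functions (assumptions, not Scrapy APIs):

from __future__ import annotations

from collections.abc import Iterator
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # Needed only by the type checker; skipped at runtime thanks to the
    # postponed evaluation of annotations enabled above.
    from collections.abc import Iterable


def first_or_none(items: Iterator[Any]) -> Any:
    # Runtime use of an ABC imported from collections.abc.
    return next(items, None)


def as_sorted_list(items: Iterable[int]) -> list[int]:
    # "Iterable" appears only in the annotation, so the guarded import suffices.
    return sorted(items)


print(first_or_none(iter("abc")), as_sorted_list({3, 1, 2}))
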
@ -7,22 +7,10 @@ See documentation in docs/topics/spider-middleware.rst
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from collections.abc import AsyncIterable, Callable, Iterable
|
||||
from inspect import isasyncgenfunction, iscoroutine
|
||||
from itertools import islice
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
AsyncIterable,
|
||||
Callable,
|
||||
Generator,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
TypeVar,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
|
||||
|
||||
from twisted.internet.defer import Deferred, inlineCallbacks
|
||||
from twisted.python.failure import Failure
|
||||
@ -42,6 +30,8 @@ from scrapy.utils.defer import (
|
||||
from scrapy.utils.python import MutableAsyncChain, MutableChain
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Generator
|
||||
|
||||
from scrapy.settings import BaseSettings
|
||||
|
||||
|
||||
@ -66,7 +56,7 @@ class SpiderMiddlewareManager(MiddlewareManager):
|
||||
self.downgrade_warning_done = False
|
||||
|
||||
@classmethod
|
||||
def _get_mwlist_from_settings(cls, settings: BaseSettings) -> List[Any]:
|
||||
def _get_mwlist_from_settings(cls, settings: BaseSettings) -> list[Any]:
|
||||
return build_component_list(settings.getwithbase("SPIDER_MIDDLEWARES"))
|
||||
|
||||
def _add_middleware(self, mw: Any) -> None:
|
||||
@ -349,7 +339,7 @@ class SpiderMiddlewareManager(MiddlewareManager):
|
||||
@staticmethod
|
||||
def _get_async_method_pair(
|
||||
mw: Any, methodname: str
|
||||
) -> Union[None, Callable, Tuple[Callable, Callable]]:
|
||||
) -> Union[None, Callable, tuple[Callable, Callable]]:
|
||||
normal_method: Optional[Callable] = getattr(mw, methodname, None)
|
||||
methodname_async = methodname + "_async"
|
||||
async_method: Optional[Callable] = getattr(mw, methodname_async, None)
|
||||
|
@ -4,18 +4,7 @@ import logging
|
||||
import pprint
|
||||
import signal
|
||||
import warnings
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Dict,
|
||||
Generator,
|
||||
Optional,
|
||||
Set,
|
||||
Type,
|
||||
TypeVar,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
|
||||
|
||||
from twisted.internet.defer import (
|
||||
Deferred,
|
||||
@ -53,6 +42,8 @@ from scrapy.utils.reactor import (
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Generator
|
||||
|
||||
from scrapy.utils.request import RequestFingerprinter
|
||||
|
||||
|
||||
@ -64,8 +55,8 @@ _T = TypeVar("_T")
|
||||
class Crawler:
|
||||
def __init__(
|
||||
self,
|
||||
spidercls: Type[Spider],
|
||||
settings: Union[None, Dict[str, Any], Settings] = None,
|
||||
spidercls: type[Spider],
|
||||
settings: Union[None, dict[str, Any], Settings] = None,
|
||||
init_reactor: bool = False,
|
||||
):
|
||||
if isinstance(spidercls, Spider):
|
||||
@ -74,7 +65,7 @@ class Crawler:
|
||||
if isinstance(settings, dict) or settings is None:
|
||||
settings = Settings(settings)
|
||||
|
||||
self.spidercls: Type[Spider] = spidercls
|
||||
self.spidercls: type[Spider] = spidercls
|
||||
self.settings: Settings = settings.copy()
|
||||
self.spidercls.update_settings(self.settings)
|
||||
self._update_root_log_handler()
|
||||
@ -112,7 +103,7 @@ class Crawler:
|
||||
self.__remove_handler = lambda: logging.root.removeHandler(handler)
|
||||
self.signals.connect(self.__remove_handler, signals.engine_stopped)
|
||||
|
||||
lf_cls: Type[LogFormatter] = load_object(self.settings["LOG_FORMATTER"])
|
||||
lf_cls: type[LogFormatter] = load_object(self.settings["LOG_FORMATTER"])
|
||||
self.logformatter = lf_cls.from_crawler(self)
|
||||
|
||||
self.request_fingerprinter = build_from_crawler(
|
||||
@ -256,18 +247,18 @@ class CrawlerRunner:
|
||||
verifyClass(ISpiderLoader, loader_cls)
|
||||
return loader_cls.from_settings(settings.frozencopy())
|
||||
|
||||
def __init__(self, settings: Union[Dict[str, Any], Settings, None] = None):
|
||||
def __init__(self, settings: Union[dict[str, Any], Settings, None] = None):
|
||||
if isinstance(settings, dict) or settings is None:
|
||||
settings = Settings(settings)
|
||||
self.settings = settings
|
||||
self.spider_loader = self._get_spider_loader(settings)
|
||||
self._crawlers: Set[Crawler] = set()
|
||||
self._active: Set[Deferred[None]] = set()
|
||||
self._crawlers: set[Crawler] = set()
|
||||
self._active: set[Deferred[None]] = set()
|
||||
self.bootstrap_failed = False
|
||||
|
||||
def crawl(
|
||||
self,
|
||||
crawler_or_spidercls: Union[Type[Spider], str, Crawler],
|
||||
crawler_or_spidercls: Union[type[Spider], str, Crawler],
|
||||
*args: Any,
|
||||
**kwargs: Any,
|
||||
) -> Deferred[None]:
|
||||
@ -314,7 +305,7 @@ class CrawlerRunner:
|
||||
return d.addBoth(_done)
|
||||
|
||||
def create_crawler(
|
||||
self, crawler_or_spidercls: Union[Type[Spider], str, Crawler]
|
||||
self, crawler_or_spidercls: Union[type[Spider], str, Crawler]
|
||||
) -> Crawler:
|
||||
"""
|
||||
Return a :class:`~scrapy.crawler.Crawler` object.
|
||||
@ -335,11 +326,11 @@ class CrawlerRunner:
|
||||
return crawler_or_spidercls
|
||||
return self._create_crawler(crawler_or_spidercls)
|
||||
|
||||
def _create_crawler(self, spidercls: Union[str, Type[Spider]]) -> Crawler:
|
||||
def _create_crawler(self, spidercls: Union[str, type[Spider]]) -> Crawler:
|
||||
if isinstance(spidercls, str):
|
||||
spidercls = self.spider_loader.load(spidercls)
|
||||
# temporary cast until self.spider_loader is typed
|
||||
return Crawler(cast(Type[Spider], spidercls), self.settings)
|
||||
return Crawler(cast(type[Spider], spidercls), self.settings)
|
||||
|
||||
def stop(self) -> Deferred[Any]:
|
||||
"""
|
||||
@ -387,7 +378,7 @@ class CrawlerProcess(CrawlerRunner):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
settings: Union[Dict[str, Any], Settings, None] = None,
|
||||
settings: Union[dict[str, Any], Settings, None] = None,
|
||||
install_root_handler: bool = True,
|
||||
):
|
||||
super().__init__(settings)
|
||||
@ -416,14 +407,14 @@ class CrawlerProcess(CrawlerRunner):
|
||||
)
|
||||
reactor.callFromThread(self._stop_reactor)
|
||||
|
||||
def _create_crawler(self, spidercls: Union[Type[Spider], str]) -> Crawler:
|
||||
def _create_crawler(self, spidercls: Union[type[Spider], str]) -> Crawler:
|
||||
if isinstance(spidercls, str):
|
||||
spidercls = self.spider_loader.load(spidercls)
|
||||
init_reactor = not self._initialized_reactor
|
||||
self._initialized_reactor = True
|
||||
# temporary cast until self.spider_loader is typed
|
||||
return Crawler(
|
||||
cast(Type[Spider], spidercls), self.settings, init_reactor=init_reactor
|
||||
cast(type[Spider], spidercls), self.settings, init_reactor=init_reactor
|
||||
)
|
||||
|
||||
def start(
|
||||
|
@ -2,7 +2,7 @@ from __future__ import annotations

import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any, DefaultDict, Iterable, Optional, Sequence, Union
from typing import TYPE_CHECKING, Any, Optional, Union

from tldextract import TLDExtract

@ -13,6 +13,7 @@ from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_unicode

if TYPE_CHECKING:
from collections.abc import Iterable, Sequence
from http.cookiejar import Cookie

# typing.Self requires Python 3.11
@ -39,7 +40,7 @@ class CookiesMiddleware:
"""This middleware enables working with sites that need cookies"""

def __init__(self, debug: bool = False):
self.jars: DefaultDict[Any, CookieJar] = defaultdict(CookieJar)
self.jars: defaultdict[Any, CookieJar] = defaultdict(CookieJar)
self.debug: bool = debug

@classmethod
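
The cookies middleware hunk above also covers the concrete collection types: typing.DefaultDict gives way to subscripting collections.defaultdict itself, which Python 3.9 allows. A toy sketch (names are invented, not Scrapy's):

from __future__ import annotations

from collections import defaultdict


class JarDemo:
    # Hypothetical stand-in for a per-domain cookie jar.
    def __init__(self) -> None:
        self.cookies: list[str] = []


jars: defaultdict[str, JarDemo] = defaultdict(JarDemo)
jars["example.com"].cookies.append("session=abc")
print({domain: jar.cookies for domain, jar in jars.items()})
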
@ -6,11 +6,13 @@ See documentation in docs/topics/downloader-middleware.rst
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Iterable, Tuple, Union
|
||||
from typing import TYPE_CHECKING, Union
|
||||
|
||||
from scrapy.utils.python import without_none_values
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterable
|
||||
|
||||
# typing.Self requires Python 3.11
|
||||
from typing_extensions import Self
|
||||
|
||||
@ -20,8 +22,8 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
class DefaultHeadersMiddleware:
|
||||
def __init__(self, headers: Iterable[Tuple[str, str]]):
|
||||
self._headers: Iterable[Tuple[str, str]] = headers
|
||||
def __init__(self, headers: Iterable[tuple[str, str]]):
|
||||
self._headers: Iterable[tuple[str, str]] = headers
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler: Crawler) -> Self:
|
||||
|
@ -3,7 +3,7 @@ from __future__ import annotations
|
||||
import warnings
|
||||
from itertools import chain
|
||||
from logging import getLogger
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
|
||||
from scrapy import Request, Spider, signals
|
||||
from scrapy.exceptions import IgnoreRequest, NotConfigured
|
||||
@ -28,7 +28,7 @@ if TYPE_CHECKING:
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
ACCEPTED_ENCODINGS: List[bytes] = [b"gzip", b"deflate"]
|
||||
ACCEPTED_ENCODINGS: list[bytes] = [b"gzip", b"deflate"]
|
||||
|
||||
try:
|
||||
try:
|
||||
@ -50,7 +50,7 @@ else:
|
||||
|
||||
class HttpCompressionMiddleware:
|
||||
"""This middleware allows compressed (gzip, deflate) traffic to be
|
||||
sent/received from web sites"""
|
||||
sent/received from websites"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -140,7 +140,7 @@ class HttpCompressionMiddleware:
|
||||
respcls = responsetypes.from_args(
|
||||
headers=response.headers, url=response.url, body=decoded_body
|
||||
)
|
||||
kwargs: Dict[str, Any] = {"body": decoded_body}
|
||||
kwargs: dict[str, Any] = {"body": decoded_body}
|
||||
if issubclass(respcls, TextResponse):
|
||||
# force recalculating the encoding until we make sure the
|
||||
# responsetypes guessing is reliable
|
||||
@ -152,23 +152,23 @@ class HttpCompressionMiddleware:
|
||||
return response
|
||||
|
||||
def _handle_encoding(
|
||||
self, body: bytes, content_encoding: List[bytes], max_size: int
|
||||
) -> Tuple[bytes, List[bytes]]:
|
||||
self, body: bytes, content_encoding: list[bytes], max_size: int
|
||||
) -> tuple[bytes, list[bytes]]:
|
||||
to_decode, to_keep = self._split_encodings(content_encoding)
|
||||
for encoding in to_decode:
|
||||
body = self._decode(body, encoding, max_size)
|
||||
return body, to_keep
|
||||
|
||||
def _split_encodings(
|
||||
self, content_encoding: List[bytes]
|
||||
) -> Tuple[List[bytes], List[bytes]]:
|
||||
to_keep: List[bytes] = [
|
||||
self, content_encoding: list[bytes]
|
||||
) -> tuple[list[bytes], list[bytes]]:
|
||||
to_keep: list[bytes] = [
|
||||
encoding.strip().lower()
|
||||
for encoding in chain.from_iterable(
|
||||
encodings.split(b",") for encodings in content_encoding
|
||||
)
|
||||
]
|
||||
to_decode: List[bytes] = []
|
||||
to_decode: list[bytes] = []
|
||||
while to_keep:
|
||||
encoding = to_keep.pop()
|
||||
if encoding not in ACCEPTED_ENCODINGS:
|
||||
|
@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
from urllib.parse import unquote, urlunparse
|
||||
from urllib.request import ( # type: ignore[attr-defined]
|
||||
_parse_proxy,
|
||||
@ -25,7 +25,7 @@ if TYPE_CHECKING:
|
||||
class HttpProxyMiddleware:
|
||||
def __init__(self, auth_encoding: Optional[str] = "latin-1"):
|
||||
self.auth_encoding: Optional[str] = auth_encoding
|
||||
self.proxies: Dict[str, Tuple[Optional[bytes], str]] = {}
|
||||
self.proxies: dict[str, tuple[Optional[bytes], str]] = {}
|
||||
for type_, url in getproxies().items():
|
||||
try:
|
||||
self.proxies[type_] = self._get_proxy(url, type_)
|
||||
@ -47,7 +47,7 @@ class HttpProxyMiddleware:
|
||||
)
|
||||
return base64.b64encode(user_pass)
|
||||
|
||||
def _get_proxy(self, url: str, orig_type: str) -> Tuple[Optional[bytes], str]:
|
||||
def _get_proxy(self, url: str, orig_type: str) -> tuple[Optional[bytes], str]:
|
||||
proxy_type, user, password, hostport = _parse_proxy(url)
|
||||
proxy_url = urlunparse((proxy_type or orig_type, hostport, "", "", "", ""))
|
||||
|
||||
|
@ -3,7 +3,7 @@ from __future__ import annotations
|
||||
import logging
|
||||
import re
|
||||
import warnings
|
||||
from typing import TYPE_CHECKING, Set
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from scrapy import Request, Spider, signals
|
||||
from scrapy.exceptions import IgnoreRequest
|
||||
@ -31,7 +31,7 @@ class OffsiteMiddleware:
|
||||
|
||||
def __init__(self, stats: StatsCollector):
|
||||
self.stats = stats
|
||||
self.domains_seen: Set[str] = set()
|
||||
self.domains_seen: set[str] = set()
|
||||
|
||||
def spider_opened(self, spider: Spider) -> None:
|
||||
self.host_regex: re.Pattern[str] = self.get_host_regex(spider)
|
||||
|
@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Any, List, Union, cast
|
||||
from typing import TYPE_CHECKING, Any, Union, cast
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from w3lib.url import safe_url_string
|
||||
@ -180,7 +180,7 @@ class MetaRefreshMiddleware(BaseRedirectMiddleware):
|
||||
|
||||
def __init__(self, settings: BaseSettings):
|
||||
super().__init__(settings)
|
||||
self._ignore_tags: List[str] = settings.getlist("METAREFRESH_IGNORE_TAGS")
|
||||
self._ignore_tags: list[str] = settings.getlist("METAREFRESH_IGNORE_TAGS")
|
||||
self._maxdelay: int = settings.getint("METAREFRESH_MAXDELAY")
|
||||
|
||||
def process_response(
|
||||
|
@ -7,14 +7,14 @@ RETRY_TIMES - how many times to retry a failed page
RETRY_HTTP_CODES - which HTTP response codes to retry

Failed pages are collected on the scraping process and rescheduled at the end,
once the spider has finished crawling all regular (non failed) pages.
once the spider has finished crawling all regular (non-failed) pages.
"""

from __future__ import annotations

import warnings
from logging import Logger, getLogger
from typing import TYPE_CHECKING, Any, Optional, Tuple, Type, Union
from typing import TYPE_CHECKING, Any, Optional, Union

from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
from scrapy.settings import BaseSettings, Settings
@ -35,7 +35,7 @@ if TYPE_CHECKING:
retry_logger = getLogger(__name__)

def backwards_compatibility_getattr(self: Any, name: str) -> Tuple[Any, ...]:
def backwards_compatibility_getattr(self: Any, name: str) -> tuple[Any, ...]:
if name == "EXCEPTIONS_TO_RETRY":
warnings.warn(
"Attribute RetryMiddleware.EXCEPTIONS_TO_RETRY is deprecated. "
@ -60,7 +60,7 @@ def get_retry_request(
request: Request,
*,
spider: Spider,
reason: Union[str, Exception, Type[Exception]] = "unspecified",
reason: Union[str, Exception, type[Exception]] = "unspecified",
max_retry_times: Optional[int] = None,
priority_adjust: Optional[int] = None,
logger: Logger = retry_logger,
@ -187,7 +187,7 @@ class RetryMiddleware(metaclass=BackwardsCompatibilityMetaclass):
def _retry(
self,
request: Request,
reason: Union[str, Exception, Type[Exception]],
reason: Union[str, Exception, type[Exception]],
spider: Spider,
) -> Optional[Request]:
max_retry_times = request.meta.get("max_retry_times", self.max_retry_times)
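
The module docstring at the top of this retry hunk names the settings that drive the middleware. As a reminder of how they are typically wired up, a hedged settings fragment (the values are illustrative only, not taken from this diff; consult the Scrapy docs for the actual defaults):

# settings.py fragment (illustrative values)
RETRY_ENABLED = True
RETRY_TIMES = 2                      # how many times to retry a failed page
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]
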
@ -7,7 +7,7 @@ enable this middleware and enable the ROBOTSTXT_OBEY setting.
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Dict, Optional, TypeVar, Union
|
||||
from typing import TYPE_CHECKING, Optional, TypeVar, Union
|
||||
|
||||
from twisted.internet.defer import Deferred, maybeDeferred
|
||||
|
||||
@ -45,7 +45,7 @@ class RobotsTxtMiddleware:
|
||||
"ROBOTSTXT_USER_AGENT", None
|
||||
)
|
||||
self.crawler: Crawler = crawler
|
||||
self._parsers: Dict[
|
||||
self._parsers: dict[
|
||||
str, Union[RobotParser, Deferred[Optional[RobotParser]], None]
|
||||
] = {}
|
||||
self._parserimpl: RobotParser = load_object(
|
||||
|
@ -1,6 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Dict, List, Tuple, Union
|
||||
from typing import TYPE_CHECKING, Union
|
||||
|
||||
from twisted.web import http
|
||||
|
||||
@ -19,7 +19,7 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
def get_header_size(
|
||||
headers: Dict[str, Union[List[Union[str, bytes]], Tuple[Union[str, bytes], ...]]]
|
||||
headers: dict[str, Union[list[Union[str, bytes]], tuple[Union[str, bytes], ...]]]
|
||||
) -> int:
|
||||
size = 0
|
||||
for key, value in headers.items():
|
||||
|
@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Optional, Set
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
from scrapy.utils.job import job_dir
|
||||
from scrapy.utils.request import (
|
||||
@ -56,7 +56,7 @@ class RFPDupeFilter(BaseDupeFilter):
|
||||
self.fingerprinter: RequestFingerprinterProtocol = (
|
||||
fingerprinter or RequestFingerprinter()
|
||||
)
|
||||
self.fingerprints: Set[str] = set()
|
||||
self.fingerprints: set[str] = set()
|
||||
self.logdupes = True
|
||||
self.debug = debug
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
@ -6,9 +6,10 @@ import csv
|
||||
import marshal
|
||||
import pickle # nosec
|
||||
import pprint
|
||||
from collections.abc import Callable, Iterable, Mapping
|
||||
from io import BytesIO, TextIOWrapper
|
||||
from json import JSONEncoder
|
||||
from typing import Any, Callable, Dict, Iterable, Mapping, Optional, Tuple, Union
|
||||
from typing import Any, Optional, Union
|
||||
from xml.sax.saxutils import XMLGenerator # nosec
|
||||
from xml.sax.xmlreader import AttributesImpl # nosec
|
||||
|
||||
@ -32,10 +33,10 @@ __all__ = [
|
||||
|
||||
class BaseItemExporter:
|
||||
def __init__(self, *, dont_fail: bool = False, **kwargs: Any):
|
||||
self._kwargs: Dict[str, Any] = kwargs
|
||||
self._kwargs: dict[str, Any] = kwargs
|
||||
self._configure(kwargs, dont_fail=dont_fail)
|
||||
|
||||
def _configure(self, options: Dict[str, Any], dont_fail: bool = False) -> None:
|
||||
def _configure(self, options: dict[str, Any], dont_fail: bool = False) -> None:
|
||||
"""Configure the exporter by popping options from the ``options`` dict.
|
||||
If dont_fail is set, it won't raise an exception on unexpected options
|
||||
(useful for using with keyword arguments in subclasses ``__init__`` methods)
|
||||
@ -66,7 +67,7 @@ class BaseItemExporter:
|
||||
|
||||
def _get_serialized_fields(
|
||||
self, item: Any, default_value: Any = None, include_empty: Optional[bool] = None
|
||||
) -> Iterable[Tuple[str, Any]]:
|
||||
) -> Iterable[tuple[str, Any]]:
|
||||
"""Return the fields to export as an iterable of tuples
|
||||
(name, serialized_value)
|
||||
"""
|
||||
@ -339,7 +340,7 @@ class PythonItemExporter(BaseItemExporter):
|
||||
.. _msgpack: https://pypi.org/project/msgpack/
|
||||
"""
|
||||
|
||||
def _configure(self, options: Dict[str, Any], dont_fail: bool = False) -> None:
|
||||
def _configure(self, options: dict[str, Any], dont_fail: bool = False) -> None:
|
||||
super()._configure(options, dont_fail)
|
||||
if not self.encoding:
|
||||
self.encoding = "utf-8"
|
||||
@ -363,10 +364,10 @@ class PythonItemExporter(BaseItemExporter):
|
||||
return to_unicode(value, encoding=self.encoding)
|
||||
return value
|
||||
|
||||
def _serialize_item(self, item: Any) -> Iterable[Tuple[Union[str, bytes], Any]]:
|
||||
def _serialize_item(self, item: Any) -> Iterable[tuple[Union[str, bytes], Any]]:
|
||||
for key, value in ItemAdapter(item).items():
|
||||
yield key, self._serialize_value(value)
|
||||
|
||||
def export_item(self, item: Any) -> Dict[Union[str, bytes], Any]: # type: ignore[override]
|
||||
result: Dict[Union[str, bytes], Any] = dict(self._get_serialized_fields(item))
|
||||
def export_item(self, item: Any) -> dict[Union[str, bytes], Any]: # type: ignore[override]
|
||||
result: dict[Union[str, bytes], Any] = dict(self._get_serialized_fields(item))
|
||||
return result
|
||||
|
@ -6,7 +6,7 @@ See documentation in docs/topics/extensions.rst
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Any, List
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from scrapy.middleware import MiddlewareManager
|
||||
from scrapy.utils.conf import build_component_list
|
||||
@ -19,5 +19,5 @@ class ExtensionManager(MiddlewareManager):
|
||||
component_name = "extension"
|
||||
|
||||
@classmethod
|
||||
def _get_mwlist_from_settings(cls, settings: Settings) -> List[Any]:
|
||||
def _get_mwlist_from_settings(cls, settings: Settings) -> list[Any]:
|
||||
return build_component_list(settings.getwithbase("EXTENSIONS"))
|
||||
|
@ -8,7 +8,7 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from typing import TYPE_CHECKING, Any, DefaultDict, Dict
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from scrapy import Request, Spider, signals
|
||||
from scrapy.exceptions import NotConfigured
|
||||
@ -30,7 +30,7 @@ class CloseSpider:
|
||||
def __init__(self, crawler: Crawler):
|
||||
self.crawler: Crawler = crawler
|
||||
|
||||
self.close_on: Dict[str, Any] = {
|
||||
self.close_on: dict[str, Any] = {
|
||||
"timeout": crawler.settings.getfloat("CLOSESPIDER_TIMEOUT"),
|
||||
"itemcount": crawler.settings.getint("CLOSESPIDER_ITEMCOUNT"),
|
||||
"pagecount": crawler.settings.getint("CLOSESPIDER_PAGECOUNT"),
|
||||
@ -44,7 +44,7 @@ class CloseSpider:
|
||||
if not any(self.close_on.values()):
|
||||
raise NotConfigured
|
||||
|
||||
self.counter: DefaultDict[str, int] = defaultdict(int)
|
||||
self.counter: defaultdict[str, int] = defaultdict(int)
|
||||
|
||||
if self.close_on.get("errorcount"):
|
||||
crawler.signals.connect(self.error_count, signal=signals.spider_error)
|
||||
|
@ -10,25 +10,11 @@ import logging
|
||||
import re
|
||||
import sys
|
||||
import warnings
|
||||
from collections.abc import Callable
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path, PureWindowsPath
|
||||
from tempfile import NamedTemporaryFile
|
||||
from typing import (
|
||||
IO,
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Protocol,
|
||||
Tuple,
|
||||
Type,
|
||||
TypeVar,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
from typing import IO, TYPE_CHECKING, Any, Optional, Protocol, TypeVar, Union, cast
|
||||
from urllib.parse import unquote, urlparse
|
||||
|
||||
from twisted.internet.defer import Deferred, DeferredList, maybeDeferred
|
||||
@ -50,6 +36,8 @@ from scrapy.utils.misc import build_from_crawler, load_object
|
||||
from scrapy.utils.python import without_none_values
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterable
|
||||
|
||||
from _typeshed import OpenBinaryMode
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
@ -70,7 +58,7 @@ except ImportError:
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
UriParamsCallableT = Callable[[Dict[str, Any], Spider], Optional[Dict[str, Any]]]
|
||||
UriParamsCallableT = Callable[[dict[str, Any], Spider], Optional[dict[str, Any]]]
|
||||
|
||||
_StorageT = TypeVar("_StorageT", bound="FeedStorageProtocol")
|
||||
|
||||
@ -79,7 +67,7 @@ def build_storage(
|
||||
builder: Callable[..., _StorageT],
|
||||
uri: str,
|
||||
*args: Any,
|
||||
feed_options: Optional[Dict[str, Any]] = None,
|
||||
feed_options: Optional[dict[str, Any]] = None,
|
||||
preargs: Iterable[Any] = (),
|
||||
**kwargs: Any,
|
||||
) -> _StorageT:
|
||||
@ -96,10 +84,10 @@ class ItemFilter:
|
||||
:type feed_options: dict
|
||||
"""
|
||||
|
||||
feed_options: Optional[Dict[str, Any]]
|
||||
item_classes: Tuple[type, ...]
|
||||
feed_options: Optional[dict[str, Any]]
|
||||
item_classes: tuple[type, ...]
|
||||
|
||||
def __init__(self, feed_options: Optional[Dict[str, Any]]) -> None:
|
||||
def __init__(self, feed_options: Optional[dict[str, Any]]) -> None:
|
||||
self.feed_options = feed_options
|
||||
if feed_options is not None:
|
||||
self.item_classes = tuple(
|
||||
@ -141,7 +129,7 @@ class IFeedStorage(Interface):
|
||||
class FeedStorageProtocol(Protocol):
|
||||
"""Reimplementation of ``IFeedStorage`` that can be used in type hints."""
|
||||
|
||||
def __init__(self, uri: str, *, feed_options: Optional[Dict[str, Any]] = None):
|
||||
def __init__(self, uri: str, *, feed_options: Optional[dict[str, Any]] = None):
|
||||
"""Initialize the storage with the parameters given in the URI and the
|
||||
feed-specific options (see :setting:`FEEDS`)"""
|
||||
|
||||
@ -176,7 +164,7 @@ class StdoutFeedStorage:
|
||||
uri: str,
|
||||
_stdout: Optional[IO[bytes]] = None,
|
||||
*,
|
||||
feed_options: Optional[Dict[str, Any]] = None,
|
||||
feed_options: Optional[dict[str, Any]] = None,
|
||||
):
|
||||
if not _stdout:
|
||||
_stdout = sys.stdout.buffer
|
||||
@ -198,7 +186,7 @@ class StdoutFeedStorage:
|
||||
|
||||
@implementer(IFeedStorage)
|
||||
class FileFeedStorage:
|
||||
def __init__(self, uri: str, *, feed_options: Optional[Dict[str, Any]] = None):
|
||||
def __init__(self, uri: str, *, feed_options: Optional[dict[str, Any]] = None):
|
||||
self.path: str = file_uri_to_path(uri)
|
||||
feed_options = feed_options or {}
|
||||
self.write_mode: OpenBinaryMode = (
|
||||
@ -225,7 +213,7 @@ class S3FeedStorage(BlockingFeedStorage):
|
||||
acl: Optional[str] = None,
|
||||
endpoint_url: Optional[str] = None,
|
||||
*,
|
||||
feed_options: Optional[Dict[str, Any]] = None,
|
||||
feed_options: Optional[dict[str, Any]] = None,
|
||||
session_token: Optional[str] = None,
|
||||
region_name: Optional[str] = None,
|
||||
):
|
||||
@ -291,7 +279,7 @@ class S3FeedStorage(BlockingFeedStorage):
|
||||
crawler: Crawler,
|
||||
uri: str,
|
||||
*,
|
||||
feed_options: Optional[Dict[str, Any]] = None,
|
||||
feed_options: Optional[dict[str, Any]] = None,
|
||||
) -> Self:
|
||||
return build_storage(
|
||||
cls,
|
||||
@ -307,7 +295,7 @@ class S3FeedStorage(BlockingFeedStorage):
|
||||
|
||||
def _store_in_thread(self, file: IO[bytes]) -> None:
|
||||
file.seek(0)
|
||||
kwargs: Dict[str, Any]
|
||||
kwargs: dict[str, Any]
|
||||
if IS_BOTO3_AVAILABLE:
|
||||
kwargs = {"ExtraArgs": {"ACL": self.acl}} if self.acl else {}
|
||||
self.s3_client.upload_fileobj(
|
||||
@ -354,7 +342,7 @@ class FTPFeedStorage(BlockingFeedStorage):
|
||||
uri: str,
|
||||
use_active_mode: bool = False,
|
||||
*,
|
||||
feed_options: Optional[Dict[str, Any]] = None,
|
||||
feed_options: Optional[dict[str, Any]] = None,
|
||||
):
|
||||
u = urlparse(uri)
|
||||
if not u.hostname:
|
||||
@ -373,7 +361,7 @@ class FTPFeedStorage(BlockingFeedStorage):
|
||||
crawler: Crawler,
|
||||
uri: str,
|
||||
*,
|
||||
feed_options: Optional[Dict[str, Any]] = None,
|
||||
feed_options: Optional[dict[str, Any]] = None,
|
||||
) -> Self:
|
||||
return build_storage(
|
||||
cls,
|
||||
@ -405,9 +393,9 @@ class FeedSlot:
|
||||
batch_id: int,
|
||||
uri_template: str,
|
||||
filter: ItemFilter,
|
||||
feed_options: Dict[str, Any],
|
||||
feed_options: dict[str, Any],
|
||||
spider: Spider,
|
||||
exporters: Dict[str, Type[BaseItemExporter]],
|
||||
exporters: dict[str, type[BaseItemExporter]],
|
||||
settings: BaseSettings,
|
||||
crawler: Crawler,
|
||||
):
|
||||
@ -422,9 +410,9 @@ class FeedSlot:
|
||||
self.uri: str = uri
|
||||
self.filter: ItemFilter = filter
|
||||
# exporter params
|
||||
self.feed_options: Dict[str, Any] = feed_options
|
||||
self.feed_options: dict[str, Any] = feed_options
|
||||
self.spider: Spider = spider
|
||||
self.exporters: Dict[str, Type[BaseItemExporter]] = exporters
|
||||
self.exporters: dict[str, type[BaseItemExporter]] = exporters
|
||||
self.settings: BaseSettings = settings
|
||||
self.crawler: Crawler = crawler
|
||||
# flags
|
||||
@ -460,7 +448,7 @@ class FeedSlot:
|
||||
self._exporting = True
|
||||
|
||||
def _get_instance(
|
||||
self, objcls: Type[BaseItemExporter], *args: Any, **kwargs: Any
|
||||
self, objcls: type[BaseItemExporter], *args: Any, **kwargs: Any
|
||||
) -> BaseItemExporter:
|
||||
return build_from_crawler(objcls, self.crawler, *args, **kwargs)
|
||||
|
||||
@ -483,7 +471,7 @@ _FeedSlot = create_deprecated_class(
|
||||
|
||||
|
||||
class FeedExporter:
|
||||
_pending_deferreds: List[Deferred[None]] = []
|
||||
_pending_deferreds: list[Deferred[None]] = []
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler: Crawler) -> Self:
|
||||
@ -497,8 +485,8 @@ class FeedExporter:
|
||||
self.crawler: Crawler = crawler
|
||||
self.settings: Settings = crawler.settings
|
||||
self.feeds = {}
|
||||
self.slots: List[FeedSlot] = []
|
||||
self.filters: Dict[str, ItemFilter] = {}
|
||||
self.slots: list[FeedSlot] = []
|
||||
self.filters: dict[str, ItemFilter] = {}
|
||||
|
||||
if not self.settings["FEEDS"] and not self.settings["FEED_URI"]:
|
||||
raise NotConfigured
|
||||
@ -530,10 +518,10 @@ class FeedExporter:
|
||||
)
|
||||
self.filters[uri] = self._load_filter(feed_options)
|
||||
|
||||
self.storages: Dict[str, Type[FeedStorageProtocol]] = self._load_components(
|
||||
self.storages: dict[str, type[FeedStorageProtocol]] = self._load_components(
|
||||
"FEED_STORAGES"
|
||||
)
|
||||
self.exporters: Dict[str, Type[BaseItemExporter]] = self._load_components(
|
||||
self.exporters: dict[str, type[BaseItemExporter]] = self._load_components(
|
||||
"FEED_EXPORTERS"
|
||||
)
|
||||
for uri, feed_options in self.feeds.items():
|
||||
@ -631,7 +619,7 @@ class FeedExporter:
|
||||
self,
|
||||
batch_id: int,
|
||||
uri: str,
|
||||
feed_options: Dict[str, Any],
|
||||
feed_options: dict[str, Any],
|
||||
spider: Spider,
|
||||
uri_template: str,
|
||||
) -> FeedSlot:
|
||||
@ -696,9 +684,9 @@ class FeedExporter:
|
||||
slots.append(slot)
|
||||
self.slots = slots
|
||||
|
||||
def _load_components(self, setting_prefix: str) -> Dict[str, Any]:
|
||||
def _load_components(self, setting_prefix: str) -> dict[str, Any]:
|
||||
conf = without_none_values(
|
||||
cast(Dict[str, str], self.settings.getwithbase(setting_prefix))
|
||||
cast(dict[str, str], self.settings.getwithbase(setting_prefix))
|
||||
)
|
||||
d = {}
|
||||
for k, v in conf.items():
|
||||
@ -732,7 +720,7 @@ class FeedExporter:
|
||||
return False
|
||||
return True
|
||||
|
||||
def _storage_supported(self, uri: str, feed_options: Dict[str, Any]) -> bool:
|
||||
def _storage_supported(self, uri: str, feed_options: dict[str, Any]) -> bool:
|
||||
scheme = urlparse(uri).scheme
|
||||
if scheme in self.storages or PureWindowsPath(uri).drive:
|
||||
try:
|
||||
@ -748,7 +736,7 @@ class FeedExporter:
|
||||
return False
|
||||
|
||||
def _get_storage(
|
||||
self, uri: str, feed_options: Dict[str, Any]
|
||||
self, uri: str, feed_options: dict[str, Any]
|
||||
) -> FeedStorageProtocol:
|
||||
"""Fork of create_instance specific to feed storage classes
|
||||
|
||||
@ -759,7 +747,7 @@ class FeedExporter:
|
||||
crawler = getattr(self, "crawler", None)
|
||||
|
||||
def build_instance(
|
||||
builder: Type[FeedStorageProtocol], *preargs: Any
|
||||
builder: type[FeedStorageProtocol], *preargs: Any
|
||||
) -> FeedStorageProtocol:
|
||||
return build_storage(
|
||||
builder, uri, feed_options=feed_options, preargs=preargs
|
||||
@ -784,7 +772,7 @@ class FeedExporter:
|
||||
spider: Spider,
|
||||
uri_params_function: Union[str, UriParamsCallableT, None],
|
||||
slot: Optional[FeedSlot] = None,
|
||||
) -> Dict[str, Any]:
|
||||
) -> dict[str, Any]:
|
||||
params = {}
|
||||
for k in dir(spider):
|
||||
params[k] = getattr(spider, k)
|
||||
@ -800,9 +788,9 @@ class FeedExporter:
|
||||
new_params = uripar_function(params, spider)
|
||||
return new_params if new_params is not None else params
|
||||
|
||||
def _load_filter(self, feed_options: Dict[str, Any]) -> ItemFilter:
|
||||
def _load_filter(self, feed_options: dict[str, Any]) -> ItemFilter:
|
||||
# load the item filter if declared else load the default filter class
|
||||
item_filter_class: Type[ItemFilter] = load_object(
|
||||
item_filter_class: type[ItemFilter] = load_object(
|
||||
feed_options.get("item_filter", ItemFilter)
|
||||
)
|
||||
return item_filter_class(feed_options)
|
||||
|
@ -9,7 +9,7 @@ from importlib import import_module
|
||||
from pathlib import Path
|
||||
from time import time
|
||||
from types import ModuleType
|
||||
from typing import IO, TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union, cast
|
||||
from typing import IO, TYPE_CHECKING, Any, Optional, Union, cast
|
||||
from weakref import WeakKeyDictionary
|
||||
|
||||
from w3lib.http import headers_dict_to_raw, headers_raw_to_dict
|
||||
@ -22,6 +22,8 @@ from scrapy.utils.python import to_bytes, to_unicode
|
||||
from scrapy.utils.request import RequestFingerprinter
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable
|
||||
|
||||
# typing.Concatenate requires Python 3.10
|
||||
from typing_extensions import Concatenate
|
||||
|
||||
@ -35,8 +37,8 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
class DummyPolicy:
|
||||
def __init__(self, settings: BaseSettings):
|
||||
self.ignore_schemes: List[str] = settings.getlist("HTTPCACHE_IGNORE_SCHEMES")
|
||||
self.ignore_http_codes: List[int] = [
|
||||
self.ignore_schemes: list[str] = settings.getlist("HTTPCACHE_IGNORE_SCHEMES")
|
||||
self.ignore_http_codes: list[int] = [
|
||||
int(x) for x in settings.getlist("HTTPCACHE_IGNORE_HTTP_CODES")
|
||||
]
|
||||
|
||||
@ -62,18 +64,18 @@ class RFC2616Policy:
|
||||
|
||||
def __init__(self, settings: BaseSettings):
|
||||
self.always_store: bool = settings.getbool("HTTPCACHE_ALWAYS_STORE")
|
||||
self.ignore_schemes: List[str] = settings.getlist("HTTPCACHE_IGNORE_SCHEMES")
|
||||
self.ignore_schemes: list[str] = settings.getlist("HTTPCACHE_IGNORE_SCHEMES")
|
||||
self._cc_parsed: WeakKeyDictionary[
|
||||
Union[Request, Response], Dict[bytes, Optional[bytes]]
|
||||
Union[Request, Response], dict[bytes, Optional[bytes]]
|
||||
] = WeakKeyDictionary()
|
||||
self.ignore_response_cache_controls: List[bytes] = [
|
||||
self.ignore_response_cache_controls: list[bytes] = [
|
||||
to_bytes(cc)
|
||||
for cc in settings.getlist("HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS")
|
||||
]
|
||||
|
||||
def _parse_cachecontrol(
|
||||
self, r: Union[Request, Response]
|
||||
) -> Dict[bytes, Optional[bytes]]:
|
||||
) -> dict[bytes, Optional[bytes]]:
|
||||
if r not in self._cc_parsed:
|
||||
cch = r.headers.get(b"Cache-Control", b"")
|
||||
assert cch is not None
|
||||
@ -189,7 +191,7 @@ class RFC2616Policy:
|
||||
if b"ETag" in cachedresponse.headers:
|
||||
request.headers[b"If-None-Match"] = cachedresponse.headers[b"ETag"]
|
||||
|
||||
def _get_max_age(self, cc: Dict[bytes, Optional[bytes]]) -> Optional[int]:
|
||||
def _get_max_age(self, cc: dict[bytes, Optional[bytes]]) -> Optional[int]:
|
||||
try:
|
||||
return max(0, int(cc[b"max-age"])) # type: ignore[arg-type]
|
||||
except (KeyError, ValueError):
|
||||
@ -298,7 +300,7 @@ class DbmCacheStorage:
|
||||
self.db[f"{key}_data"] = pickle.dumps(data, protocol=4)
|
||||
self.db[f"{key}_time"] = str(time())
|
||||
|
||||
def _read_data(self, spider: Spider, request: Request) -> Optional[Dict[str, Any]]:
|
||||
def _read_data(self, spider: Spider, request: Request) -> Optional[dict[str, Any]]:
|
||||
key = self._fingerprinter.fingerprint(request).hex()
|
||||
db = self.db
|
||||
tkey = f"{key}_time"
|
||||
@ -309,7 +311,7 @@ class DbmCacheStorage:
|
||||
if 0 < self.expiration_secs < time() - float(ts):
|
||||
return None # expired
|
||||
|
||||
return cast(Dict[str, Any], pickle.loads(db[f"{key}_data"])) # nosec
|
||||
return cast(dict[str, Any], pickle.loads(db[f"{key}_data"])) # nosec
|
||||
|
||||
|
||||
class FilesystemCacheStorage:
|
||||
@ -385,7 +387,7 @@ class FilesystemCacheStorage:
|
||||
key = self._fingerprinter.fingerprint(request).hex()
|
||||
return str(Path(self.cachedir, spider.name, key[0:2], key))
|
||||
|
||||
def _read_meta(self, spider: Spider, request: Request) -> Optional[Dict[str, Any]]:
|
||||
def _read_meta(self, spider: Spider, request: Request) -> Optional[dict[str, Any]]:
|
||||
rpath = Path(self._get_request_path(spider, request))
|
||||
metapath = rpath / "pickled_meta"
|
||||
if not metapath.exists():
|
||||
@ -394,10 +396,10 @@ class FilesystemCacheStorage:
|
||||
if 0 < self.expiration_secs < time() - mtime:
|
||||
return None # expired
|
||||
with self._open(metapath, "rb") as f:
|
||||
return cast(Dict[str, Any], pickle.load(f)) # nosec
|
||||
return cast(dict[str, Any], pickle.load(f)) # nosec
|
||||
|
||||
|
||||
def parse_cachecontrol(header: bytes) -> Dict[bytes, Optional[bytes]]:
|
||||
def parse_cachecontrol(header: bytes) -> dict[bytes, Optional[bytes]]:
|
||||
"""Parse Cache-Control header
|
||||
|
||||
https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9
|
||||
|
@ -1,7 +1,7 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Optional, Tuple, Union
from typing import TYPE_CHECKING, Optional, Union

from twisted.internet import task

@ -81,7 +81,7 @@ class LogStats:

def calculate_final_stats(
self, spider: Spider
) -> Union[Tuple[None, None], Tuple[float, float]]:
) -> Union[tuple[None, None], tuple[float, float]]:
start_time = self.stats.get_value("start_time")
finished_time = self.stats.get_value("finished_time")
@ -11,7 +11,7 @@ import socket
|
||||
import sys
|
||||
from importlib import import_module
|
||||
from pprint import pformat
|
||||
from typing import TYPE_CHECKING, List
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from twisted.internet import task
|
||||
|
||||
@ -42,7 +42,7 @@ class MemoryUsage:
|
||||
|
||||
self.crawler: Crawler = crawler
|
||||
self.warned: bool = False
|
||||
self.notify_mails: List[str] = crawler.settings.getlist("MEMUSAGE_NOTIFY_MAIL")
|
||||
self.notify_mails: list[str] = crawler.settings.getlist("MEMUSAGE_NOTIFY_MAIL")
|
||||
self.limit: int = crawler.settings.getint("MEMUSAGE_LIMIT_MB") * 1024 * 1024
|
||||
self.warning: int = crawler.settings.getint("MEMUSAGE_WARNING_MB") * 1024 * 1024
|
||||
self.check_interval: float = crawler.settings.getfloat(
|
||||
@ -66,7 +66,7 @@ class MemoryUsage:
|
||||
def engine_started(self) -> None:
|
||||
assert self.crawler.stats
|
||||
self.crawler.stats.set_value("memusage/startup", self.get_virtual_size())
|
||||
self.tasks: List[task.LoopingCall] = []
|
||||
self.tasks: list[task.LoopingCall] = []
|
||||
tsk = task.LoopingCall(self.update)
|
||||
self.tasks.append(tsk)
|
||||
tsk.start(self.check_interval, now=True)
|
||||
@ -141,7 +141,7 @@ class MemoryUsage:
|
||||
self.crawler.stats.set_value("memusage/warning_notified", 1)
|
||||
self.warned = True
|
||||
|
||||
def _send_report(self, rcpts: List[str], subject: str) -> None:
|
||||
def _send_report(self, rcpts: list[str], subject: str) -> None:
|
||||
"""send notification mail with some additional useful info"""
|
||||
assert self.crawler.engine
|
||||
assert self.crawler.stats
|
||||
|
@ -3,7 +3,7 @@ from __future__ import annotations
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from json import JSONEncoder
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
|
||||
from twisted.internet import task
|
||||
|
||||
@ -29,8 +29,8 @@ class PeriodicLog:
|
||||
self,
|
||||
stats: StatsCollector,
|
||||
interval: float = 60.0,
|
||||
ext_stats: Dict[str, Any] = {},
|
||||
ext_delta: Dict[str, Any] = {},
|
||||
ext_stats: dict[str, Any] = {},
|
||||
ext_delta: dict[str, Any] = {},
|
||||
ext_timing_enabled: bool = False,
|
||||
):
|
||||
self.stats: StatsCollector = stats
|
||||
@ -39,11 +39,11 @@ class PeriodicLog:
|
||||
self.task: Optional[task.LoopingCall] = None
|
||||
self.encoder: JSONEncoder = ScrapyJSONEncoder(sort_keys=True, indent=4)
|
||||
self.ext_stats_enabled: bool = bool(ext_stats)
|
||||
self.ext_stats_include: List[str] = ext_stats.get("include", [])
|
||||
self.ext_stats_exclude: List[str] = ext_stats.get("exclude", [])
|
||||
self.ext_stats_include: list[str] = ext_stats.get("include", [])
|
||||
self.ext_stats_exclude: list[str] = ext_stats.get("exclude", [])
|
||||
self.ext_delta_enabled: bool = bool(ext_delta)
|
||||
self.ext_delta_include: List[str] = ext_delta.get("include", [])
|
||||
self.ext_delta_exclude: List[str] = ext_delta.get("exclude", [])
|
||||
self.ext_delta_include: list[str] = ext_delta.get("include", [])
|
||||
self.ext_delta_exclude: list[str] = ext_delta.get("exclude", [])
|
||||
self.ext_timing_enabled: bool = ext_timing_enabled
|
||||
|
||||
@classmethod
|
||||
@ -52,7 +52,7 @@ class PeriodicLog:
|
||||
if not interval:
|
||||
raise NotConfigured
|
||||
try:
|
||||
ext_stats: Optional[Dict[str, Any]] = crawler.settings.getdict(
|
||||
ext_stats: Optional[dict[str, Any]] = crawler.settings.getdict(
|
||||
"PERIODIC_LOG_STATS"
|
||||
)
|
||||
except (TypeError, ValueError):
|
||||
@ -62,7 +62,7 @@ class PeriodicLog:
|
||||
else None
|
||||
)
|
||||
try:
|
||||
ext_delta: Optional[Dict[str, Any]] = crawler.settings.getdict(
|
||||
ext_delta: Optional[dict[str, Any]] = crawler.settings.getdict(
|
||||
"PERIODIC_LOG_DELTA"
|
||||
)
|
||||
except (TypeError, ValueError):
|
||||
@ -93,14 +93,14 @@ class PeriodicLog:
|
||||
|
||||
def spider_opened(self, spider: Spider) -> None:
|
||||
self.time_prev: datetime = datetime.now(tz=timezone.utc)
|
||||
self.delta_prev: Dict[str, Union[int, float]] = {}
|
||||
self.stats_prev: Dict[str, Union[int, float]] = {}
|
||||
self.delta_prev: dict[str, Union[int, float]] = {}
|
||||
self.stats_prev: dict[str, Union[int, float]] = {}
|
||||
|
||||
self.task = task.LoopingCall(self.log)
|
||||
self.task.start(self.interval)
|
||||
|
||||
def log(self) -> None:
|
||||
data: Dict[str, Any] = {}
|
||||
data: dict[str, Any] = {}
|
||||
if self.ext_timing_enabled:
|
||||
data.update(self.log_timing())
|
||||
if self.ext_delta_enabled:
|
||||
@ -109,8 +109,8 @@ class PeriodicLog:
|
||||
data.update(self.log_crawler_stats())
|
||||
logger.info(self.encoder.encode(data))
|
||||
|
||||
def log_delta(self) -> Dict[str, Any]:
|
||||
num_stats: Dict[str, Union[int, float]] = {
|
||||
def log_delta(self) -> dict[str, Any]:
|
||||
num_stats: dict[str, Union[int, float]] = {
|
||||
k: v
|
||||
for k, v in self.stats._stats.items()
|
||||
if isinstance(v, (int, float))
|
||||
@ -120,7 +120,7 @@ class PeriodicLog:
|
||||
self.delta_prev = num_stats
|
||||
return {"delta": delta}
|
||||
|
||||
def log_timing(self) -> Dict[str, Any]:
|
||||
def log_timing(self) -> dict[str, Any]:
|
||||
now = datetime.now(tz=timezone.utc)
|
||||
time = {
|
||||
"log_interval": self.interval,
|
||||
@ -132,7 +132,7 @@ class PeriodicLog:
|
||||
self.time_prev = now
|
||||
return {"time": time}
|
||||
|
||||
def log_crawler_stats(self) -> Dict[str, Any]:
|
||||
def log_crawler_stats(self) -> dict[str, Any]:
|
||||
stats = {
|
||||
k: v
|
||||
for k, v in self.stats._stats.items()
|
||||
@ -141,7 +141,7 @@ class PeriodicLog:
|
||||
return {"stats": stats}
|
||||
|
||||
def param_allowed(
|
||||
self, stat_name: str, include: List[str], exclude: List[str]
|
||||
self, stat_name: str, include: list[str], exclude: list[str]
|
||||
) -> bool:
|
||||
if not include and not exclude:
|
||||
return True
|
||||
|
@ -6,7 +6,7 @@ from bz2 import BZ2File
|
||||
from gzip import GzipFile
|
||||
from io import IOBase
|
||||
from lzma import LZMAFile
|
||||
from typing import IO, Any, BinaryIO, Dict, List, cast
|
||||
from typing import IO, Any, BinaryIO, cast
|
||||
|
||||
from scrapy.utils.misc import load_object
|
||||
|
||||
@ -24,7 +24,7 @@ class GzipPlugin:
|
||||
See :py:class:`gzip.GzipFile` for more info about parameters.
|
||||
"""
|
||||
|
||||
def __init__(self, file: BinaryIO, feed_options: Dict[str, Any]) -> None:
|
||||
def __init__(self, file: BinaryIO, feed_options: dict[str, Any]) -> None:
|
||||
self.file = file
|
||||
self.feed_options = feed_options
|
||||
compress_level = self.feed_options.get("gzip_compresslevel", 9)
|
||||
@ -56,7 +56,7 @@ class Bz2Plugin:
|
||||
See :py:class:`bz2.BZ2File` for more info about parameters.
|
||||
"""
|
||||
|
||||
def __init__(self, file: BinaryIO, feed_options: Dict[str, Any]) -> None:
|
||||
def __init__(self, file: BinaryIO, feed_options: dict[str, Any]) -> None:
|
||||
self.file = file
|
||||
self.feed_options = feed_options
|
||||
compress_level = self.feed_options.get("bz2_compresslevel", 9)
|
||||
@ -88,7 +88,7 @@ class LZMAPlugin:
|
||||
See :py:class:`lzma.LZMAFile` for more info about parameters.
|
||||
"""
|
||||
|
||||
def __init__(self, file: BinaryIO, feed_options: Dict[str, Any]) -> None:
|
||||
def __init__(self, file: BinaryIO, feed_options: dict[str, Any]) -> None:
|
||||
self.file = file
|
||||
self.feed_options = feed_options
|
||||
|
||||
@ -126,7 +126,7 @@ class PostProcessingManager(IOBase):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, plugins: List[Any], file: IO[bytes], feed_options: Dict[str, Any]
|
||||
self, plugins: list[Any], file: IO[bytes], feed_options: dict[str, Any]
|
||||
) -> None:
|
||||
self.plugins = self._load_plugins(plugins)
|
||||
self.file = file
|
||||
@ -156,7 +156,7 @@ class PostProcessingManager(IOBase):
|
||||
def writable(self) -> bool:
|
||||
return True
|
||||
|
||||
def _load_plugins(self, plugins: List[Any]) -> List[Any]:
|
||||
def _load_plugins(self, plugins: list[Any]) -> list[Any]:
|
||||
plugins = [load_object(plugin) for plugin in plugins]
|
||||
return plugins
|
||||
|
||||
|
@ -6,7 +6,7 @@ Use STATSMAILER_RCPTS setting to enable and give the recipient mail address
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, List, Optional
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
from scrapy import Spider, signals
|
||||
from scrapy.exceptions import NotConfigured
|
||||
@ -23,14 +23,14 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
class StatsMailer:
|
||||
def __init__(self, stats: StatsCollector, recipients: List[str], mail: MailSender):
|
||||
def __init__(self, stats: StatsCollector, recipients: list[str], mail: MailSender):
|
||||
self.stats: StatsCollector = stats
|
||||
self.recipients: List[str] = recipients
|
||||
self.recipients: list[str] = recipients
|
||||
self.mail: MailSender = mail
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler: Crawler) -> Self:
|
||||
recipients: List[str] = crawler.settings.getlist("STATSMAILER_RCPTS")
|
||||
recipients: list[str] = crawler.settings.getlist("STATSMAILER_RCPTS")
|
||||
if not recipients:
|
||||
raise NotConfigured
|
||||
mail: MailSender = MailSender.from_settings(crawler.settings)
|
||||
|
@ -10,7 +10,7 @@ import binascii
|
||||
import logging
|
||||
import os
|
||||
import pprint
|
||||
from typing import TYPE_CHECKING, Any, Dict, List
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from twisted.internet import protocol
|
||||
from twisted.internet.tcp import Port
|
||||
@ -45,7 +45,7 @@ class TelnetConsole(protocol.ServerFactory):
|
||||
|
||||
self.crawler: Crawler = crawler
|
||||
self.noisy: bool = False
|
||||
self.portrange: List[int] = [
|
||||
self.portrange: list[int] = [
|
||||
int(x) for x in crawler.settings.getlist("TELNETCONSOLE_PORT")
|
||||
]
|
||||
self.host: str = crawler.settings["TELNETCONSOLE_HOST"]
|
||||
@ -98,10 +98,10 @@ class TelnetConsole(protocol.ServerFactory):
|
||||
|
||||
return telnet.TelnetTransport(telnet.AuthenticatingTelnetProtocol, Portal())
|
||||
|
||||
def _get_telnet_vars(self) -> Dict[str, Any]:
|
||||
def _get_telnet_vars(self) -> dict[str, Any]:
|
||||
# Note: if you add entries here also update topics/telnetconsole.rst
|
||||
assert self.crawler.engine
|
||||
telnet_vars: Dict[str, Any] = {
|
||||
telnet_vars: dict[str, Any] = {
|
||||
"engine": self.crawler.engine,
|
||||
"spider": self.crawler.engine.spider,
|
||||
"slot": self.crawler.engine.slot,
|
||||
|
@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Optional, Tuple
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
from scrapy import Request, Spider, signals
|
||||
from scrapy.exceptions import NotConfigured
|
||||
@ -90,7 +90,7 @@ class AutoThrottle:
|
||||
|
||||
def _get_slot(
|
||||
self, request: Request, spider: Spider
|
||||
) -> Tuple[Optional[str], Optional[Slot]]:
|
||||
) -> tuple[Optional[str], Optional[Slot]]:
|
||||
key: Optional[str] = request.meta.get("download_slot")
|
||||
if key is None:
|
||||
return None, None
|
||||
|
@ -5,22 +5,14 @@ import time
|
||||
from http.cookiejar import Cookie
|
||||
from http.cookiejar import CookieJar as _CookieJar
|
||||
from http.cookiejar import CookiePolicy, DefaultCookiePolicy
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Dict,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
cast,
|
||||
)
|
||||
from typing import TYPE_CHECKING, Any, Optional, cast
|
||||
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.utils.python import to_unicode
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterator, Sequence
|
||||
|
||||
# typing.Self requires Python 3.11
|
||||
from typing_extensions import Self
|
||||
|
||||
@ -83,7 +75,7 @@ class CookieJar:
|
||||
self.jar.clear_expired_cookies()
|
||||
|
||||
@property
|
||||
def _cookies(self) -> Dict[str, Dict[str, Dict[str, Cookie]]]:
|
||||
def _cookies(self) -> dict[str, dict[str, dict[str, Cookie]]]:
|
||||
return self.jar._cookies # type: ignore[attr-defined,no-any-return]
|
||||
|
||||
def clear_session_cookies(self) -> None:
|
||||
@ -118,7 +110,7 @@ class CookieJar:
|
||||
self.jar.set_cookie_if_ok(cookie, WrappedRequest(request)) # type: ignore[arg-type]
|
||||
|
||||
|
||||
def potential_domain_matches(domain: str) -> List[str]:
|
||||
def potential_domain_matches(domain: str) -> list[str]:
|
||||
"""Potential domain matches for a cookie
|
||||
|
||||
>>> potential_domain_matches('www.example.com')
|
||||
@ -200,7 +192,7 @@ class WrappedRequest:
|
||||
value = self.request.headers.get(name, default)
|
||||
return to_unicode(value, errors="replace") if value is not None else None
|
||||
|
||||
def header_items(self) -> List[Tuple[str, List[str]]]:
|
||||
def header_items(self) -> list[tuple[str, list[str]]]:
|
||||
return [
|
||||
(
|
||||
to_unicode(k, errors="replace"),
|
||||
@ -220,7 +212,7 @@ class WrappedResponse:
|
||||
def info(self) -> Self:
|
||||
return self
|
||||
|
||||
def get_all(self, name: str, default: Any = None) -> List[str]:
|
||||
def get_all(self, name: str, default: Any = None) -> list[str]:
|
||||
return [
|
||||
to_unicode(v, errors="replace") for v in self.response.headers.getlist(name)
|
||||
]
|
||||
|
@ -1,18 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Mapping
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
AnyStr,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
from typing import TYPE_CHECKING, Any, AnyStr, Optional, Union, cast
|
||||
|
||||
from w3lib.http import headers_dict_to_raw
|
||||
|
||||
@ -20,6 +9,8 @@ from scrapy.utils.datatypes import CaseInsensitiveDict, CaselessDict
|
||||
from scrapy.utils.python import to_unicode
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterable
|
||||
|
||||
# typing.Self requires Python 3.11
|
||||
from typing_extensions import Self
|
||||
|
||||
@ -34,17 +25,17 @@ class Headers(CaselessDict):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
seq: Union[Mapping[AnyStr, Any], Iterable[Tuple[AnyStr, Any]], None] = None,
|
||||
seq: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
|
||||
encoding: str = "utf-8",
|
||||
):
|
||||
self.encoding: str = encoding
|
||||
super().__init__(seq)
|
||||
|
||||
def update( # type: ignore[override]
|
||||
self, seq: Union[Mapping[AnyStr, Any], Iterable[Tuple[AnyStr, Any]]]
|
||||
self, seq: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]]]
|
||||
) -> None:
|
||||
seq = seq.items() if isinstance(seq, Mapping) else seq
|
||||
iseq: Dict[bytes, List[bytes]] = {}
|
||||
iseq: dict[bytes, list[bytes]] = {}
|
||||
for k, v in seq:
|
||||
iseq.setdefault(self.normkey(k), []).extend(self.normvalue(v))
|
||||
super().update(iseq)
|
||||
@ -53,7 +44,7 @@ class Headers(CaselessDict):
|
||||
"""Normalize key to bytes"""
|
||||
return self._tobytes(key.title())
|
||||
|
||||
def normvalue(self, value: Union[_RawValueT, Iterable[_RawValueT]]) -> List[bytes]:
|
||||
def normvalue(self, value: Union[_RawValueT, Iterable[_RawValueT]]) -> list[bytes]:
|
||||
"""Normalize values to bytes"""
|
||||
_value: Iterable[_RawValueT]
|
||||
if value is None:
|
||||
@ -78,19 +69,19 @@ class Headers(CaselessDict):
|
||||
|
||||
def __getitem__(self, key: AnyStr) -> Optional[bytes]:
|
||||
try:
|
||||
return cast(List[bytes], super().__getitem__(key))[-1]
|
||||
return cast(list[bytes], super().__getitem__(key))[-1]
|
||||
except IndexError:
|
||||
return None
|
||||
|
||||
def get(self, key: AnyStr, def_val: Any = None) -> Optional[bytes]:
|
||||
try:
|
||||
return cast(List[bytes], super().get(key, def_val))[-1]
|
||||
return cast(list[bytes], super().get(key, def_val))[-1]
|
||||
except IndexError:
|
||||
return None
|
||||
|
||||
def getlist(self, key: AnyStr, def_val: Any = None) -> List[bytes]:
|
||||
def getlist(self, key: AnyStr, def_val: Any = None) -> list[bytes]:
|
||||
try:
|
||||
return cast(List[bytes], super().__getitem__(key))
|
||||
return cast(list[bytes], super().__getitem__(key))
|
||||
except KeyError:
|
||||
if def_val is not None:
|
||||
return self.normvalue(def_val)
|
||||
@ -109,10 +100,10 @@ class Headers(CaselessDict):
|
||||
lst.extend(self.normvalue(value))
|
||||
self[key] = lst
|
||||
|
||||
def items(self) -> Iterable[Tuple[bytes, List[bytes]]]: # type: ignore[override]
|
||||
def items(self) -> Iterable[tuple[bytes, list[bytes]]]: # type: ignore[override]
|
||||
return ((k, self.getlist(k)) for k in self.keys())
|
||||
|
||||
def values(self) -> List[Optional[bytes]]: # type: ignore[override]
|
||||
def values(self) -> list[Optional[bytes]]: # type: ignore[override]
|
||||
return [
|
||||
self[k] for k in self.keys() # pylint: disable=consider-using-dict-items
|
||||
]
|
||||
|
@ -12,14 +12,8 @@ from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
AnyStr,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Mapping,
|
||||
NoReturn,
|
||||
Optional,
|
||||
Tuple,
|
||||
Type,
|
||||
TypedDict,
|
||||
TypeVar,
|
||||
Union,
|
||||
@ -36,7 +30,7 @@ from scrapy.utils.trackref import object_ref
|
||||
from scrapy.utils.url import escape_ajax
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Callable, Iterable, Mapping
|
||||
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
@ -57,7 +51,7 @@ class VerboseCookie(TypedDict):
|
||||
secure: NotRequired[bool]
|
||||
|
||||
|
||||
CookiesT = Union[Dict[str, str], List[VerboseCookie]]
|
||||
CookiesT = Union[dict[str, str], list[VerboseCookie]]
|
||||
|
||||
|
||||
RequestTypeVar = TypeVar("RequestTypeVar", bound="Request")
|
||||
@ -92,7 +86,7 @@ class Request(object_ref):
|
||||
executed by the Downloader, thus generating a :class:`Response`.
|
||||
"""
|
||||
|
||||
attributes: Tuple[str, ...] = (
|
||||
attributes: tuple[str, ...] = (
|
||||
"url",
|
||||
"callback",
|
||||
"method",
|
||||
@ -120,16 +114,16 @@ class Request(object_ref):
|
||||
url: str,
|
||||
callback: Optional[CallbackT] = None,
|
||||
method: str = "GET",
|
||||
headers: Union[Mapping[AnyStr, Any], Iterable[Tuple[AnyStr, Any]], None] = None,
|
||||
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
|
||||
body: Optional[Union[bytes, str]] = None,
|
||||
cookies: Optional[CookiesT] = None,
|
||||
meta: Optional[Dict[str, Any]] = None,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
encoding: str = "utf-8",
|
||||
priority: int = 0,
|
||||
dont_filter: bool = False,
|
||||
errback: Optional[Callable[[Failure], Any]] = None,
|
||||
flags: Optional[List[str]] = None,
|
||||
cb_kwargs: Optional[Dict[str, Any]] = None,
|
||||
flags: Optional[list[str]] = None,
|
||||
cb_kwargs: Optional[dict[str, Any]] = None,
|
||||
) -> None:
|
||||
self._encoding: str = encoding # this one has to be set first
|
||||
self.method: str = str(method).upper()
|
||||
@ -152,20 +146,20 @@ class Request(object_ref):
|
||||
self.headers: Headers = Headers(headers or {}, encoding=encoding)
|
||||
self.dont_filter: bool = dont_filter
|
||||
|
||||
self._meta: Optional[Dict[str, Any]] = dict(meta) if meta else None
|
||||
self._cb_kwargs: Optional[Dict[str, Any]] = (
|
||||
self._meta: Optional[dict[str, Any]] = dict(meta) if meta else None
|
||||
self._cb_kwargs: Optional[dict[str, Any]] = (
|
||||
dict(cb_kwargs) if cb_kwargs else None
|
||||
)
|
||||
self.flags: List[str] = [] if flags is None else list(flags)
|
||||
self.flags: list[str] = [] if flags is None else list(flags)
|
||||
|
||||
@property
|
||||
def cb_kwargs(self) -> Dict[str, Any]:
|
||||
def cb_kwargs(self) -> dict[str, Any]:
|
||||
if self._cb_kwargs is None:
|
||||
self._cb_kwargs = {}
|
||||
return self._cb_kwargs
|
||||
|
||||
@property
|
||||
def meta(self) -> Dict[str, Any]:
|
||||
def meta(self) -> dict[str, Any]:
|
||||
if self._meta is None:
|
||||
self._meta = {}
|
||||
return self._meta
|
||||
@ -207,14 +201,14 @@ class Request(object_ref):
|
||||
|
||||
@overload
|
||||
def replace(
|
||||
self, *args: Any, cls: Type[RequestTypeVar], **kwargs: Any
|
||||
self, *args: Any, cls: type[RequestTypeVar], **kwargs: Any
|
||||
) -> RequestTypeVar: ...
|
||||
|
||||
@overload
|
||||
def replace(self, *args: Any, cls: None = None, **kwargs: Any) -> Self: ...
|
||||
|
||||
def replace(
|
||||
self, *args: Any, cls: Optional[Type[Request]] = None, **kwargs: Any
|
||||
self, *args: Any, cls: Optional[type[Request]] = None, **kwargs: Any
|
||||
) -> Request:
|
||||
"""Create a new Request with the same attributes except for those given new values"""
|
||||
for x in self.attributes:
|
||||
@ -261,7 +255,7 @@ class Request(object_ref):
|
||||
request_kwargs.update(kwargs)
|
||||
return cls(**request_kwargs)
|
||||
|
||||
def to_dict(self, *, spider: Optional[scrapy.Spider] = None) -> Dict[str, Any]:
|
||||
def to_dict(self, *, spider: Optional[scrapy.Spider] = None) -> dict[str, Any]:
|
||||
"""Return a dictionary containing the Request's data.
|
||||
|
||||
Use :func:`~scrapy.utils.request.request_from_dict` to convert back into a :class:`~scrapy.Request` object.
|
||||
|
@ -7,17 +7,8 @@ See documentation in docs/topics/request-response.rst
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
from collections.abc import Iterable
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union, cast
|
||||
from urllib.parse import urlencode, urljoin, urlsplit, urlunsplit
|
||||
|
||||
from lxml.html import FormElement # nosec
|
||||
@ -31,6 +22,7 @@ from scrapy.http.request import Request
|
||||
from scrapy.utils.python import is_listlike, to_bytes
|
||||
|
||||
if TYPE_CHECKING:
|
||||
|
||||
# typing.Self requires Python 3.11
|
||||
from typing_extensions import Self
|
||||
|
||||
@ -38,8 +30,8 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
FormdataVType = Union[str, Iterable[str]]
|
||||
FormdataKVType = Tuple[str, FormdataVType]
|
||||
FormdataType = Optional[Union[Dict[str, FormdataVType], List[FormdataKVType]]]
|
||||
FormdataKVType = tuple[str, FormdataVType]
|
||||
FormdataType = Optional[Union[dict[str, FormdataVType], list[FormdataKVType]]]
|
||||
|
||||
|
||||
class FormRequest(Request):
|
||||
@ -74,7 +66,7 @@ class FormRequest(Request):
|
||||
formid: Optional[str] = None,
|
||||
formnumber: int = 0,
|
||||
formdata: FormdataType = None,
|
||||
clickdata: Optional[Dict[str, Union[str, int]]] = None,
|
||||
clickdata: Optional[dict[str, Union[str, int]]] = None,
|
||||
dont_click: bool = False,
|
||||
formxpath: Optional[str] = None,
|
||||
formcss: Optional[str] = None,
|
||||
@ -168,8 +160,8 @@ def _get_inputs(
|
||||
form: FormElement,
|
||||
formdata: FormdataType,
|
||||
dont_click: bool,
|
||||
clickdata: Optional[Dict[str, Union[str, int]]],
|
||||
) -> List[FormdataKVType]:
|
||||
clickdata: Optional[dict[str, Union[str, int]]],
|
||||
) -> list[FormdataKVType]:
|
||||
"""Return a list of key-value pairs for the inputs found in the given form."""
|
||||
try:
|
||||
formdata_keys = dict(formdata or ()).keys()
|
||||
@ -187,7 +179,7 @@ def _get_inputs(
|
||||
' not(re:test(., "^(?:checkbox|radio)$", "i")))]]',
|
||||
namespaces={"re": "http://exslt.org/regular-expressions"},
|
||||
)
|
||||
values: List[FormdataKVType] = [
|
||||
values: list[FormdataKVType] = [
|
||||
(k, "" if v is None else v)
|
||||
for k, v in (_value(e) for e in inputs)
|
||||
if k and k not in formdata_keys
|
||||
@ -205,7 +197,7 @@ def _get_inputs(
|
||||
|
||||
def _value(
|
||||
ele: Union[InputElement, SelectElement, TextareaElement]
|
||||
) -> Tuple[Optional[str], Union[None, str, MultipleSelectOptions]]:
|
||||
) -> tuple[Optional[str], Union[None, str, MultipleSelectOptions]]:
|
||||
n = ele.name
|
||||
v = ele.value
|
||||
if ele.tag == "select":
|
||||
@ -215,7 +207,7 @@ def _value(
|
||||
|
||||
def _select_value(
|
||||
ele: SelectElement, n: Optional[str], v: Union[None, str, MultipleSelectOptions]
|
||||
) -> Tuple[Optional[str], Union[None, str, MultipleSelectOptions]]:
|
||||
) -> tuple[Optional[str], Union[None, str, MultipleSelectOptions]]:
|
||||
multiple = ele.multiple
|
||||
if v is None and not multiple:
|
||||
# Match browser behaviour on simple select tag without options selected
|
||||
@ -226,8 +218,8 @@ def _select_value(
|
||||
|
||||
|
||||
def _get_clickable(
|
||||
clickdata: Optional[Dict[str, Union[str, int]]], form: FormElement
|
||||
) -> Optional[Tuple[str, str]]:
|
||||
clickdata: Optional[dict[str, Union[str, int]]], form: FormElement
|
||||
) -> Optional[tuple[str, str]]:
|
||||
"""
|
||||
Returns the clickable element specified in clickdata,
|
||||
if the latter is given. If not, it returns the first
|
||||
|
@ -10,7 +10,7 @@ from __future__ import annotations
|
||||
import copy
|
||||
import json
|
||||
import warnings
|
||||
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Type, overload
|
||||
from typing import TYPE_CHECKING, Any, Optional, overload
|
||||
|
||||
from scrapy.http.request import Request, RequestTypeVar
|
||||
|
||||
@ -20,14 +20,14 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
class JsonRequest(Request):
|
||||
attributes: Tuple[str, ...] = Request.attributes + ("dumps_kwargs",)
|
||||
attributes: tuple[str, ...] = Request.attributes + ("dumps_kwargs",)
|
||||
|
||||
def __init__(
|
||||
self, *args: Any, dumps_kwargs: Optional[Dict[str, Any]] = None, **kwargs: Any
|
||||
self, *args: Any, dumps_kwargs: Optional[dict[str, Any]] = None, **kwargs: Any
|
||||
) -> None:
|
||||
dumps_kwargs = copy.deepcopy(dumps_kwargs) if dumps_kwargs is not None else {}
|
||||
dumps_kwargs.setdefault("sort_keys", True)
|
||||
self._dumps_kwargs: Dict[str, Any] = dumps_kwargs
|
||||
self._dumps_kwargs: dict[str, Any] = dumps_kwargs
|
||||
|
||||
body_passed = kwargs.get("body", None) is not None
|
||||
data: Any = kwargs.pop("data", None)
|
||||
@ -47,19 +47,19 @@ class JsonRequest(Request):
|
||||
)
|
||||
|
||||
@property
|
||||
def dumps_kwargs(self) -> Dict[str, Any]:
|
||||
def dumps_kwargs(self) -> dict[str, Any]:
|
||||
return self._dumps_kwargs
|
||||
|
||||
@overload
|
||||
def replace(
|
||||
self, *args: Any, cls: Type[RequestTypeVar], **kwargs: Any
|
||||
self, *args: Any, cls: type[RequestTypeVar], **kwargs: Any
|
||||
) -> RequestTypeVar: ...
|
||||
|
||||
@overload
|
||||
def replace(self, *args: Any, cls: None = None, **kwargs: Any) -> Self: ...
|
||||
|
||||
def replace(
|
||||
self, *args: Any, cls: Optional[Type[Request]] = None, **kwargs: Any
|
||||
self, *args: Any, cls: Optional[type[Request]] = None, **kwargs: Any
|
||||
) -> Request:
|
||||
body_passed = kwargs.get("body", None) is not None
|
||||
data: Any = kwargs.pop("data", None)
|
||||
|
@ -7,22 +7,7 @@ See documentation in docs/topics/request-response.rst
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
AnyStr,
|
||||
Callable,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Mapping,
|
||||
Optional,
|
||||
Tuple,
|
||||
Type,
|
||||
TypeVar,
|
||||
Union,
|
||||
overload,
|
||||
)
|
||||
from typing import TYPE_CHECKING, Any, AnyStr, Optional, TypeVar, Union, overload
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from scrapy.exceptions import NotSupported
|
||||
@ -32,6 +17,7 @@ from scrapy.link import Link
|
||||
from scrapy.utils.trackref import object_ref
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable, Iterable, Mapping
|
||||
from ipaddress import IPv4Address, IPv6Address
|
||||
|
||||
from twisted.internet.ssl import Certificate
|
||||
@ -52,7 +38,7 @@ class Response(object_ref):
|
||||
downloaded (by the Downloader) and fed to the Spiders for processing.
|
||||
"""
|
||||
|
||||
attributes: Tuple[str, ...] = (
|
||||
attributes: tuple[str, ...] = (
|
||||
"url",
|
||||
"status",
|
||||
"headers",
|
||||
@ -74,9 +60,9 @@ class Response(object_ref):
|
||||
self,
|
||||
url: str,
|
||||
status: int = 200,
|
||||
headers: Union[Mapping[AnyStr, Any], Iterable[Tuple[AnyStr, Any]], None] = None,
|
||||
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
|
||||
body: bytes = b"",
|
||||
flags: Optional[List[str]] = None,
|
||||
flags: Optional[list[str]] = None,
|
||||
request: Optional[Request] = None,
|
||||
certificate: Optional[Certificate] = None,
|
||||
ip_address: Union[IPv4Address, IPv6Address, None] = None,
|
||||
@ -87,13 +73,13 @@ class Response(object_ref):
|
||||
self._set_body(body)
|
||||
self._set_url(url)
|
||||
self.request: Optional[Request] = request
|
||||
self.flags: List[str] = [] if flags is None else list(flags)
|
||||
self.flags: list[str] = [] if flags is None else list(flags)
|
||||
self.certificate: Optional[Certificate] = certificate
|
||||
self.ip_address: Union[IPv4Address, IPv6Address, None] = ip_address
|
||||
self.protocol: Optional[str] = protocol
|
||||
|
||||
@property
|
||||
def cb_kwargs(self) -> Dict[str, Any]:
|
||||
def cb_kwargs(self) -> dict[str, Any]:
|
||||
try:
|
||||
return self.request.cb_kwargs # type: ignore[union-attr]
|
||||
except AttributeError:
|
||||
@ -103,7 +89,7 @@ class Response(object_ref):
|
||||
)
|
||||
|
||||
@property
|
||||
def meta(self) -> Dict[str, Any]:
|
||||
def meta(self) -> dict[str, Any]:
|
||||
try:
|
||||
return self.request.meta # type: ignore[union-attr]
|
||||
except AttributeError:
|
||||
@ -149,14 +135,14 @@ class Response(object_ref):
|
||||
|
||||
@overload
|
||||
def replace(
|
||||
self, *args: Any, cls: Type[ResponseTypeVar], **kwargs: Any
|
||||
self, *args: Any, cls: type[ResponseTypeVar], **kwargs: Any
|
||||
) -> ResponseTypeVar: ...
|
||||
|
||||
@overload
|
||||
def replace(self, *args: Any, cls: None = None, **kwargs: Any) -> Self: ...
|
||||
|
||||
def replace(
|
||||
self, *args: Any, cls: Optional[Type[Response]] = None, **kwargs: Any
|
||||
self, *args: Any, cls: Optional[type[Response]] = None, **kwargs: Any
|
||||
) -> Response:
|
||||
"""Create a new Response with the same attributes except for those given new values"""
|
||||
for x in self.attributes:
|
||||
@ -200,16 +186,16 @@ class Response(object_ref):
|
||||
url: Union[str, Link],
|
||||
callback: Optional[CallbackT] = None,
|
||||
method: str = "GET",
|
||||
headers: Union[Mapping[AnyStr, Any], Iterable[Tuple[AnyStr, Any]], None] = None,
|
||||
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
|
||||
body: Optional[Union[bytes, str]] = None,
|
||||
cookies: Optional[CookiesT] = None,
|
||||
meta: Optional[Dict[str, Any]] = None,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
encoding: Optional[str] = "utf-8",
|
||||
priority: int = 0,
|
||||
dont_filter: bool = False,
|
||||
errback: Optional[Callable[[Failure], Any]] = None,
|
||||
cb_kwargs: Optional[Dict[str, Any]] = None,
|
||||
flags: Optional[List[str]] = None,
|
||||
cb_kwargs: Optional[dict[str, Any]] = None,
|
||||
flags: Optional[list[str]] = None,
|
||||
) -> Request:
|
||||
"""
|
||||
Return a :class:`~.Request` instance to follow a link ``url``.
|
||||
@ -253,16 +239,16 @@ class Response(object_ref):
|
||||
urls: Iterable[Union[str, Link]],
|
||||
callback: Optional[CallbackT] = None,
|
||||
method: str = "GET",
|
||||
headers: Union[Mapping[AnyStr, Any], Iterable[Tuple[AnyStr, Any]], None] = None,
|
||||
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
|
||||
body: Optional[Union[bytes, str]] = None,
|
||||
cookies: Optional[CookiesT] = None,
|
||||
meta: Optional[Dict[str, Any]] = None,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
encoding: Optional[str] = "utf-8",
|
||||
priority: int = 0,
|
||||
dont_filter: bool = False,
|
||||
errback: Optional[Callable[[Failure], Any]] = None,
|
||||
cb_kwargs: Optional[Dict[str, Any]] = None,
|
||||
flags: Optional[List[str]] = None,
|
||||
cb_kwargs: Optional[dict[str, Any]] = None,
|
||||
flags: Optional[list[str]] = None,
|
||||
) -> Iterable[Request]:
|
||||
"""
|
||||
.. versionadded:: 2.0
|
||||
|
@ -8,21 +8,9 @@ See documentation in docs/topics/request-response.rst
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from collections.abc import Iterable
|
||||
from contextlib import suppress
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
AnyStr,
|
||||
Callable,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Mapping,
|
||||
Optional,
|
||||
Tuple,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
from typing import TYPE_CHECKING, Any, AnyStr, Optional, Union, cast
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import parsel
|
||||
@ -41,6 +29,8 @@ from scrapy.utils.python import memoizemethod_noargs, to_unicode
|
||||
from scrapy.utils.response import get_base_url
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable, Mapping
|
||||
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from scrapy.http.request import CallbackT, CookiesT, Request
|
||||
@ -54,7 +44,7 @@ class TextResponse(Response):
|
||||
_DEFAULT_ENCODING = "ascii"
|
||||
_cached_decoded_json = _NONE
|
||||
|
||||
attributes: Tuple[str, ...] = Response.attributes + ("encoding",)
|
||||
attributes: tuple[str, ...] = Response.attributes + ("encoding",)
|
||||
|
||||
def __init__(self, *args: Any, **kwargs: Any):
|
||||
self._encoding: Optional[str] = kwargs.pop("encoding", None)
|
||||
@ -183,16 +173,16 @@ class TextResponse(Response):
|
||||
url: Union[str, Link, parsel.Selector],
|
||||
callback: Optional[CallbackT] = None,
|
||||
method: str = "GET",
|
||||
headers: Union[Mapping[AnyStr, Any], Iterable[Tuple[AnyStr, Any]], None] = None,
|
||||
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
|
||||
body: Optional[Union[bytes, str]] = None,
|
||||
cookies: Optional[CookiesT] = None,
|
||||
meta: Optional[Dict[str, Any]] = None,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
encoding: Optional[str] = None,
|
||||
priority: int = 0,
|
||||
dont_filter: bool = False,
|
||||
errback: Optional[Callable[[Failure], Any]] = None,
|
||||
cb_kwargs: Optional[Dict[str, Any]] = None,
|
||||
flags: Optional[List[str]] = None,
|
||||
cb_kwargs: Optional[dict[str, Any]] = None,
|
||||
flags: Optional[list[str]] = None,
|
||||
) -> Request:
|
||||
"""
|
||||
Return a :class:`~.Request` instance to follow a link ``url``.
|
||||
@ -236,16 +226,16 @@ class TextResponse(Response):
|
||||
urls: Union[Iterable[Union[str, Link]], parsel.SelectorList, None] = None,
|
||||
callback: Optional[CallbackT] = None,
|
||||
method: str = "GET",
|
||||
headers: Union[Mapping[AnyStr, Any], Iterable[Tuple[AnyStr, Any]], None] = None,
|
||||
headers: Union[Mapping[AnyStr, Any], Iterable[tuple[AnyStr, Any]], None] = None,
|
||||
body: Optional[Union[bytes, str]] = None,
|
||||
cookies: Optional[CookiesT] = None,
|
||||
meta: Optional[Dict[str, Any]] = None,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
encoding: Optional[str] = None,
|
||||
priority: int = 0,
|
||||
dont_filter: bool = False,
|
||||
errback: Optional[Callable[[Failure], Any]] = None,
|
||||
cb_kwargs: Optional[Dict[str, Any]] = None,
|
||||
flags: Optional[List[str]] = None,
|
||||
cb_kwargs: Optional[dict[str, Any]] = None,
|
||||
flags: Optional[list[str]] = None,
|
||||
css: Optional[str] = None,
|
||||
xpath: Optional[str] = None,
|
||||
) -> Iterable[Request]:
|
||||
|
@ -7,27 +7,21 @@ See documentation in docs/topics/item.rst
from __future__ import annotations

from abc import ABCMeta
from collections.abc import MutableMapping
from copy import deepcopy
from pprint import pformat
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterator,
KeysView,
MutableMapping,
NoReturn,
Tuple,
)
from typing import TYPE_CHECKING, Any, NoReturn

from scrapy.utils.trackref import object_ref

if TYPE_CHECKING:
from collections.abc import Iterator, KeysView

# typing.Self requires Python 3.11
from typing_extensions import Self


class Field(Dict[str, Any]):
class Field(dict[str, Any]):
"""Container of field metadata"""


@ -38,7 +32,7 @@ class ItemMeta(ABCMeta):
"""

def __new__(
mcs, class_name: str, bases: Tuple[type, ...], attrs: Dict[str, Any]
mcs, class_name: str, bases: tuple[type, ...], attrs: dict[str, Any]
) -> ItemMeta:
classcell = attrs.pop("__classcell__", None)
new_bases = tuple(base._class for base in bases if hasattr(base, "_class"))
@ -83,10 +77,10 @@ class Item(MutableMapping[str, Any], object_ref, metaclass=ItemMeta):
:ref:`tracked <topics-leaks-trackrefs>` to debug memory leaks.
"""

fields: Dict[str, Field]
fields: dict[str, Field]

def __init__(self, *args: Any, **kwargs: Any):
self._values: Dict[str, Any] = {}
self._values: dict[str, Any] = {}
if args or kwargs: # avoid creating dict for most common case
for k, v in dict(*args, **kwargs).items():
self[k] = v
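
Note that `Field` switches its *base class* from `Dict[str, Any]` to `dict[str, Any]`, not just an annotation: parametrized built-ins can be subclassed at runtime from Python 3.9 on. A tiny sketch of that behaviour (field names here are made up for illustration):

# --- illustrative example, not part of the diff ---
from typing import Any


class Field(dict[str, Any]):
    """Container of field metadata (mirrors the hunk above)."""
    # dict[str, Any] works as a base class at runtime on Python 3.9+ (PEP 585),
    # so typing.Dict is unnecessary even outside annotations.


price_field = Field(serializer=str, default=0)
print(price_field["serializer"])  # <class 'str'>
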
@ -6,8 +6,13 @@ This package contains a collection of Link Extractors.
For more info see docs/topics/link-extractors.rst
"""

import re
from typing import Iterable, Pattern
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
from collections.abc import Iterable
from re import Pattern

# common file extensions that are not followed if they occur in links
IGNORED_EXTENSIONS = [
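
This file (and lxmlhtml.py below) also swaps `typing.Pattern` for `re.Pattern`: the `typing` alias is deprecated and removed in Python 3.12, while `re.Pattern[str]` is subscriptable from 3.9 on under PEP 585. A small stand-alone sketch of how such a compiled-pattern list might be used (assumed names, not Scrapy code):

# --- illustrative example, not part of the diff ---
import re

# re.Pattern[str] is valid at runtime on Python 3.9+.
_IGNORED: list[re.Pattern[str]] = [re.compile(r"\.pdf$", re.I), re.compile(r"\.zip$", re.I)]


def is_ignored(url: str) -> bool:
    # True when any compiled pattern matches the URL.
    return any(p.search(url) for p in _IGNORED)


print(is_ignored("https://example.com/report.PDF"))  # True
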
@ -6,20 +6,10 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import operator
|
||||
import re
|
||||
from collections.abc import Callable, Iterable
|
||||
from functools import partial
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Pattern,
|
||||
Set,
|
||||
Tuple,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union, cast
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
from lxml import etree # nosec
|
||||
@ -28,13 +18,14 @@ from w3lib.html import strip_html5_whitespace
|
||||
from w3lib.url import canonicalize_url, safe_url_string
|
||||
|
||||
from scrapy.link import Link
|
||||
from scrapy.linkextractors import IGNORED_EXTENSIONS, _is_valid_url, _matches, re
|
||||
from scrapy.linkextractors import IGNORED_EXTENSIONS, _is_valid_url, _matches
|
||||
from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
|
||||
from scrapy.utils.python import unique as unique_list
|
||||
from scrapy.utils.response import get_base_url
|
||||
from scrapy.utils.url import url_has_any_extension, url_is_from_any_domain
|
||||
|
||||
if TYPE_CHECKING:
|
||||
|
||||
from lxml.html import HtmlElement # nosec
|
||||
|
||||
from scrapy import Selector
|
||||
@ -98,7 +89,7 @@ class LxmlParserLinkExtractor:
|
||||
|
||||
def _iter_links(
|
||||
self, document: HtmlElement
|
||||
) -> Iterable[Tuple[HtmlElement, str, str]]:
|
||||
) -> Iterable[tuple[HtmlElement, str, str]]:
|
||||
for el in document.iter(etree.Element):
|
||||
if not self.scan_tag(_nons(el.tag)):
|
||||
continue
|
||||
@ -114,8 +105,8 @@ class LxmlParserLinkExtractor:
|
||||
response_url: str,
|
||||
response_encoding: str,
|
||||
base_url: str,
|
||||
) -> List[Link]:
|
||||
links: List[Link] = []
|
||||
) -> list[Link]:
|
||||
links: list[Link] = []
|
||||
# hacky way to get the underlying lxml parsed document
|
||||
for el, attr, attr_val in self._iter_links(selector.root):
|
||||
# pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
|
||||
@ -145,26 +136,26 @@ class LxmlParserLinkExtractor:
|
||||
links.append(link)
|
||||
return self._deduplicate_if_needed(links)
|
||||
|
||||
def extract_links(self, response: TextResponse) -> List[Link]:
|
||||
def extract_links(self, response: TextResponse) -> list[Link]:
|
||||
base_url = get_base_url(response)
|
||||
return self._extract_links(
|
||||
response.selector, response.url, response.encoding, base_url
|
||||
)
|
||||
|
||||
def _process_links(self, links: List[Link]) -> List[Link]:
|
||||
def _process_links(self, links: list[Link]) -> list[Link]:
|
||||
"""Normalize and filter extracted links
|
||||
|
||||
The subclass should override it if necessary
|
||||
"""
|
||||
return self._deduplicate_if_needed(links)
|
||||
|
||||
def _deduplicate_if_needed(self, links: List[Link]) -> List[Link]:
|
||||
def _deduplicate_if_needed(self, links: list[Link]) -> list[Link]:
|
||||
if self.unique:
|
||||
return unique_list(links, key=self.link_key)
|
||||
return links
|
||||
|
||||
|
||||
_RegexT = Union[str, Pattern[str]]
|
||||
_RegexT = Union[str, re.Pattern[str]]
|
||||
_RegexOrSeveralT = Union[_RegexT, Iterable[_RegexT]]
|
||||
|
||||
|
||||
@ -197,13 +188,13 @@ class LxmlLinkExtractor:
|
||||
strip=strip,
|
||||
canonicalized=not canonicalize,
|
||||
)
|
||||
self.allow_res: List[Pattern[str]] = self._compile_regexes(allow)
|
||||
self.deny_res: List[Pattern[str]] = self._compile_regexes(deny)
|
||||
self.allow_res: list[re.Pattern[str]] = self._compile_regexes(allow)
|
||||
self.deny_res: list[re.Pattern[str]] = self._compile_regexes(deny)
|
||||
|
||||
self.allow_domains: Set[str] = set(arg_to_iter(allow_domains))
|
||||
self.deny_domains: Set[str] = set(arg_to_iter(deny_domains))
|
||||
self.allow_domains: set[str] = set(arg_to_iter(allow_domains))
|
||||
self.deny_domains: set[str] = set(arg_to_iter(deny_domains))
|
||||
|
||||
self.restrict_xpaths: Tuple[str, ...] = tuple(arg_to_iter(restrict_xpaths))
|
||||
self.restrict_xpaths: tuple[str, ...] = tuple(arg_to_iter(restrict_xpaths))
|
||||
self.restrict_xpaths += tuple(
|
||||
map(self._csstranslator.css_to_xpath, arg_to_iter(restrict_css))
|
||||
)
|
||||
@ -211,11 +202,11 @@ class LxmlLinkExtractor:
|
||||
if deny_extensions is None:
|
||||
deny_extensions = IGNORED_EXTENSIONS
|
||||
self.canonicalize: bool = canonicalize
|
||||
self.deny_extensions: Set[str] = {"." + e for e in arg_to_iter(deny_extensions)}
|
||||
self.restrict_text: List[Pattern[str]] = self._compile_regexes(restrict_text)
|
||||
self.deny_extensions: set[str] = {"." + e for e in arg_to_iter(deny_extensions)}
|
||||
self.restrict_text: list[re.Pattern[str]] = self._compile_regexes(restrict_text)
|
||||
|
||||
@staticmethod
|
||||
def _compile_regexes(value: Optional[_RegexOrSeveralT]) -> List[Pattern[str]]:
|
||||
def _compile_regexes(value: Optional[_RegexOrSeveralT]) -> list[re.Pattern[str]]:
|
||||
return [
|
||||
x if isinstance(x, re.Pattern) else re.compile(x)
|
||||
for x in arg_to_iter(value)
|
||||
@ -257,7 +248,7 @@ class LxmlLinkExtractor:
|
||||
denied = (regex.search(url) for regex in self.deny_res) if self.deny_res else []
|
||||
return any(allowed) and not any(denied)
|
||||
|
||||
def _process_links(self, links: List[Link]) -> List[Link]:
|
||||
def _process_links(self, links: list[Link]) -> list[Link]:
|
||||
links = [x for x in links if self._link_allowed(x)]
|
||||
if self.canonicalize:
|
||||
for link in links:
|
||||
@ -265,10 +256,10 @@ class LxmlLinkExtractor:
|
||||
links = self.link_extractor._process_links(links)
|
||||
return links
|
||||
|
||||
def _extract_links(self, *args: Any, **kwargs: Any) -> List[Link]:
|
||||
def _extract_links(self, *args: Any, **kwargs: Any) -> list[Link]:
|
||||
return self.link_extractor._extract_links(*args, **kwargs)
|
||||
|
||||
def extract_links(self, response: TextResponse) -> List[Link]:
|
||||
def extract_links(self, response: TextResponse) -> list[Link]:
|
||||
"""Returns a list of :class:`~scrapy.link.Link` objects from the
|
||||
specified :class:`response <scrapy.http.Response>`.
|
||||
|
||||
|
@ -2,7 +2,7 @@ from __future__ import annotations

import logging
import os
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, TypedDict, Union
from typing import TYPE_CHECKING, Any, Optional, TypedDict, Union

from twisted.python.failure import Failure

@ -31,7 +31,7 @@ DOWNLOADERRORMSG_LONG = "Error downloading %(request)s: %(errmsg)s"
class LogFormatterResult(TypedDict):
level: int
msg: str
args: Union[Dict[str, Any], Tuple[Any, ...]]
args: Union[dict[str, Any], tuple[Any, ...]]


class LogFormatter:
@ -181,7 +181,7 @@ class LogFormatter:

.. versionadded:: 2.0
"""
args: Dict[str, Any] = {"request": request}
args: dict[str, Any] = {"request": request}
if errmsg:
msg = DOWNLOADERRORMSG_LONG
args["errmsg"] = errmsg
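
`TypedDict` definitions work unchanged with the built-in generics, as the `LogFormatterResult` hunk above shows. A minimal sketch of the same shape (the sample values are invented for illustration):

# --- illustrative example, not part of the diff ---
from typing import Any, TypedDict, Union


class LogResult(TypedDict):
    # Same shape as the hunk above: message args may be a mapping or a positional tuple.
    level: int
    msg: str
    args: Union[dict[str, Any], tuple[Any, ...]]


result: LogResult = {"level": 20, "msg": "Crawled %(url)s", "args": {"url": "https://example.com"}}
print(result["msg"] % result["args"])  # Crawled https://example.com
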
@ -14,18 +14,7 @@ from email.mime.nonmultipart import MIMENonMultipart
|
||||
from email.mime.text import MIMEText
|
||||
from email.utils import formatdate
|
||||
from io import BytesIO
|
||||
from typing import (
|
||||
IO,
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
Union,
|
||||
)
|
||||
from typing import IO, TYPE_CHECKING, Any, Optional, Union
|
||||
|
||||
from twisted import version as twisted_version
|
||||
from twisted.internet import ssl
|
||||
@ -36,6 +25,8 @@ from scrapy.utils.misc import arg_to_iter
|
||||
from scrapy.utils.python import to_bytes
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable, Sequence
|
||||
|
||||
# imports twisted.internet.reactor
|
||||
from twisted.mail.smtp import ESMTPSenderFactory
|
||||
from twisted.python.failure import Failure
|
||||
@ -95,11 +86,11 @@ class MailSender:
|
||||
|
||||
def send(
|
||||
self,
|
||||
to: Union[str, List[str]],
|
||||
to: Union[str, list[str]],
|
||||
subject: str,
|
||||
body: str,
|
||||
cc: Union[str, List[str], None] = None,
|
||||
attachs: Sequence[Tuple[str, str, IO[Any]]] = (),
|
||||
cc: Union[str, list[str], None] = None,
|
||||
attachs: Sequence[tuple[str, str, IO[Any]]] = (),
|
||||
mimetype: str = "text/plain",
|
||||
charset: Optional[str] = None,
|
||||
_callback: Optional[Callable[..., None]] = None,
|
||||
@ -164,7 +155,7 @@ class MailSender:
|
||||
return dfd
|
||||
|
||||
def _sent_ok(
|
||||
self, result: Any, to: List[str], cc: List[str], subject: str, nattachs: int
|
||||
self, result: Any, to: list[str], cc: list[str], subject: str, nattachs: int
|
||||
) -> None:
|
||||
logger.info(
|
||||
"Mail sent OK: To=%(mailto)s Cc=%(mailcc)s "
|
||||
@ -180,8 +171,8 @@ class MailSender:
|
||||
def _sent_failed(
|
||||
self,
|
||||
failure: Failure,
|
||||
to: List[str],
|
||||
cc: List[str],
|
||||
to: list[str],
|
||||
cc: list[str],
|
||||
subject: str,
|
||||
nattachs: int,
|
||||
) -> Failure:
|
||||
@ -200,7 +191,7 @@ class MailSender:
|
||||
)
|
||||
return failure
|
||||
|
||||
def _sendmail(self, to_addrs: List[str], msg: bytes) -> Deferred[Any]:
|
||||
def _sendmail(self, to_addrs: list[str], msg: bytes) -> Deferred[Any]:
|
||||
from twisted.internet import reactor
|
||||
|
||||
msg_io = BytesIO(msg)
|
||||
@ -218,11 +209,11 @@ class MailSender:
|
||||
return d
|
||||
|
||||
def _create_sender_factory(
|
||||
self, to_addrs: List[str], msg: IO[bytes], d: Deferred[Any]
|
||||
self, to_addrs: list[str], msg: IO[bytes], d: Deferred[Any]
|
||||
) -> ESMTPSenderFactory:
|
||||
from twisted.mail.smtp import ESMTPSenderFactory
|
||||
|
||||
factory_keywords: Dict[str, Any] = {
|
||||
factory_keywords: dict[str, Any] = {
|
||||
"heloFallback": True,
|
||||
"requireAuthentication": False,
|
||||
"requireTransportSecurity": self.smtptls,
|
||||
|
@ -3,26 +3,15 @@ from __future__ import annotations
|
||||
import logging
|
||||
import pprint
|
||||
from collections import defaultdict, deque
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
Deque,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
TypeVar,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.utils.defer import process_chain, process_parallel
|
||||
from scrapy.utils.misc import build_from_crawler, build_from_settings, load_object
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable, Iterable
|
||||
|
||||
from twisted.internet.defer import Deferred
|
||||
|
||||
# typing.Concatenate and typing.ParamSpec require Python 3.10
|
||||
@ -51,14 +40,14 @@ class MiddlewareManager:
|
||||
self.middlewares = middlewares
|
||||
# Only process_spider_output and process_spider_exception can be None.
|
||||
# Only process_spider_output can be a tuple, and only until _async compatibility methods are removed.
|
||||
self.methods: Dict[
|
||||
str, Deque[Union[None, Callable, Tuple[Callable, Callable]]]
|
||||
self.methods: dict[
|
||||
str, deque[Union[None, Callable, tuple[Callable, Callable]]]
|
||||
] = defaultdict(deque)
|
||||
for mw in middlewares:
|
||||
self._add_middleware(mw)
|
||||
|
||||
@classmethod
|
||||
def _get_mwlist_from_settings(cls, settings: Settings) -> List[Any]:
|
||||
def _get_mwlist_from_settings(cls, settings: Settings) -> list[Any]:
|
||||
raise NotImplementedError
|
||||
|
||||
@classmethod
|
||||
@ -107,7 +96,7 @@ class MiddlewareManager:
|
||||
|
||||
def _process_parallel(
|
||||
self, methodname: str, obj: _T, *args: Any
|
||||
) -> Deferred[List[_T2]]:
|
||||
) -> Deferred[list[_T2]]:
|
||||
methods = cast(
|
||||
"Iterable[Callable[Concatenate[_T, _P], _T2]]", self.methods[methodname]
|
||||
)
|
||||
@ -119,8 +108,8 @@ class MiddlewareManager:
|
||||
)
|
||||
return process_chain(methods, obj, *args)
|
||||
|
||||
def open_spider(self, spider: Spider) -> Deferred[List[None]]:
|
||||
def open_spider(self, spider: Spider) -> Deferred[list[None]]:
|
||||
return self._process_parallel("open_spider", spider)
|
||||
|
||||
def close_spider(self, spider: Spider) -> Deferred[List[None]]:
|
||||
def close_spider(self, spider: Spider) -> Deferred[list[None]]:
|
||||
return self._process_parallel("close_spider", spider)
|
||||
|
@ -6,7 +6,7 @@ See documentation in docs/item-pipeline.rst

from __future__ import annotations

from typing import TYPE_CHECKING, Any, List
from typing import TYPE_CHECKING, Any

from scrapy.middleware import MiddlewareManager
from scrapy.utils.conf import build_component_list
@ -23,7 +23,7 @@ class ItemPipelineManager(MiddlewareManager):
component_name = "item pipeline"

@classmethod
def _get_mwlist_from_settings(cls, settings: Settings) -> List[Any]:
def _get_mwlist_from_settings(cls, settings: Settings) -> list[Any]:
return build_component_list(settings.getwithbase("ITEM_PIPELINES"))

def _add_middleware(self, pipe: Any) -> None:
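
The same pattern extends to the concrete `collections` classes: the middleware and pipeline hunks nearby replace `typing.Deque` and `typing.DefaultDict` with `collections.deque` and `collections.defaultdict`, which accept type parameters directly from Python 3.9. A hedged sketch with hypothetical contents:

# --- illustrative example, not part of the diff ---
from collections import defaultdict, deque
from collections.abc import Callable
from typing import Any

# deque and defaultdict take type parameters at runtime on Python 3.9+ (PEP 585),
# so typing.Deque / typing.DefaultDict are no longer needed.
methods: defaultdict[str, deque[Callable[..., Any]]] = defaultdict(deque)

methods["open_spider"].append(lambda spider: None)
print(len(methods["open_spider"]))  # 1
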
@ -21,15 +21,9 @@ from typing import (
|
||||
IO,
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
DefaultDict,
|
||||
Dict,
|
||||
List,
|
||||
NoReturn,
|
||||
Optional,
|
||||
Protocol,
|
||||
Set,
|
||||
Type,
|
||||
TypedDict,
|
||||
Union,
|
||||
cast,
|
||||
@ -53,6 +47,7 @@ from scrapy.utils.python import to_bytes
|
||||
from scrapy.utils.request import referer_str
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable
|
||||
from os import PathLike
|
||||
|
||||
from twisted.python.failure import Failure
|
||||
@ -104,8 +99,8 @@ class FilesStoreProtocol(Protocol):
|
||||
path: str,
|
||||
buf: BytesIO,
|
||||
info: MediaPipeline.SpiderInfo,
|
||||
meta: Optional[Dict[str, Any]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
) -> Optional[Deferred[Any]]: ...
|
||||
|
||||
def stat_file(
|
||||
@ -120,7 +115,7 @@ class FSFilesStore:
|
||||
basedir = basedir.split("://", 1)[1]
|
||||
self.basedir: str = basedir
|
||||
self._mkdir(Path(self.basedir))
|
||||
self.created_directories: DefaultDict[MediaPipeline.SpiderInfo, Set[str]] = (
|
||||
self.created_directories: defaultdict[MediaPipeline.SpiderInfo, set[str]] = (
|
||||
defaultdict(set)
|
||||
)
|
||||
|
||||
@ -129,8 +124,8 @@ class FSFilesStore:
|
||||
path: Union[str, PathLike[str]],
|
||||
buf: BytesIO,
|
||||
info: MediaPipeline.SpiderInfo,
|
||||
meta: Optional[Dict[str, Any]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
) -> None:
|
||||
absolute_path = self._get_filesystem_path(path)
|
||||
self._mkdir(absolute_path.parent, info)
|
||||
@ -157,7 +152,7 @@ class FSFilesStore:
|
||||
def _mkdir(
|
||||
self, dirname: Path, domain: Optional[MediaPipeline.SpiderInfo] = None
|
||||
) -> None:
|
||||
seen: Set[str] = self.created_directories[domain] if domain else set()
|
||||
seen: set[str] = self.created_directories[domain] if domain else set()
|
||||
if str(dirname) not in seen:
|
||||
if not dirname.exists():
|
||||
dirname.mkdir(parents=True)
|
||||
@ -201,7 +196,7 @@ class S3FilesStore:
|
||||
def stat_file(
|
||||
self, path: str, info: MediaPipeline.SpiderInfo
|
||||
) -> Deferred[StatInfo]:
|
||||
def _onsuccess(boto_key: Dict[str, Any]) -> StatInfo:
|
||||
def _onsuccess(boto_key: dict[str, Any]) -> StatInfo:
|
||||
checksum = boto_key["ETag"].strip('"')
|
||||
last_modified = boto_key["LastModified"]
|
||||
modified_stamp = time.mktime(last_modified.timetuple())
|
||||
@ -209,10 +204,10 @@ class S3FilesStore:
|
||||
|
||||
return self._get_boto_key(path).addCallback(_onsuccess)
|
||||
|
||||
def _get_boto_key(self, path: str) -> Deferred[Dict[str, Any]]:
|
||||
def _get_boto_key(self, path: str) -> Deferred[dict[str, Any]]:
|
||||
key_name = f"{self.prefix}{path}"
|
||||
return cast(
|
||||
"Deferred[Dict[str, Any]]",
|
||||
"Deferred[dict[str, Any]]",
|
||||
deferToThread(
|
||||
self.s3_client.head_object, Bucket=self.bucket, Key=key_name # type: ignore[attr-defined]
|
||||
),
|
||||
@ -223,8 +218,8 @@ class S3FilesStore:
|
||||
path: str,
|
||||
buf: BytesIO,
|
||||
info: MediaPipeline.SpiderInfo,
|
||||
meta: Optional[Dict[str, Any]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
) -> Deferred[Any]:
|
||||
"""Upload file to S3 storage"""
|
||||
key_name = f"{self.prefix}{path}"
|
||||
@ -242,7 +237,7 @@ class S3FilesStore:
|
||||
**extra,
|
||||
)
|
||||
|
||||
def _headers_to_botocore_kwargs(self, headers: Dict[str, Any]) -> Dict[str, Any]:
|
||||
def _headers_to_botocore_kwargs(self, headers: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Convert headers to botocore keyword arguments."""
|
||||
# This is required while we need to support both boto and botocore.
|
||||
mapping = CaseInsensitiveDict(
|
||||
@ -274,7 +269,7 @@ class S3FilesStore:
|
||||
"X-Amz-Website-Redirect-Location": "WebsiteRedirectLocation",
|
||||
}
|
||||
)
|
||||
extra: Dict[str, Any] = {}
|
||||
extra: dict[str, Any] = {}
|
||||
for key, value in headers.items():
|
||||
try:
|
||||
kwarg = mapping[key]
|
||||
@ -332,7 +327,7 @@ class GCSFilesStore:
|
||||
deferToThread(self.bucket.get_blob, blob_path).addCallback(_onsuccess),
|
||||
)
|
||||
|
||||
def _get_content_type(self, headers: Optional[Dict[str, str]]) -> str:
|
||||
def _get_content_type(self, headers: Optional[dict[str, str]]) -> str:
|
||||
if headers and "Content-Type" in headers:
|
||||
return headers["Content-Type"]
|
||||
return "application/octet-stream"
|
||||
@ -345,8 +340,8 @@ class GCSFilesStore:
|
||||
path: str,
|
||||
buf: BytesIO,
|
||||
info: MediaPipeline.SpiderInfo,
|
||||
meta: Optional[Dict[str, Any]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
) -> Deferred[Any]:
|
||||
blob_path = self._get_blob_path(path)
|
||||
blob = self.bucket.blob(blob_path)
|
||||
@ -385,8 +380,8 @@ class FTPFilesStore:
|
||||
path: str,
|
||||
buf: BytesIO,
|
||||
info: MediaPipeline.SpiderInfo,
|
||||
meta: Optional[Dict[str, Any]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
meta: Optional[dict[str, Any]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
) -> Deferred[Any]:
|
||||
path = f"{self.basedir}/{path}"
|
||||
return deferToThread(
|
||||
@ -443,7 +438,7 @@ class FilesPipeline(MediaPipeline):
|
||||
|
||||
MEDIA_NAME: str = "file"
|
||||
EXPIRES: int = 90
|
||||
STORE_SCHEMES: Dict[str, Type[FilesStoreProtocol]] = {
|
||||
STORE_SCHEMES: dict[str, type[FilesStoreProtocol]] = {
|
||||
"": FSFilesStore,
|
||||
"file": FSFilesStore,
|
||||
"s3": S3FilesStore,
|
||||
@ -457,7 +452,7 @@ class FilesPipeline(MediaPipeline):
|
||||
self,
|
||||
store_uri: Union[str, PathLike[str]],
|
||||
download_func: Optional[Callable[[Request, Spider], Response]] = None,
|
||||
settings: Union[Settings, Dict[str, Any], None] = None,
|
||||
settings: Union[Settings, dict[str, Any], None] = None,
|
||||
):
|
||||
store_uri = _to_string(store_uri)
|
||||
if not store_uri:
|
||||
@ -486,7 +481,7 @@ class FilesPipeline(MediaPipeline):
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings: Settings) -> Self:
|
||||
s3store: Type[S3FilesStore] = cast(Type[S3FilesStore], cls.STORE_SCHEMES["s3"])
|
||||
s3store: type[S3FilesStore] = cast(type[S3FilesStore], cls.STORE_SCHEMES["s3"])
|
||||
s3store.AWS_ACCESS_KEY_ID = settings["AWS_ACCESS_KEY_ID"]
|
||||
s3store.AWS_SECRET_ACCESS_KEY = settings["AWS_SECRET_ACCESS_KEY"]
|
||||
s3store.AWS_SESSION_TOKEN = settings["AWS_SESSION_TOKEN"]
|
||||
@ -496,14 +491,14 @@ class FilesPipeline(MediaPipeline):
|
||||
s3store.AWS_VERIFY = settings["AWS_VERIFY"]
|
||||
s3store.POLICY = settings["FILES_STORE_S3_ACL"]
|
||||
|
||||
gcs_store: Type[GCSFilesStore] = cast(
|
||||
Type[GCSFilesStore], cls.STORE_SCHEMES["gs"]
|
||||
gcs_store: type[GCSFilesStore] = cast(
|
||||
type[GCSFilesStore], cls.STORE_SCHEMES["gs"]
|
||||
)
|
||||
gcs_store.GCS_PROJECT_ID = settings["GCS_PROJECT_ID"]
|
||||
gcs_store.POLICY = settings["FILES_STORE_GCS_ACL"] or None
|
||||
|
||||
ftp_store: Type[FTPFilesStore] = cast(
|
||||
Type[FTPFilesStore], cls.STORE_SCHEMES["ftp"]
|
||||
ftp_store: type[FTPFilesStore] = cast(
|
||||
type[FTPFilesStore], cls.STORE_SCHEMES["ftp"]
|
||||
)
|
||||
ftp_store.FTP_USERNAME = settings["FTP_USER"]
|
||||
ftp_store.FTP_PASSWORD = settings["FTP_PASSWORD"]
|
||||
@ -660,7 +655,7 @@ class FilesPipeline(MediaPipeline):
|
||||
# Overridable Interface
|
||||
def get_media_requests(
|
||||
self, item: Any, info: MediaPipeline.SpiderInfo
|
||||
) -> List[Request]:
|
||||
) -> list[Request]:
|
||||
urls = ItemAdapter(item).get(self.files_urls_field, [])
|
||||
return [Request(u, callback=NO_CALLBACK) for u in urls]
|
||||
|
||||
@ -680,7 +675,7 @@ class FilesPipeline(MediaPipeline):
|
||||
return checksum
|
||||
|
||||
def item_completed(
|
||||
self, results: List[FileInfoOrError], item: Any, info: MediaPipeline.SpiderInfo
|
||||
self, results: list[FileInfoOrError], item: Any, info: MediaPipeline.SpiderInfo
|
||||
) -> Any:
|
||||
with suppress(KeyError):
|
||||
ItemAdapter(item)[self.files_result_field] = [x for ok, x in results if ok]
|
||||
|
@ -11,19 +11,7 @@ import hashlib
|
||||
import warnings
|
||||
from contextlib import suppress
|
||||
from io import BytesIO
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
Type,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union, cast
|
||||
|
||||
from itemadapter import ItemAdapter
|
||||
|
||||
@ -42,6 +30,7 @@ from scrapy.settings import Settings
|
||||
from scrapy.utils.python import get_func_args, to_bytes
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable, Iterable
|
||||
from os import PathLike
|
||||
|
||||
from PIL import Image
|
||||
@ -79,7 +68,7 @@ class ImagesPipeline(FilesPipeline):
|
||||
MIN_WIDTH: int = 0
|
||||
MIN_HEIGHT: int = 0
|
||||
EXPIRES: int = 90
|
||||
THUMBS: Dict[str, Tuple[int, int]] = {}
|
||||
THUMBS: dict[str, tuple[int, int]] = {}
|
||||
DEFAULT_IMAGES_URLS_FIELD = "image_urls"
|
||||
DEFAULT_IMAGES_RESULT_FIELD = "images"
|
||||
|
||||
@ -87,7 +76,7 @@ class ImagesPipeline(FilesPipeline):
|
||||
self,
|
||||
store_uri: Union[str, PathLike[str]],
|
||||
download_func: Optional[Callable[[Request, Spider], Response]] = None,
|
||||
settings: Union[Settings, Dict[str, Any], None] = None,
|
||||
settings: Union[Settings, dict[str, Any], None] = None,
|
||||
):
|
||||
try:
|
||||
from PIL import Image
|
||||
@ -127,7 +116,7 @@ class ImagesPipeline(FilesPipeline):
|
||||
self.min_height: int = settings.getint(
|
||||
resolve("IMAGES_MIN_HEIGHT"), self.MIN_HEIGHT
|
||||
)
|
||||
self.thumbs: Dict[str, Tuple[int, int]] = settings.get(
|
||||
self.thumbs: dict[str, tuple[int, int]] = settings.get(
|
||||
resolve("IMAGES_THUMBS"), self.THUMBS
|
||||
)
|
||||
|
||||
@ -135,7 +124,7 @@ class ImagesPipeline(FilesPipeline):
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings: Settings) -> Self:
|
||||
s3store: Type[S3FilesStore] = cast(Type[S3FilesStore], cls.STORE_SCHEMES["s3"])
|
||||
s3store: type[S3FilesStore] = cast(type[S3FilesStore], cls.STORE_SCHEMES["s3"])
|
||||
s3store.AWS_ACCESS_KEY_ID = settings["AWS_ACCESS_KEY_ID"]
|
||||
s3store.AWS_SECRET_ACCESS_KEY = settings["AWS_SECRET_ACCESS_KEY"]
|
||||
s3store.AWS_SESSION_TOKEN = settings["AWS_SESSION_TOKEN"]
|
||||
@ -145,14 +134,14 @@ class ImagesPipeline(FilesPipeline):
|
||||
s3store.AWS_VERIFY = settings["AWS_VERIFY"]
|
||||
s3store.POLICY = settings["IMAGES_STORE_S3_ACL"]
|
||||
|
||||
gcs_store: Type[GCSFilesStore] = cast(
|
||||
Type[GCSFilesStore], cls.STORE_SCHEMES["gs"]
|
||||
gcs_store: type[GCSFilesStore] = cast(
|
||||
type[GCSFilesStore], cls.STORE_SCHEMES["gs"]
|
||||
)
|
||||
gcs_store.GCS_PROJECT_ID = settings["GCS_PROJECT_ID"]
|
||||
gcs_store.POLICY = settings["IMAGES_STORE_GCS_ACL"] or None
|
||||
|
||||
ftp_store: Type[FTPFilesStore] = cast(
|
||||
Type[FTPFilesStore], cls.STORE_SCHEMES["ftp"]
|
||||
ftp_store: type[FTPFilesStore] = cast(
|
||||
type[FTPFilesStore], cls.STORE_SCHEMES["ftp"]
|
||||
)
|
||||
ftp_store.FTP_USERNAME = settings["FTP_USER"]
|
||||
ftp_store.FTP_PASSWORD = settings["FTP_PASSWORD"]
|
||||
@ -202,7 +191,7 @@ class ImagesPipeline(FilesPipeline):
|
||||
info: MediaPipeline.SpiderInfo,
|
||||
*,
|
||||
item: Any = None,
|
||||
) -> Iterable[Tuple[str, Image.Image, BytesIO]]:
|
||||
) -> Iterable[tuple[str, Image.Image, BytesIO]]:
|
||||
path = self.file_path(request, response=response, info=info, item=item)
|
||||
orig_image = self._Image.open(BytesIO(response.body))
|
||||
|
||||
@ -246,9 +235,9 @@ class ImagesPipeline(FilesPipeline):
|
||||
def convert_image(
|
||||
self,
|
||||
image: Image.Image,
|
||||
size: Optional[Tuple[int, int]] = None,
|
||||
size: Optional[tuple[int, int]] = None,
|
||||
response_body: Optional[BytesIO] = None,
|
||||
) -> Tuple[Image.Image, BytesIO]:
|
||||
) -> tuple[Image.Image, BytesIO]:
|
||||
if response_body is None:
|
||||
warnings.warn(
|
||||
f"{self.__class__.__name__}.convert_image() method called in a deprecated way, "
|
||||
@ -288,12 +277,12 @@ class ImagesPipeline(FilesPipeline):
|
||||
|
||||
def get_media_requests(
|
||||
self, item: Any, info: MediaPipeline.SpiderInfo
|
||||
) -> List[Request]:
|
||||
) -> list[Request]:
|
||||
urls = ItemAdapter(item).get(self.images_urls_field, [])
|
||||
return [Request(u, callback=NO_CALLBACK) for u in urls]
|
||||
|
||||
def item_completed(
|
||||
self, results: List[FileInfoOrError], item: Any, info: MediaPipeline.SpiderInfo
|
||||
self, results: list[FileInfoOrError], item: Any, info: MediaPipeline.SpiderInfo
|
||||
) -> Any:
|
||||
with suppress(KeyError):
|
||||
ItemAdapter(item)[self.images_result_field] = [x for ok, x in results if ok]
|
||||
|
@ -7,15 +7,9 @@ from collections import defaultdict
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
DefaultDict,
|
||||
Dict,
|
||||
List,
|
||||
Literal,
|
||||
NoReturn,
|
||||
Optional,
|
||||
Set,
|
||||
Tuple,
|
||||
TypedDict,
|
||||
TypeVar,
|
||||
Union,
|
||||
@ -33,6 +27,8 @@ from scrapy.utils.log import failure_to_exc_info
|
||||
from scrapy.utils.misc import arg_to_iter
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable
|
||||
|
||||
# typing.Self requires Python 3.11
|
||||
from typing_extensions import Self
|
||||
|
||||
@ -52,7 +48,7 @@ class FileInfo(TypedDict):
|
||||
status: str
|
||||
|
||||
|
||||
FileInfoOrError = Union[Tuple[Literal[True], FileInfo], Tuple[Literal[False], Failure]]
|
||||
FileInfoOrError = Union[tuple[Literal[True], FileInfo], tuple[Literal[False], Failure]]
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -67,16 +63,16 @@ class MediaPipeline(ABC):
|
||||
class SpiderInfo:
|
||||
def __init__(self, spider: Spider):
|
||||
self.spider: Spider = spider
|
||||
self.downloading: Set[bytes] = set()
|
||||
self.downloaded: Dict[bytes, Union[FileInfo, Failure]] = {}
|
||||
self.waiting: DefaultDict[bytes, List[Deferred[FileInfo]]] = defaultdict(
|
||||
self.downloading: set[bytes] = set()
|
||||
self.downloaded: dict[bytes, Union[FileInfo, Failure]] = {}
|
||||
self.waiting: defaultdict[bytes, list[Deferred[FileInfo]]] = defaultdict(
|
||||
list
|
||||
)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
download_func: Optional[Callable[[Request, Spider], Response]] = None,
|
||||
settings: Union[Settings, Dict[str, Any], None] = None,
|
||||
settings: Union[Settings, dict[str, Any], None] = None,
|
||||
):
|
||||
self.download_func = download_func
|
||||
|
||||
@ -129,12 +125,12 @@ class MediaPipeline(ABC):
|
||||
|
||||
def process_item(
|
||||
self, item: Any, spider: Spider
|
||||
) -> Deferred[List[FileInfoOrError]]:
|
||||
) -> Deferred[list[FileInfoOrError]]:
|
||||
info = self.spiderinfo
|
||||
requests = arg_to_iter(self.get_media_requests(item, info))
|
||||
dlist = [self._process_request(r, info, item) for r in requests]
|
||||
dfd = cast(
|
||||
"Deferred[List[FileInfoOrError]]", DeferredList(dlist, consumeErrors=True)
|
||||
"Deferred[list[FileInfoOrError]]", DeferredList(dlist, consumeErrors=True)
|
||||
)
|
||||
return dfd.addCallback(self.item_completed, item, info)
|
||||
|
||||
@ -252,7 +248,7 @@ class MediaPipeline(ABC):
|
||||
raise NotImplementedError()
|
||||
|
||||
@abstractmethod
|
||||
def get_media_requests(self, item: Any, info: SpiderInfo) -> List[Request]:
|
||||
def get_media_requests(self, item: Any, info: SpiderInfo) -> list[Request]:
|
||||
"""Returns the media requests to download"""
|
||||
raise NotImplementedError()
|
||||
|
||||
@ -276,7 +272,7 @@ class MediaPipeline(ABC):
|
||||
raise NotImplementedError()
|
||||
|
||||
def item_completed(
|
||||
self, results: List[FileInfoOrError], item: Any, info: SpiderInfo
|
||||
self, results: list[FileInfoOrError], item: Any, info: SpiderInfo
|
||||
) -> Any:
|
||||
"""Called per item when all media requests has been processed"""
|
||||
if self.LOG_FAILED_RESULTS:
|
||||
|
@ -2,23 +2,15 @@ from __future__ import annotations

import hashlib
import logging
from typing import (
TYPE_CHECKING,
Dict,
Iterable,
List,
Optional,
Protocol,
Tuple,
Type,
cast,
)
from typing import TYPE_CHECKING, Optional, Protocol, cast

from scrapy import Request
from scrapy.core.downloader import Downloader
from scrapy.utils.misc import build_from_crawler

if TYPE_CHECKING:
from collections.abc import Iterable

# typing.Self requires Python 3.11
from typing_extensions import Self

@ -87,7 +79,7 @@ class ScrapyPriorityQueue:
def from_crawler(
cls,
crawler: Crawler,
downstream_queue_cls: Type[QueueProtocol],
downstream_queue_cls: type[QueueProtocol],
key: str,
startprios: Iterable[int] = (),
) -> Self:
@ -96,14 +88,14 @@ class ScrapyPriorityQueue:
def __init__(
self,
crawler: Crawler,
downstream_queue_cls: Type[QueueProtocol],
downstream_queue_cls: type[QueueProtocol],
key: str,
startprios: Iterable[int] = (),
):
self.crawler: Crawler = crawler
self.downstream_queue_cls: Type[QueueProtocol] = downstream_queue_cls
self.downstream_queue_cls: type[QueueProtocol] = downstream_queue_cls
self.key: str = key
self.queues: Dict[int, QueueProtocol] = {}
self.queues: dict[int, QueueProtocol] = {}
self.curprio: Optional[int] = None
self.init_prios(startprios)

@ -160,8 +152,8 @@ class ScrapyPriorityQueue:
# Protocols can't declare optional members
return cast(Request, queue.peek()) # type: ignore[attr-defined]

def close(self) -> List[int]:
active: List[int] = []
def close(self) -> list[int]:
active: list[int] = []
for p, q in self.queues.items():
active.append(p)
q.close()
@ -176,7 +168,7 @@ class DownloaderInterface:
assert crawler.engine
self.downloader: Downloader = crawler.engine.downloader

def stats(self, possible_slots: Iterable[str]) -> List[Tuple[int, str]]:
def stats(self, possible_slots: Iterable[str]) -> list[tuple[int, str]]:
return [(self._active_downloads(slot), slot) for slot in possible_slots]

def get_slot_key(self, request: Request) -> str:
@ -199,18 +191,18 @@ class DownloaderAwarePriorityQueue:
def from_crawler(
cls,
crawler: Crawler,
downstream_queue_cls: Type[QueueProtocol],
downstream_queue_cls: type[QueueProtocol],
key: str,
startprios: Optional[Dict[str, Iterable[int]]] = None,
startprios: Optional[dict[str, Iterable[int]]] = None,
) -> Self:
return cls(crawler, downstream_queue_cls, key, startprios)

def __init__(
self,
crawler: Crawler,
downstream_queue_cls: Type[QueueProtocol],
downstream_queue_cls: type[QueueProtocol],
key: str,
slot_startprios: Optional[Dict[str, Iterable[int]]] = None,
slot_startprios: Optional[dict[str, Iterable[int]]] = None,
):
if crawler.settings.getint("CONCURRENT_REQUESTS_PER_IP") != 0:
raise ValueError(
@ -229,11 +221,11 @@ class DownloaderAwarePriorityQueue:
)

self._downloader_interface: DownloaderInterface = DownloaderInterface(crawler)
self.downstream_queue_cls: Type[QueueProtocol] = downstream_queue_cls
self.downstream_queue_cls: type[QueueProtocol] = downstream_queue_cls
self.key: str = key
self.crawler: Crawler = crawler

self.pqueues: Dict[str, ScrapyPriorityQueue] = {} # slot -> priority queue
self.pqueues: dict[str, ScrapyPriorityQueue] = {} # slot -> priority queue
for slot, startprios in (slot_startprios or {}).items():
self.pqueues[slot] = self.pqfactory(slot, startprios)

@ -281,7 +273,7 @@ class DownloaderAwarePriorityQueue:
queue = self.pqueues[slot]
return queue.peek()

def close(self) -> Dict[str, List[int]]:
def close(self) -> dict[str, list[int]]:
active = {slot: queue.close() for slot, queue in self.pqueues.items()}
self.pqueues.clear()
return active
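Illustrative note (not part of the diff): where a parameter receives a class object rather than an instance, typing.Type[X] becomes the built-in type[X]. A small hypothetical sketch of that pattern; FifoQueue and make_queues are made-up names, not Scrapy API:

# Sketch only: type[X] as the annotation for "a class, not an instance".
from __future__ import annotations


class FifoQueue:
    def __init__(self) -> None:
        self.items: list[int] = []


def make_queues(queue_cls: type[FifoQueue], count: int) -> dict[int, FifoQueue]:
    # instantiate the passed-in class once per priority level
    return {priority: queue_cls() for priority in range(count)}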
@ -1,6 +1,6 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Type
from typing import TYPE_CHECKING, Any, Optional

from twisted.internet import defer
from twisted.internet.base import ReactorBase, ThreadedResolver
@ -16,6 +16,8 @@ from zope.interface.declarations import implementer, provider
from scrapy.utils.datatypes import LocalCache

if TYPE_CHECKING:
from collections.abc import Sequence

from twisted.internet.defer import Deferred

# typing.Self requires Python 3.11
@ -82,7 +84,7 @@ class _CachingResolutionReceiver:
def __init__(self, resolutionReceiver: IResolutionReceiver, hostName: str):
self.resolutionReceiver: IResolutionReceiver = resolutionReceiver
self.hostName: str = hostName
self.addresses: List[IAddress] = []
self.addresses: list[IAddress] = []

def resolutionBegan(self, resolution: IHostResolution) -> None:
self.resolutionReceiver.resolutionBegan(resolution)
@ -126,7 +128,7 @@ class CachingHostnameResolver:
resolutionReceiver: IResolutionReceiver,
hostName: str,
portNumber: int = 0,
addressTypes: Optional[Sequence[Type[IAddress]]] = None,
addressTypes: Optional[Sequence[type[IAddress]]] = None,
transportSemantics: str = "TCP",
) -> IHostResolution:
try:

@ -3,15 +3,20 @@ This module implements a class which returns the appropriate Response class
based on different criteria.
"""

from __future__ import annotations

from io import StringIO
from mimetypes import MimeTypes
from pkgutil import get_data
from typing import Dict, Mapping, Optional, Type, Union
from typing import TYPE_CHECKING, Optional, Union

from scrapy.http import Response
from scrapy.utils.misc import load_object
from scrapy.utils.python import binary_is_text, to_bytes, to_unicode

if TYPE_CHECKING:
from collections.abc import Mapping

class ResponseTypes:
CLASSES = {
@ -32,7 +37,7 @@ class ResponseTypes:
}

def __init__(self) -> None:
self.classes: Dict[str, Type[Response]] = {}
self.classes: dict[str, type[Response]] = {}
self.mimetypes: MimeTypes = MimeTypes()
mimedata = get_data("scrapy", "mime.types")
if not mimedata:
@ -43,7 +48,7 @@ class ResponseTypes:
for mimetype, cls in self.CLASSES.items():
self.classes[mimetype] = load_object(cls)

def from_mimetype(self, mimetype: str) -> Type[Response]:
def from_mimetype(self, mimetype: str) -> type[Response]:
"""Return the most appropriate Response class for the given mimetype"""
if mimetype is None:
return Response
@ -54,7 +59,7 @@ class ResponseTypes:

def from_content_type(
self, content_type: Union[str, bytes], content_encoding: Optional[bytes] = None
) -> Type[Response]:
) -> type[Response]:
"""Return the most appropriate Response class from an HTTP Content-Type
header"""
if content_encoding:
@ -66,7 +71,7 @@ class ResponseTypes:

def from_content_disposition(
self, content_disposition: Union[str, bytes]
) -> Type[Response]:
) -> type[Response]:
try:
filename = (
to_unicode(content_disposition, encoding="latin-1", errors="replace")
@ -78,7 +83,7 @@ class ResponseTypes:
except IndexError:
return Response

def from_headers(self, headers: Mapping[bytes, bytes]) -> Type[Response]:
def from_headers(self, headers: Mapping[bytes, bytes]) -> type[Response]:
"""Return the most appropriate Response class by looking at the HTTP
headers"""
cls = Response
@ -91,14 +96,14 @@ class ResponseTypes:
cls = self.from_content_disposition(headers[b"Content-Disposition"])
return cls

def from_filename(self, filename: str) -> Type[Response]:
def from_filename(self, filename: str) -> type[Response]:
"""Return the most appropriate Response class from a file name"""
mimetype, encoding = self.mimetypes.guess_type(filename)
if mimetype and not encoding:
return self.from_mimetype(mimetype)
return Response

def from_body(self, body: bytes) -> Type[Response]:
def from_body(self, body: bytes) -> type[Response]:
"""Try to guess the appropriate response based on the body content.
This method is a bit magic and could be improved in the future, but
it's not meant to be used except for special cases where response types
@ -122,7 +127,7 @@ class ResponseTypes:
url: Optional[str] = None,
filename: Optional[str] = None,
body: Optional[bytes] = None,
) -> Type[Response]:
) -> type[Response]:
"""Guess the most appropriate Response class based on
the given arguments."""
cls = Response
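Illustrative note (not part of the diff): several hunks above move typing-only imports under an if TYPE_CHECKING: block and take the ABCs (Mapping, Iterable, Sequence, ...) from collections.abc instead of typing. A minimal sketch of that layout with a hypothetical helper function:

# Sketch only: typing-only imports kept out of the runtime import path.
from __future__ import annotations  # annotations stay unevaluated at runtime

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # only type checkers ever execute this import
    from collections.abc import Mapping


def pick_header(headers: Mapping[bytes, bytes], name: bytes) -> bytes:
    # hypothetical helper: return the header value, or an empty byte string
    return headers.get(name, b"")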
@ -2,7 +2,7 @@
XPath selectors based on lxml
"""

from typing import Any, Optional, Type, Union
from typing import Any, Optional, Union

from parsel import Selector as _ParselSelector

@ -23,7 +23,7 @@ def _st(response: Optional[TextResponse], st: Optional[str]) -> str:

def _response_from_text(text: Union[str, bytes], st: Optional[str]) -> TextResponse:
rt: Type[TextResponse] = XmlResponse if st == "xml" else HtmlResponse
rt: type[TextResponse] = XmlResponse if st == "xml" else HtmlResponse
return rt(url="about:blank", encoding="utf-8", body=to_bytes(text, "utf-8"))

@ -2,22 +2,10 @@ from __future__ import annotations

import copy
import json
from collections.abc import Iterable, Iterator, Mapping, MutableMapping
from importlib import import_module
from pprint import pformat
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterable,
Iterator,
List,
Mapping,
MutableMapping,
Optional,
Tuple,
Union,
cast,
)
from typing import TYPE_CHECKING, Any, Optional, Union, cast

from scrapy.settings import default_settings

@ -37,7 +25,7 @@ if TYPE_CHECKING:
_SettingsInputT = Union[SupportsItems[_SettingsKeyT, Any], str, None]

SETTINGS_PRIORITIES: Dict[str, int] = {
SETTINGS_PRIORITIES: dict[str, int] = {
"default": 0,
"command": 10,
"addon": 15,
@ -192,8 +180,8 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
return float(self.get(name, default))

def getlist(
self, name: _SettingsKeyT, default: Optional[List[Any]] = None
) -> List[Any]:
self, name: _SettingsKeyT, default: Optional[list[Any]] = None
) -> list[Any]:
"""
Get a setting value as a list. If the setting original type is a list, a
copy of it will be returned. If it's a string it will be split by ",".
@ -213,8 +201,8 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
return list(value)

def getdict(
self, name: _SettingsKeyT, default: Optional[Dict[Any, Any]] = None
) -> Dict[Any, Any]:
self, name: _SettingsKeyT, default: Optional[dict[Any, Any]] = None
) -> dict[Any, Any]:
"""
Get a setting value as a dictionary. If the setting original type is a
dictionary, a copy of it will be returned. If it is a string it will be
@ -238,8 +226,8 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
def getdictorlist(
self,
name: _SettingsKeyT,
default: Union[Dict[Any, Any], List[Any], Tuple[Any], None] = None,
) -> Union[Dict[Any, Any], List[Any]]:
default: Union[dict[Any, Any], list[Any], tuple[Any], None] = None,
) -> Union[dict[Any, Any], list[Any]]:
"""Get a setting value as either a :class:`dict` or a :class:`list`.

If the setting is already a dict or a list, a copy of it will be
@ -412,7 +400,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
"""
self._assert_mutability()
if isinstance(values, str):
values = cast(Dict[_SettingsKeyT, Any], json.loads(values))
values = cast(dict[_SettingsKeyT, Any], json.loads(values))
if values is not None:
if isinstance(values, BaseSettings):
for name, value in values.items():
@ -477,7 +465,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
def __len__(self) -> int:
return len(self.attributes)

def _to_dict(self) -> Dict[_SettingsKeyT, Any]:
def _to_dict(self) -> dict[_SettingsKeyT, Any]:
return {
self._get_key(k): (v._to_dict() if isinstance(v, BaseSettings) else v)
for k, v in self.items()
@ -490,7 +478,7 @@ class BaseSettings(MutableMapping[_SettingsKeyT, Any]):
else str(key_value)
)

def copy_to_dict(self) -> Dict[_SettingsKeyT, Any]:
def copy_to_dict(self) -> dict[_SettingsKeyT, Any]:
"""
Make a copy of current settings and convert to a dict.

@ -553,7 +541,7 @@ class Settings(BaseSettings):
self.update(values, priority)

def iter_default_settings() -> Iterable[Tuple[str, Any]]:
def iter_default_settings() -> Iterable[tuple[str, Any]]:
"""Return the default settings as an iterator of (name, value) tuples"""
for name in dir(default_settings):
if name.isupper():
@ -562,7 +550,7 @@ def iter_default_settings() -> Iterable[Tuple[str, Any]]:

def overridden_settings(
settings: Mapping[_SettingsKeyT, Any]
) -> Iterable[Tuple[str, Any]]:
) -> Iterable[tuple[str, Any]]:
"""Return an iterable of the settings that have been overridden"""
for name, defvalue in iter_default_settings():
value = settings[name]
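Illustrative note (not part of the diff): the getlist/getdict hunks above only change annotations to the builtin list/dict generics; the behaviour documented in their docstrings is unchanged. A small usage sketch, where SPIDER_MODULES is a real setting but MYPROJECT_LIMITS is a made-up key:

# Usage sketch for the accessors whose signatures appear above.
from scrapy.settings import Settings

settings = Settings({"SPIDER_MODULES": "myproject.spiders,myproject.extra"})

# A comma-separated string is split into a list, per the getlist docstring above.
modules: list[str] = settings.getlist("SPIDER_MODULES")
# e.g. ['myproject.spiders', 'myproject.extra']

# MYPROJECT_LIMITS is hypothetical; the default is returned when the key is unset.
limits: dict[str, int] = settings.getdict("MYPROJECT_LIMITS", default={"pages": 10})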
@ -8,7 +8,7 @@ from __future__ import annotations

import os
import signal
from typing import Any, Callable, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Optional, Union

from itemadapter import is_item
from twisted.internet import defer, threads
@ -27,25 +27,28 @@ from scrapy.utils.misc import load_object
from scrapy.utils.reactor import is_asyncio_reactor_installed, set_asyncio_event_loop
from scrapy.utils.response import open_in_browser

if TYPE_CHECKING:
from collections.abc import Callable

class Shell:
relevant_classes: Tuple[type, ...] = (Crawler, Spider, Request, Response, Settings)
relevant_classes: tuple[type, ...] = (Crawler, Spider, Request, Response, Settings)

def __init__(
self,
crawler: Crawler,
update_vars: Optional[Callable[[Dict[str, Any]], None]] = None,
update_vars: Optional[Callable[[dict[str, Any]], None]] = None,
code: Optional[str] = None,
):
self.crawler: Crawler = crawler
self.update_vars: Callable[[Dict[str, Any]], None] = update_vars or (
self.update_vars: Callable[[dict[str, Any]], None] = update_vars or (
lambda x: None
)
self.item_class: type = load_object(crawler.settings["DEFAULT_ITEM_CLASS"])
self.spider: Optional[Spider] = None
self.inthread: bool = not threadable.isInIOThread()
self.code: Optional[str] = code
self.vars: Dict[str, Any] = {}
self.vars: dict[str, Any] = {}

def start(
self,

@ -1,6 +1,6 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any, List, Tuple
from typing import TYPE_CHECKING, Any

from pydispatch import dispatcher

@ -40,7 +40,7 @@ class SignalManager:
kwargs.setdefault("sender", self.sender)
dispatcher.disconnect(receiver, signal, **kwargs)

def send_catch_log(self, signal: Any, **kwargs: Any) -> List[Tuple[Any, Any]]:
def send_catch_log(self, signal: Any, **kwargs: Any) -> list[tuple[Any, Any]]:
"""
Send a signal, catch exceptions and log them.

@ -52,7 +52,7 @@ class SignalManager:

def send_catch_log_deferred(
self, signal: Any, **kwargs: Any
) -> Deferred[List[Tuple[Any, Any]]]:
) -> Deferred[list[tuple[Any, Any]]]:
"""
Like :meth:`send_catch_log` but supports returning
:class:`~twisted.internet.defer.Deferred` objects from signal handlers.
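Illustrative note (not part of the diff): send_catch_log above now returns the builtin list[tuple[Any, Any]] of (receiver, result) pairs. A small hypothetical round-trip; my_signal and on_my_signal are made-up names:

# Sketch only: sending a custom signal and reading the (receiver, result) pairs.
from scrapy.signalmanager import SignalManager

my_signal = object()  # Scrapy signals are plain sentinel objects


def on_my_signal(value):
    return value * 2


signals = SignalManager()
signals.connect(on_my_signal, signal=my_signal)
results: list[tuple[object, object]] = signals.send_catch_log(my_signal, value=21)
# e.g. [(on_my_signal, 42)]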
@ -3,7 +3,7 @@ from __future__ import annotations
import traceback
import warnings
from collections import defaultdict
from typing import TYPE_CHECKING, DefaultDict, Dict, List, Tuple, Type
from typing import TYPE_CHECKING

from zope.interface import implementer

@ -29,10 +29,10 @@ class SpiderLoader:
"""

def __init__(self, settings: BaseSettings):
self.spider_modules: List[str] = settings.getlist("SPIDER_MODULES")
self.spider_modules: list[str] = settings.getlist("SPIDER_MODULES")
self.warn_only: bool = settings.getbool("SPIDER_LOADER_WARN_ONLY")
self._spiders: Dict[str, Type[Spider]] = {}
self._found: DefaultDict[str, List[Tuple[str, str]]] = defaultdict(list)
self._spiders: dict[str, type[Spider]] = {}
self._found: defaultdict[str, list[tuple[str, str]]] = defaultdict(list)
self._load_all_spiders()

def _check_name_duplicates(self) -> None:
@ -80,7 +80,7 @@ class SpiderLoader:
def from_settings(cls, settings: BaseSettings) -> Self:
return cls(settings)

def load(self, spider_name: str) -> Type[Spider]:
def load(self, spider_name: str) -> type[Spider]:
"""
Return the Spider class for the given spider name. If the spider
name is not found, raise a KeyError.
@ -90,7 +90,7 @@ class SpiderLoader:
except KeyError:
raise KeyError(f"Spider not found: {spider_name}")

def find_by_request(self, request: Request) -> List[str]:
def find_by_request(self, request: Request) -> list[str]:
"""
Return the list of spider names that can handle the given request.
"""
@ -98,7 +98,7 @@ class SpiderLoader:
name for name, cls in self._spiders.items() if cls.handles_request(request)
]

def list(self) -> List[str]:
def list(self) -> list[str]:
"""
Return a list with the names of all spiders available in the project.
"""

@ -7,11 +7,13 @@ See documentation in docs/topics/spider-middleware.rst
from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any, AsyncIterable, Iterable
from typing import TYPE_CHECKING, Any

from scrapy.http import Request, Response

if TYPE_CHECKING:
from collections.abc import AsyncIterable, Iterable

# typing.Self requires Python 3.11
from typing_extensions import Self

@ -7,11 +7,13 @@ See documentation in docs/topics/spider-middleware.rst
from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any, Iterable, List, Optional
from typing import TYPE_CHECKING, Any, Optional

from scrapy.exceptions import IgnoreRequest

if TYPE_CHECKING:
from collections.abc import Iterable

# typing.Self requires Python 3.11
from typing_extensions import Self

@ -39,7 +41,7 @@ class HttpErrorMiddleware:

def __init__(self, settings: BaseSettings):
self.handle_httpstatus_all: bool = settings.getbool("HTTPERROR_ALLOW_ALL")
self.handle_httpstatus_list: List[int] = settings.getlist(
self.handle_httpstatus_list: list[int] = settings.getlist(
"HTTPERROR_ALLOWED_CODES"
)

@ -9,7 +9,7 @@ from __future__ import annotations
import logging
import re
import warnings
from typing import TYPE_CHECKING, Any, AsyncIterable, Iterable, Set
from typing import TYPE_CHECKING, Any

from scrapy import Spider, signals
from scrapy.exceptions import ScrapyDeprecationWarning
@ -23,6 +23,8 @@ warnings.warn(
)

if TYPE_CHECKING:
from collections.abc import AsyncIterable, Iterable

# typing.Self requires Python 3.11
from typing_extensions import Self

@ -109,7 +111,7 @@ class OffsiteMiddleware:

def spider_opened(self, spider: Spider) -> None:
self.host_regex: re.Pattern[str] = self.get_host_regex(spider)
self.domains_seen: Set[str] = set()
self.domains_seen: set[str] = set()

class URLWarning(Warning):

@ -6,18 +6,7 @@ originated it.
from __future__ import annotations

import warnings
from typing import (
TYPE_CHECKING,
Any,
AsyncIterable,
Dict,
Iterable,
Optional,
Tuple,
Type,
Union,
cast,
)
from typing import TYPE_CHECKING, Any, Optional, Union, cast
from urllib.parse import urlparse

from w3lib.url import safe_url_string
@ -30,6 +19,8 @@ from scrapy.utils.python import to_unicode
from scrapy.utils.url import strip_url

if TYPE_CHECKING:
from collections.abc import AsyncIterable, Iterable

# typing.Self requires Python 3.11
from typing_extensions import Self

@ -37,7 +28,7 @@ if TYPE_CHECKING:
from scrapy.settings import BaseSettings

LOCAL_SCHEMES: Tuple[str, ...] = (
LOCAL_SCHEMES: tuple[str, ...] = (
"about",
"blob",
"data",
@ -56,7 +47,7 @@ POLICY_SCRAPY_DEFAULT = "scrapy-default"

class ReferrerPolicy:
NOREFERRER_SCHEMES: Tuple[str, ...] = LOCAL_SCHEMES
NOREFERRER_SCHEMES: tuple[str, ...] = LOCAL_SCHEMES
name: str

def referrer(self, response_url: str, request_url: str) -> Optional[str]:
@ -291,11 +282,11 @@ class DefaultReferrerPolicy(NoReferrerWhenDowngradePolicy):
using ``file://`` or ``s3://`` scheme.
"""

NOREFERRER_SCHEMES: Tuple[str, ...] = LOCAL_SCHEMES + ("file", "s3")
NOREFERRER_SCHEMES: tuple[str, ...] = LOCAL_SCHEMES + ("file", "s3")
name: str = POLICY_SCRAPY_DEFAULT

_policy_classes: Dict[str, Type[ReferrerPolicy]] = {
_policy_classes: dict[str, type[ReferrerPolicy]] = {
p.name: p
for p in (
NoReferrerPolicy,
@ -316,14 +307,14 @@ _policy_classes[""] = NoReferrerWhenDowngradePolicy

def _load_policy_class(
policy: str, warning_only: bool = False
) -> Optional[Type[ReferrerPolicy]]:
) -> Optional[type[ReferrerPolicy]]:
"""
Expect a string for the path to the policy class,
otherwise try to interpret the string as a standard value
from https://www.w3.org/TR/referrer-policy/#referrer-policies
"""
try:
return cast(Type[ReferrerPolicy], load_object(policy))
return cast(type[ReferrerPolicy], load_object(policy))
except ValueError:
tokens = [token.strip() for token in policy.lower().split(",")]
# https://www.w3.org/TR/referrer-policy/#parse-referrer-policy-from-header
@ -341,7 +332,7 @@ def _load_policy_class(

class RefererMiddleware:
def __init__(self, settings: Optional[BaseSettings] = None):
self.default_policy: Type[ReferrerPolicy] = DefaultReferrerPolicy
self.default_policy: type[ReferrerPolicy] = DefaultReferrerPolicy
if settings is not None:
settings_policy = _load_policy_class(settings.get("REFERRER_POLICY"))
assert settings_policy
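Illustrative note (not part of the diff): the NOREFERRER_SCHEMES and _policy_classes hunks above move to the builtin tuple/dict generics; the policy-loading logic itself is untouched. A hypothetical custom policy written in the same style (NoFtpReferrerPolicy and its name string are made up):

# Sketch only: a custom referrer policy using tuple[str, ...] instead of typing.Tuple.
from scrapy.spidermiddlewares.referer import DefaultReferrerPolicy


class NoFtpReferrerPolicy(DefaultReferrerPolicy):
    # also suppress the Referer header for a hypothetical extra scheme
    NOREFERRER_SCHEMES: tuple[str, ...] = DefaultReferrerPolicy.NOREFERRER_SCHEMES + ("ftp",)
    name: str = "no-ftp-referrer"  # hypothetical policy name

Such a class can then be referenced by its import path in the REFERRER_POLICY setting, which is the string form _load_policy_class above tries first.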
@ -7,12 +7,14 @@ See documentation in docs/topics/spider-middleware.rst
from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any, AsyncIterable, Iterable
from typing import TYPE_CHECKING, Any

from scrapy.exceptions import NotConfigured
from scrapy.http import Request, Response

if TYPE_CHECKING:
from collections.abc import AsyncIterable, Iterable

# typing.Self requires Python 3.11
from typing_extensions import Self

@ -7,7 +7,7 @@ See documentation in docs/topics/spiders.rst
from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, cast
from typing import TYPE_CHECKING, Any, Optional, cast

from scrapy import signals
from scrapy.http import Request, Response
@ -15,6 +15,8 @@ from scrapy.utils.trackref import object_ref
from scrapy.utils.url import url_is_from_spider

if TYPE_CHECKING:
from collections.abc import Iterable

from twisted.internet.defer import Deferred

# typing.Self requires Python 3.11
@ -32,7 +34,7 @@ class Spider(object_ref):
"""

name: str
custom_settings: Optional[Dict[_SettingsKeyT, Any]] = None
custom_settings: Optional[dict[_SettingsKeyT, Any]] = None

def __init__(self, name: Optional[str] = None, **kwargs: Any):
if name is not None:
@ -41,7 +43,7 @@ class Spider(object_ref):
raise ValueError(f"{type(self).__name__} must have a name")
self.__dict__.update(kwargs)
if not hasattr(self, "start_urls"):
self.start_urls: List[str] = []
self.start_urls: list[str] = []

@property
def logger(self) -> SpiderLoggerAdapter:

@ -1,6 +1,6 @@
"""
This modules implements the CrawlSpider which is the recommended spider to use
for scraping typical web sites that requires crawling pages.
for scraping typical websites that requires crawling pages.

See documentation in docs/topics/spiders.rst
"""
@ -8,22 +8,8 @@ See documentation in docs/topics/spiders.rst
from __future__ import annotations

import copy
from typing import (
TYPE_CHECKING,
Any,
AsyncIterable,
Awaitable,
Callable,
Dict,
Iterable,
List,
Optional,
Sequence,
Set,
TypeVar,
Union,
cast,
)
from collections.abc import AsyncIterable, Awaitable, Callable
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union, cast

from twisted.python.failure import Failure

@ -35,6 +21,8 @@ from scrapy.utils.asyncgen import collect_asyncgen
from scrapy.utils.spider import iterate_spider_output

if TYPE_CHECKING:
from collections.abc import Iterable, Sequence

# typing.Self requires Python 3.11
from typing_extensions import Self

@ -43,7 +31,7 @@ if TYPE_CHECKING:

_T = TypeVar("_T")
ProcessLinksT = Callable[[List[Link]], List[Link]]
ProcessLinksT = Callable[[list[Link]], list[Link]]
ProcessRequestT = Callable[[Request, Response], Optional[Request]]

@ -75,7 +63,7 @@ class Rule:
self,
link_extractor: Optional[LinkExtractor] = None,
callback: Union[CallbackT, str, None] = None,
cb_kwargs: Optional[Dict[str, Any]] = None,
cb_kwargs: Optional[dict[str, Any]] = None,
follow: Optional[bool] = None,
process_links: Union[ProcessLinksT, str, None] = None,
process_request: Union[ProcessRequestT, str, None] = None,
@ -84,7 +72,7 @@ class Rule:
self.link_extractor: LinkExtractor = link_extractor or _default_link_extractor
self.callback: Union[CallbackT, str, None] = callback
self.errback: Union[Callable[[Failure], Any], str, None] = errback
self.cb_kwargs: Dict[str, Any] = cb_kwargs or {}
self.cb_kwargs: dict[str, Any] = cb_kwargs or {}
self.process_links: Union[ProcessLinksT, str] = process_links or _identity
self.process_request: Union[ProcessRequestT, str] = (
process_request or _identity_process_request
@ -105,7 +93,7 @@ class Rule:

class CrawlSpider(Spider):
rules: Sequence[Rule] = ()
_rules: List[Rule]
_rules: list[Rule]
_follow_links: bool

def __init__(self, *a: Any, **kw: Any):
@ -139,9 +127,9 @@ class CrawlSpider(Spider):
def _requests_to_follow(self, response: Response) -> Iterable[Optional[Request]]:
if not isinstance(response, HtmlResponse):
return
seen: Set[Link] = set()
seen: set[Link] = set()
for rule_index, rule in enumerate(self._rules):
links: List[Link] = [
links: list[Link] = [
lnk
for lnk in rule.link_extractor.extract_links(response)
if lnk not in seen
@ -170,7 +158,7 @@ class CrawlSpider(Spider):
self,
response: Response,
callback: Optional[CallbackT],
cb_kwargs: Dict[str, Any],
cb_kwargs: dict[str, Any],
follow: bool = True,
) -> AsyncIterable[Any]:
if callback:

@ -5,7 +5,9 @@ for scraping from an XML feed.
See documentation in docs/topics/spiders.rst
"""

from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
from __future__ import annotations

from typing import TYPE_CHECKING, Any, Optional

from scrapy.exceptions import NotConfigured, NotSupported
from scrapy.http import Response, TextResponse
@ -14,6 +16,9 @@ from scrapy.spiders import Spider
from scrapy.utils.iterators import csviter, xmliter_lxml
from scrapy.utils.spider import iterate_spider_output

if TYPE_CHECKING:
from collections.abc import Iterable, Sequence

class XMLFeedSpider(Spider):
"""
@ -27,7 +32,7 @@ class XMLFeedSpider(Spider):

iterator: str = "iternodes"
itertag: str = "item"
namespaces: Sequence[Tuple[str, str]] = ()
namespaces: Sequence[tuple[str, str]] = ()

def process_results(
self, response: Response, results: Iterable[Any]
@ -118,7 +123,7 @@ class CSVFeedSpider(Spider):
quotechar: Optional[str] = (
None # When this is None, python's csv module's default quotechar is used
)
headers: Optional[List[str]] = None
headers: Optional[list[str]] = None

def process_results(
self, response: Response, results: Iterable[Any]
@ -130,7 +135,7 @@ class CSVFeedSpider(Spider):
"""This method has the same purpose as the one in XMLFeedSpider"""
return response

def parse_row(self, response: Response, row: Dict[str, str]) -> Any:
def parse_row(self, response: Response, row: dict[str, str]) -> Any:
"""This method must be overridden with your custom spider functionality"""
raise NotImplementedError
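Illustrative note (not part of the diff): parse_row above now takes the row as a builtin dict[str, str]. A hypothetical CSVFeedSpider subclass using it; the spider name, feed URL and column names are made up:

# Sketch only: a CSVFeedSpider subclass consuming the dict[str, str] row.
from typing import Any

from scrapy.http import Response
from scrapy.spiders import CSVFeedSpider


class PriceSpider(CSVFeedSpider):
    name = "prices"
    start_urls = ["https://example.com/feed.csv"]  # hypothetical feed URL
    delimiter = ","
    headers = ["sku", "price"]

    def parse_row(self, response: Response, row: dict[str, str]) -> Any:
        # row maps each configured header to the corresponding cell of one CSV line
        return {"sku": row["sku"], "price": float(row["price"])}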
@ -1,6 +1,7 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any, Iterable, Optional, cast
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any, Optional, cast

from scrapy import Request
from scrapy.spiders import Spider
Some files were not shown because too many files have changed in this diff