Mirror of https://github.com/scrapy/scrapy.git, synced 2025-02-06 11:00:46 +00:00
Bump ruff, switch from black to ruff-format (#6631)
This commit is contained in:
parent c03fb2abb8
commit cec0aeca58
@@ -1,13 +1,10 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.8.4
+    rev: v0.9.3
     hooks:
       - id: ruff
         args: [ --fix ]
-  - repo: https://github.com/psf/black.git
-    rev: 24.10.0
-    hooks:
-      - id: black
+      - id: ruff-format
   - repo: https://github.com/adamchainz/blacken-docs
     rev: 1.19.1
     hooks:
@@ -188,9 +188,9 @@ class Command(ScrapyCommand):
                 return True
             return False
 
-        assert (
-            self.crawler_process is not None
-        ), "crawler_process must be set before calling run"
+        assert self.crawler_process is not None, (
+            "crawler_process must be set before calling run"
+        )
 
         try:
             spidercls = self.crawler_process.spider_loader.load(name)
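The assert rewrite in the hunk above recurs throughout this diff, especially in the test files. It appears to be the layout the updated formatter (ruff-format in ruff 0.9) applies to asserts with long messages: the condition stays on one line and only the message is parenthesized. A minimal illustration, not taken from the Scrapy code:

    items: list[int] = []
    # Old layout parenthesized the condition; the new layout parenthesizes the message only.
    assert items == [], (
        "expected an empty list at start-up"
    )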
@@ -34,13 +34,12 @@ class DownloadHandlerProtocol(Protocol):
 class DownloadHandlers:
     def __init__(self, crawler: Crawler):
         self._crawler: Crawler = crawler
-        self._schemes: dict[str, str | Callable[..., Any]] = (
-            {}
-        )  # stores acceptable schemes on instancing
-        self._handlers: dict[str, DownloadHandlerProtocol] = (
-            {}
-        )  # stores instanced handlers for schemes
-        self._notconfigured: dict[str, str] = {}  # remembers failed handlers
+        # stores acceptable schemes on instancing
+        self._schemes: dict[str, str | Callable[..., Any]] = {}
+        # stores instanced handlers for schemes
+        self._handlers: dict[str, DownloadHandlerProtocol] = {}
+        # remembers failed handlers
+        self._notconfigured: dict[str, str] = {}
         handlers: dict[str, str | Callable[..., Any]] = without_none_values(
             cast(
                 "dict[str, str | Callable[..., Any]]",

@@ -193,7 +193,7 @@ class Stream:
             url.netloc == str(self._protocol.metadata["uri"].host, "utf-8")
             or url.netloc == str(self._protocol.metadata["uri"].netloc, "utf-8")
             or url.netloc
-            == f'{self._protocol.metadata["ip_address"]}:{self._protocol.metadata["uri"].port}'
+            == f"{self._protocol.metadata['ip_address']}:{self._protocol.metadata['uri'].port}"
         )
 
     def _get_request_headers(self) -> list[tuple[str, str]]:

@@ -339,7 +339,7 @@ class Stream:
         if self._log_warnsize:
             self.metadata["reached_warnsize"] = True
             warning_msg = (
-                f'Received more ({self._response["flow_controlled_size"]}) bytes than download '
+                f"Received more ({self._response['flow_controlled_size']}) bytes than download "
                 f"warn size ({self._download_warnsize}) in request {self._request}"
             )
             logger.warning(warning_msg)

@@ -445,7 +445,7 @@ class Stream:
             ResponseFailed(
                 [
                     Failure(
-                        f'Remote peer {self._protocol.metadata["ip_address"]} sent RST_STREAM',
+                        f"Remote peer {self._protocol.metadata['ip_address']} sent RST_STREAM",
                         ProtocolError,
                     )
                 ]

@@ -465,7 +465,7 @@ class Stream:
                 InvalidHostname(
                     self._request,
                     str(self._protocol.metadata["uri"].host, "utf-8"),
-                    f'{self._protocol.metadata["ip_address"]}:{self._protocol.metadata["uri"].port}',
+                    f"{self._protocol.metadata['ip_address']}:{self._protocol.metadata['uri'].port}",
                 )
             )
 

@@ -54,8 +54,7 @@ class CookiesMiddleware:
     ) -> None:
         for cookie in cookies:
             cookie_domain = cookie.domain
-            if cookie_domain.startswith("."):
-                cookie_domain = cookie_domain[1:]
+            cookie_domain = cookie_domain.removeprefix(".")
 
             hostname = urlparse_cached(request).hostname
             assert hostname is not None
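The cookie-domain change above (and a similar one in utils/curl further down) swaps a startswith()/slice pair for str.removeprefix, available since Python 3.9; it strips the prefix only when it is present, so behaviour is unchanged. A small illustration, not from the Scrapy codebase:

    assert ".example.com".removeprefix(".") == "example.com"
    assert "example.com".removeprefix(".") == "example.com"  # no leading dot: unchanged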
@@ -89,5 +89,5 @@ class OffsiteMiddleware:
                 warnings.warn(message)
             else:
                 domains.append(re.escape(domain))
-        regex = rf'^(.*\.)?({"|".join(domains)})$'
+        regex = rf"^(.*\.)?({'|'.join(domains)})$"
         return re.compile(regex)

@@ -63,7 +63,9 @@ class RobotsTxtMiddleware:
         if request.url.startswith("data:") or request.url.startswith("file:"):
             return None
         d: Deferred[RobotParser | None] = maybeDeferred(
-            self.robot_parser, request, spider  # type: ignore[call-overload]
+            self.robot_parser,
+            request,
+            spider,  # type: ignore[call-overload]
         )
         d2: Deferred[None] = d.addCallback(self.process_request_2, request, spider)
         return d2

@@ -19,7 +19,7 @@ if TYPE_CHECKING:
 
 
 def get_header_size(
-    headers: dict[str, list[str | bytes] | tuple[str | bytes, ...]]
+    headers: dict[str, list[str | bytes] | tuple[str | bytes, ...]],
 ) -> int:
     size = 0
     for key, value in headers.items():
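The signature change above, and several like it further down (overridden_settings, deferred_f_from_coro_f, memoizemethod_noargs, without_none_values), only add a trailing comma to a lone parameter that already sits on its own line; this looks like a pure formatter-style change rather than a functional one. A minimal illustration, not from the Scrapy codebase:

    def describe(
        value: int,
    ) -> str:
        return f"value={value}"

    assert describe(3) == "value=3"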
@@ -84,9 +84,7 @@ class TelnetConsole(protocol.ServerFactory):
             """An implementation of IPortal"""
 
             @defers
-            def login(
-                self_, credentials, mind, *interfaces
-            ):  # pylint: disable=no-self-argument
+            def login(self_, credentials, mind, *interfaces):  # pylint: disable=no-self-argument
                 if not (
                     credentials.username == self.username.encode("utf8")
                     and credentials.checkPassword(self.password.encode("utf8"))

@@ -105,7 +105,8 @@ class Headers(CaselessDict):
 
     def values(self) -> list[bytes | None]:  # type: ignore[override]
         return [
-            self[k] for k in self.keys()  # pylint: disable=consider-using-dict-items
+            self[k]
+            for k in self.keys()  # pylint: disable=consider-using-dict-items
         ]
 
     def to_string(self) -> bytes:

@@ -24,7 +24,6 @@ from scrapy.http.request import Request
 from scrapy.utils.python import is_listlike, to_bytes
 
 if TYPE_CHECKING:
-
     # typing.Self requires Python 3.11
     from typing_extensions import Self
 
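The hunk above, and similar ones later in the diff, drops a blank line that immediately follows a block opener such as "if TYPE_CHECKING:" or a class statement; this also appears to be a formatter-style change only. A minimal illustration, not from the Scrapy codebase:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        from collections.abc import Iterable  # no blank line right after the colon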
@@ -94,8 +94,7 @@ class Response(object_ref):
             return self.request.meta  # type: ignore[union-attr]
         except AttributeError:
             raise AttributeError(
-                "Response.meta not available, this response "
-                "is not tied to any request"
+                "Response.meta not available, this response is not tied to any request"
             )
 
     @property

@@ -25,7 +25,6 @@ from scrapy.utils.response import get_base_url
 from scrapy.utils.url import url_has_any_extension, url_is_from_any_domain
 
 if TYPE_CHECKING:
-
     from lxml.html import HtmlElement
 
     from scrapy import Selector

@@ -202,7 +202,9 @@ class S3FilesStore:
         return cast(
             "Deferred[dict[str, Any]]",
             deferToThread(
-                self.s3_client.head_object, Bucket=self.bucket, Key=key_name  # type: ignore[attr-defined]
+                self.s3_client.head_object,  # type: ignore[attr-defined]
+                Bucket=self.bucket,
+                Key=key_name,
             ),
         )
 

@@ -81,8 +81,7 @@ class Selector(_ParselSelector, object_ref):
     ):
         if response is not None and text is not None:
             raise ValueError(
-                f"{self.__class__.__name__}.__init__() received "
-                "both response and text"
+                f"{self.__class__.__name__}.__init__() received both response and text"
             )
 
         st = _st(response, type)

@@ -539,7 +539,7 @@ def iter_default_settings() -> Iterable[tuple[str, Any]]:
 
 
 def overridden_settings(
-    settings: Mapping[_SettingsKeyT, Any]
+    settings: Mapping[_SettingsKeyT, Any],
 ) -> Iterable[tuple[str, Any]]:
     """Return an iterable of the settings that have been overridden"""
     for name, defvalue in iter_default_settings():

@@ -333,7 +333,7 @@ TEMPLATES_DIR = str((Path(__file__).parent / ".." / "templates").resolve())
 
 URLLENGTH_LIMIT = 2083
 
-USER_AGENT = f'Scrapy/{import_module("scrapy").__version__} (+https://scrapy.org)'
+USER_AGENT = f"Scrapy/{import_module('scrapy').__version__} (+https://scrapy.org)"
 
 TELNETCONSOLE_ENABLED = 1
 TELNETCONSOLE_PORT = [6023, 6073]
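The USER_AGENT change above flips the quotes inside an f-string replacement field so that the outer string can use double quotes; the same rewrite appears in several other hunks. This looks like the f-string formatting performed by newer ruff-format releases, though the diff itself does not name the rule or the exact trigger. A small illustration, not from the Scrapy codebase:

    metadata = {"ip_address": "127.0.0.1"}
    label = f"peer {metadata['ip_address']}"
    assert label == "peer 127.0.0.1"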
@@ -110,7 +110,7 @@ class OffsiteMiddleware:
                 warnings.warn(message, PortWarning)
             else:
                 domains.append(re.escape(domain))
-        regex = rf'^(.*\.)?({"|".join(domains)})$'
+        regex = rf"^(.*\.)?({'|'.join(domains)})$"
         return re.compile(regex)
 
     def spider_opened(self, spider: Spider) -> None:

@@ -147,16 +147,24 @@ def _pickle_serialize(obj: Any) -> bytes:
 
 # queue.*Queue aren't subclasses of queue.BaseQueue
 _PickleFifoSerializationDiskQueue = _serializable_queue(
-    _with_mkdir(queue.FifoDiskQueue), _pickle_serialize, pickle.loads  # type: ignore[arg-type]
+    _with_mkdir(queue.FifoDiskQueue),  # type: ignore[arg-type]
+    _pickle_serialize,
+    pickle.loads,
 )
 _PickleLifoSerializationDiskQueue = _serializable_queue(
-    _with_mkdir(queue.LifoDiskQueue), _pickle_serialize, pickle.loads  # type: ignore[arg-type]
+    _with_mkdir(queue.LifoDiskQueue),  # type: ignore[arg-type]
+    _pickle_serialize,
+    pickle.loads,
 )
 _MarshalFifoSerializationDiskQueue = _serializable_queue(
-    _with_mkdir(queue.FifoDiskQueue), marshal.dumps, marshal.loads  # type: ignore[arg-type]
+    _with_mkdir(queue.FifoDiskQueue),  # type: ignore[arg-type]
+    marshal.dumps,
+    marshal.loads,
 )
 _MarshalLifoSerializationDiskQueue = _serializable_queue(
-    _with_mkdir(queue.LifoDiskQueue), marshal.dumps, marshal.loads  # type: ignore[arg-type]
+    _with_mkdir(queue.LifoDiskQueue),  # type: ignore[arg-type]
+    marshal.dumps,
+    marshal.loads,
 )
 
 # public queue classes

@@ -22,8 +22,7 @@ class DataAction(argparse.Action):
         option_string: str | None = None,
     ) -> None:
         value = str(values)
-        if value.startswith("$"):
-            value = value[1:]
+        value = value.removeprefix("$")
         setattr(namespace, self.dest, value)
 
 

@@ -96,7 +95,7 @@ def curl_to_request_kwargs(
     parsed_args, argv = curl_parser.parse_known_args(curl_args[1:])
 
     if argv:
-        msg = f'Unrecognized options: {", ".join(argv)}'
+        msg = f"Unrecognized options: {', '.join(argv)}"
         if ignore_unknown_options:
             warnings.warn(msg)
         else:

@@ -377,7 +377,7 @@ def deferred_from_coro(o: _T) -> Deferred | _T:
 
 
 def deferred_f_from_coro_f(
-    coro_f: Callable[_P, Coroutine[Any, Any, _T]]
+    coro_f: Callable[_P, Coroutine[Any, Any, _T]],
 ) -> Callable[_P, Deferred[_T]]:
     """Converts a coroutine function into a function that returns a Deferred.
 

@@ -71,7 +71,7 @@ def xmliter(obj: Response | str | bytes, nodename: str) -> Iterator[Selector]:
         nodetext = (
             document_header
             + match.group().replace(
-                nodename, f'{nodename} {" ".join(namespaces.values())}', 1
+                nodename, f"{nodename} {' '.join(namespaces.values())}", 1
             )
             + header_end
         )

@@ -16,7 +16,6 @@ from scrapy.settings import Settings, _SettingsKeyT
 from scrapy.utils.versions import get_versions
 
 if TYPE_CHECKING:
-
     from scrapy.crawler import Crawler
     from scrapy.logformatter import LogFormatterResult
 

@@ -119,8 +119,7 @@ def to_unicode(
         return text
     if not isinstance(text, (bytes, str)):
         raise TypeError(
-            "to_unicode must receive a bytes or str "
-            f"object, got {type(text).__name__}"
+            f"to_unicode must receive a bytes or str object, got {type(text).__name__}"
         )
     if encoding is None:
         encoding = "utf-8"

@@ -183,7 +182,7 @@ _SelfT = TypeVar("_SelfT")
 
 
 def memoizemethod_noargs(
-    method: Callable[Concatenate[_SelfT, _P], _T]
+    method: Callable[Concatenate[_SelfT, _P], _T],
 ) -> Callable[Concatenate[_SelfT, _P], _T]:
     """Decorator to cache the result of a method (without arguments) using a
     weak reference to its object

@@ -313,7 +312,7 @@ def without_none_values(iterable: Iterable[_KT]) -> Iterable[_KT]: ...
 
 
 def without_none_values(
-    iterable: Mapping[_KT, _VT] | Iterable[_KT]
+    iterable: Mapping[_KT, _VT] | Iterable[_KT],
 ) -> dict[_KT, _VT] | Iterable[_KT]:
     """Return a copy of ``iterable`` with all ``None`` entries removed.
 

@@ -338,9 +338,9 @@ class BrokenStartRequestsSpider(FollowAllSpider):
             if self.fail_yielding:
                 2 / 0
 
-        assert (
-            self.seedsseen
-        ), "All start requests consumed before any download happened"
+        assert self.seedsseen, (
+            "All start requests consumed before any download happened"
+        )
 
     def parse(self, response):
         self.seedsseen.append(response.meta.get("seed"))

@@ -529,7 +529,7 @@ class ContractsManagerTest(unittest.TestCase):
                 return TestItem()
 
         with MockServer() as mockserver:
-            contract_doc = f'@url {mockserver.url("/status?n=200")}'
+            contract_doc = f"@url {mockserver.url('/status?n=200')}"
 
             TestSameUrlSpider.parse_first.__doc__ = contract_doc
             TestSameUrlSpider.parse_second.__doc__ = contract_doc

@@ -567,7 +567,6 @@ class CustomFailContractPostProcess(Contract):
 
 
 class CustomContractPrePostProcess(unittest.TestCase):
-
     def setUp(self):
         self.results = TextTestResult(stream=None, descriptions=False, verbosity=0)
 

@@ -94,7 +94,7 @@ class TestHttpProxyMiddleware(TestCase):
 
     def test_proxy_auth_encoding(self):
         # utf-8 encoding
-        os.environ["http_proxy"] = "https://m\u00E1n:pass@proxy:3128"
+        os.environ["http_proxy"] = "https://m\u00e1n:pass@proxy:3128"
         mw = HttpProxyMiddleware(auth_encoding="utf-8")
         req = Request("http://scrapytest.org")
         assert mw.process_request(req, spider) is None

@@ -103,7 +103,7 @@ class TestHttpProxyMiddleware(TestCase):
 
         # proxy from request.meta
         req = Request(
-            "http://scrapytest.org", meta={"proxy": "https://\u00FCser:pass@proxy:3128"}
+            "http://scrapytest.org", meta={"proxy": "https://\u00fcser:pass@proxy:3128"}
         )
         assert mw.process_request(req, spider) is None
         self.assertEqual(req.meta["proxy"], "https://proxy:3128")

@@ -120,7 +120,7 @@ class TestHttpProxyMiddleware(TestCase):
 
         # proxy from request.meta, latin-1 encoding
         req = Request(
-            "http://scrapytest.org", meta={"proxy": "https://\u00FCser:pass@proxy:3128"}
+            "http://scrapytest.org", meta={"proxy": "https://\u00fcser:pass@proxy:3128"}
         )
         assert mw.process_request(req, spider) is None
         self.assertEqual(req.meta["proxy"], "https://proxy:3128")

@@ -55,12 +55,12 @@ class Base:
             assert isinstance(req2, Request)
             self.assertEqual(req2.url, url2)
             self.assertEqual(req2.method, "GET")
-            assert (
-                "Content-Type" not in req2.headers
-            ), "Content-Type header must not be present in redirected request"
-            assert (
-                "Content-Length" not in req2.headers
-            ), "Content-Length header must not be present in redirected request"
+            assert "Content-Type" not in req2.headers, (
+                "Content-Type header must not be present in redirected request"
+            )
+            assert "Content-Length" not in req2.headers, (
+                "Content-Length header must not be present in redirected request"
+            )
             assert not req2.body, f"Redirected body must be empty, not '{req2.body}'"
 
         def test_max_redirect_times(self):

@@ -1243,12 +1243,12 @@ class MetaRefreshMiddlewareTest(Base.Test):
         assert isinstance(req2, Request)
         self.assertEqual(req2.url, "http://example.org/newpage")
         self.assertEqual(req2.method, "GET")
-        assert (
-            "Content-Type" not in req2.headers
-        ), "Content-Type header must not be present in redirected request"
-        assert (
-            "Content-Length" not in req2.headers
-        ), "Content-Length header must not be present in redirected request"
+        assert "Content-Type" not in req2.headers, (
+            "Content-Type header must not be present in redirected request"
+        )
+        assert "Content-Length" not in req2.headers, (
+            "Content-Length header must not be present in redirected request"
+        )
         assert not req2.body, f"Redirected body must be empty, not '{req2.body}'"
 
     def test_ignore_tags_default(self):

@@ -93,6 +93,6 @@ def test_params():
     _, actual = downloader._get_slot(request, spider=None)
     expected = Slot(**params)
     for param in params:
-        assert getattr(expected, param) == getattr(
-            actual, param
-        ), f"Slot.{param}: {getattr(expected, param)!r} != {getattr(actual, param)!r}"
+        assert getattr(expected, param) == getattr(actual, param), (
+            f"Slot.{param}: {getattr(expected, param)!r} != {getattr(actual, param)!r}"
+        )

@@ -294,9 +294,9 @@ class EngineTest(unittest.TestCase):
         ]
         urls_visited = {rp[0].url for rp in run.respplug}
         urls_expected = {run.geturl(p) for p in must_be_visited}
-        assert (
-            urls_expected <= urls_visited
-        ), f"URLs not visited: {list(urls_expected - urls_visited)}"
+        assert urls_expected <= urls_visited, (
+            f"URLs not visited: {list(urls_expected - urls_visited)}"
+        )
 
     def _assert_scheduled_requests(self, run: CrawlerRun, count=None):
         self.assertEqual(count, len(run.reqplug))

@@ -496,9 +496,9 @@ def test_request_scheduled_signal(caplog):
     drop_request = Request("https://drop.example")
     caplog.set_level(DEBUG)
     engine._schedule_request(drop_request, spider)
-    assert scheduler.enqueued == [
-        keep_request
-    ], f"{scheduler.enqueued!r} != [{keep_request!r}]"
+    assert scheduler.enqueued == [keep_request], (
+        f"{scheduler.enqueued!r} != [{keep_request!r}]"
+    )
     crawler.signals.disconnect(signal_handler, request_scheduled)
 
 

@@ -67,6 +67,6 @@ class HeadersReceivedEngineTest(EngineTest):
         must_be_visited = ["/", "/redirect", "/redirected"]
         urls_visited = {rp[0].url for rp in run.respplug}
         urls_expected = {run.geturl(p) for p in must_be_visited}
-        assert (
-            urls_expected <= urls_visited
-        ), f"URLs not visited: {list(urls_expected - urls_visited)}"
+        assert urls_expected <= urls_visited, (
+            f"URLs not visited: {list(urls_expected - urls_visited)}"
+        )

@@ -390,14 +390,14 @@ class CsvItemExporterTest(BaseItemExporterTest):
     def test_errors_default(self):
         with self.assertRaises(UnicodeEncodeError):
             self.assertExportResult(
-                item={"text": "W\u0275\u200Brd"},
+                item={"text": "W\u0275\u200brd"},
                 expected=None,
                 encoding="windows-1251",
             )
 
     def test_errors_xmlcharrefreplace(self):
         self.assertExportResult(
-            item={"text": "W\u0275\u200Brd"},
+            item={"text": "W\u0275\u200brd"},
             include_headers_line=False,
             expected="Wɵ​rd\r\n",
             encoding="windows-1251",

@@ -1190,8 +1190,7 @@ class FeedExportTest(FeedExportTestBase):
             "csv": b"baz,egg,foo\r\n,spam1,bar1\r\n",
             "json": b'[\n{"hello": "world2", "foo": "bar2"}\n]',
             "jsonlines": (
-                b'{"foo": "bar1", "egg": "spam1"}\n'
-                b'{"hello": "world2", "foo": "bar2"}\n'
+                b'{"foo": "bar1", "egg": "spam1"}\n{"hello": "world2", "foo": "bar2"}\n'
             ),
             "xml": (
                 b'<?xml version="1.0" encoding="utf-8"?>\n<items>\n<item>'

@@ -2289,9 +2288,9 @@ class BatchDeliveriesTest(FeedExportTestBase):
         settings.update(
             {
                 "FEEDS": {
-                    self._random_temp_filename()
-                    / "jl"
-                    / self._file_mark: {"format": "jl"},
+                    self._random_temp_filename() / "jl" / self._file_mark: {
+                        "format": "jl"
+                    },
                 },
             }
         )

@@ -2311,9 +2310,9 @@ class BatchDeliveriesTest(FeedExportTestBase):
         settings.update(
             {
                 "FEEDS": {
-                    self._random_temp_filename()
-                    / "csv"
-                    / self._file_mark: {"format": "csv"},
+                    self._random_temp_filename() / "csv" / self._file_mark: {
+                        "format": "csv"
+                    },
                 },
             }
         )

@@ -2331,9 +2330,9 @@ class BatchDeliveriesTest(FeedExportTestBase):
         settings.update(
             {
                 "FEEDS": {
-                    self._random_temp_filename()
-                    / "xml"
-                    / self._file_mark: {"format": "xml"},
+                    self._random_temp_filename() / "xml" / self._file_mark: {
+                        "format": "xml"
+                    },
                 },
             }
         )

@@ -2352,12 +2351,12 @@ class BatchDeliveriesTest(FeedExportTestBase):
         settings.update(
             {
                 "FEEDS": {
-                    self._random_temp_filename()
-                    / "xml"
-                    / self._file_mark: {"format": "xml"},
-                    self._random_temp_filename()
-                    / "json"
-                    / self._file_mark: {"format": "json"},
+                    self._random_temp_filename() / "xml" / self._file_mark: {
+                        "format": "xml"
+                    },
+                    self._random_temp_filename() / "json" / self._file_mark: {
+                        "format": "json"
+                    },
                 },
             }
         )

@@ -2384,9 +2383,9 @@ class BatchDeliveriesTest(FeedExportTestBase):
         settings.update(
            {
                 "FEEDS": {
-                    self._random_temp_filename()
-                    / "pickle"
-                    / self._file_mark: {"format": "pickle"},
+                    self._random_temp_filename() / "pickle" / self._file_mark: {
+                        "format": "pickle"
+                    },
                 },
             }
         )

@@ -2406,9 +2405,9 @@ class BatchDeliveriesTest(FeedExportTestBase):
         settings.update(
             {
                 "FEEDS": {
-                    self._random_temp_filename()
-                    / "marshal"
-                    / self._file_mark: {"format": "marshal"},
+                    self._random_temp_filename() / "marshal" / self._file_mark: {
+                        "format": "marshal"
+                    },
                 },
             }
         )

@@ -2455,9 +2454,9 @@ class BatchDeliveriesTest(FeedExportTestBase):
         for fmt in ("json", "jsonlines", "xml", "csv"):
             settings = {
                 "FEEDS": {
-                    self._random_temp_filename()
-                    / fmt
-                    / self._file_mark: {"format": fmt},
+                    self._random_temp_filename() / fmt / self._file_mark: {
+                        "format": fmt
+                    },
                 },
                 "FEED_EXPORT_BATCH_ITEM_COUNT": 1,
                 "FEED_STORE_EMPTY": False,

@@ -2478,9 +2477,9 @@ class BatchDeliveriesTest(FeedExportTestBase):
         for fmt, expctd in formats:
             settings = {
                 "FEEDS": {
-                    self._random_temp_filename()
-                    / fmt
-                    / self._file_mark: {"format": fmt},
+                    self._random_temp_filename() / fmt / self._file_mark: {
+                        "format": fmt
+                    },
                 },
                 "FEED_STORE_EMPTY": True,
                 "FEED_EXPORT_INDENT": None,

@@ -2520,25 +2519,19 @@ class BatchDeliveriesTest(FeedExportTestBase):
 
         settings = {
             "FEEDS": {
-                self._random_temp_filename()
-                / "json"
-                / self._file_mark: {
+                self._random_temp_filename() / "json" / self._file_mark: {
                     "format": "json",
                     "indent": 0,
                     "fields": ["bar"],
                     "encoding": "utf-8",
                 },
-                self._random_temp_filename()
-                / "xml"
-                / self._file_mark: {
+                self._random_temp_filename() / "xml" / self._file_mark: {
                     "format": "xml",
                     "indent": 2,
                     "fields": ["foo"],
                     "encoding": "latin-1",
                 },
-                self._random_temp_filename()
-                / "csv"
-                / self._file_mark: {
+                self._random_temp_filename() / "csv" / self._file_mark: {
                     "format": "csv",
                     "indent": None,
                     "fields": ["foo", "bar"],

@@ -2563,9 +2556,7 @@ class BatchDeliveriesTest(FeedExportTestBase):
         }
         settings = {
             "FEEDS": {
-                self._random_temp_filename()
-                / "json"
-                / self._file_mark: {
+                self._random_temp_filename() / "json" / self._file_mark: {
                     "format": "json",
                     "indent": None,
                     "encoding": "utf-8",

@@ -2591,8 +2582,7 @@ class BatchDeliveriesTest(FeedExportTestBase):
         ]
         settings = {
             "FEEDS": {
-                self._random_temp_filename()
-                / "%(batch_id)d": {
+                self._random_temp_filename() / "%(batch_id)d": {
                     "format": "json",
                 },
             },

@@ -226,9 +226,9 @@ class RequestTest(unittest.TestCase):
         self.assertEqual(r1.flags, r2.flags)
 
         # make sure cb_kwargs dict is shallow copied
-        assert (
-            r1.cb_kwargs is not r2.cb_kwargs
-        ), "cb_kwargs must be a shallow copy, not identical"
+        assert r1.cb_kwargs is not r2.cb_kwargs, (
+            "cb_kwargs must be a shallow copy, not identical"
+        )
         self.assertEqual(r1.cb_kwargs, r2.cb_kwargs)
 
         # make sure meta dict is shallow copied

@@ -236,9 +236,9 @@ class RequestTest(unittest.TestCase):
         self.assertEqual(r1.meta, r2.meta)
 
         # make sure headers attribute is shallow copied
-        assert (
-            r1.headers is not r2.headers
-        ), "headers must be a shallow copy, not identical"
+        assert r1.headers is not r2.headers, (
+            "headers must be a shallow copy, not identical"
+        )
         self.assertEqual(r1.headers, r2.headers)
         self.assertEqual(r1.encoding, r2.encoding)
         self.assertEqual(r1.dont_filter, r2.dont_filter)

@@ -99,9 +99,9 @@ class BaseResponseTest(unittest.TestCase):
         self.assertEqual(r1.flags, r2.flags)
 
         # make sure headers attribute is shallow copied
-        assert (
-            r1.headers is not r2.headers
-        ), "headers must be a shallow copy, not identical"
+        assert r1.headers is not r2.headers, (
+            "headers must be a shallow copy, not identical"
+        )
         self.assertEqual(r1.headers, r2.headers)
 
     def test_copy_meta(self):

@@ -289,9 +289,7 @@ class ItemMetaTest(unittest.TestCase):
 class ItemMetaClassCellRegression(unittest.TestCase):
     def test_item_meta_classcell_regression(self):
         class MyItem(Item, metaclass=ItemMeta):
-            def __init__(
-                self, *args, **kwargs
-            ):  # pylint: disable=useless-parent-delegation
+            def __init__(self, *args, **kwargs):  # pylint: disable=useless-parent-delegation
                 # This call to super() trigger the __classcell__ propagation
                 # requirement. When not done properly raises an error:
                 # TypeError: __class__ set to <class '__main__.MyItem'>

@@ -215,7 +215,7 @@ class FilesPipelineTestCase(unittest.TestCase):
 
         class CustomFilesPipeline(FilesPipeline):
             def file_path(self, request, response=None, info=None, item=None):
-                return f'full/{item.get("path")}'
+                return f"full/{item.get('path')}"
 
         file_path = CustomFilesPipeline.from_crawler(
             get_crawler(None, {"FILES_STORE": self.tempdir})

@@ -35,7 +35,6 @@ def _mocked_download_func(request, info):
 
 
 class UserDefinedPipeline(MediaPipeline):
-
     def media_to_download(self, request, info, *, item=None):
         pass
 

@@ -376,7 +375,6 @@ class MediaPipelineTestCase(BaseMediaPipelineTestCase):
 
 
 class MediaPipelineAllowRedirectSettingsTestCase(unittest.TestCase):
-
     def _assert_request_no3xx(self, pipeline_class, settings):
         pipe = pipeline_class(crawler=get_crawler(None, settings))
         request = Request("http://url")

@@ -403,11 +401,9 @@ class MediaPipelineAllowRedirectSettingsTestCase(unittest.TestCase):
         self.assertNotIn(status, request.meta["handle_httpstatus_list"])
 
     def test_subclass_standard_setting(self):
-
         self._assert_request_no3xx(UserDefinedPipeline, {"MEDIA_ALLOW_REDIRECTS": True})
 
     def test_subclass_specific_setting(self):
-
         self._assert_request_no3xx(
             UserDefinedPipeline, {"USERDEFINEDPIPELINE_MEDIA_ALLOW_REDIRECTS": True}
         )

@@ -27,10 +27,7 @@ class BaseRobotParserTest:
 
     def test_allowed(self):
         robotstxt_robotstxt_body = (
-            b"User-agent: * \n"
-            b"Disallow: /disallowed \n"
-            b"Allow: /allowed \n"
-            b"Crawl-delay: 10"
+            b"User-agent: * \nDisallow: /disallowed \nAllow: /allowed \nCrawl-delay: 10"
         )
         rp = self.parser_cls.from_crawler(
             crawler=None, robotstxt_body=robotstxt_robotstxt_body

@@ -140,7 +137,7 @@ class DecodeRobotsTxtTest(unittest.TestCase):
         self.assertEqual(decoded_content, "User-agent: *\nDisallow: /\n")
 
     def test_decode_non_utf8(self):
-        robotstxt_body = b"User-agent: *\n\xFFDisallow: /\n"
+        robotstxt_body = b"User-agent: *\n\xffDisallow: /\n"
         decoded_content = decode_robotstxt(robotstxt_body, spider=None)
         self.assertEqual(decoded_content, "User-agent: *\nDisallow: /\n")
 

@@ -107,9 +107,9 @@ class SelectorTestCase(unittest.TestCase):
         """Check that classes are using slots and are weak-referenceable"""
         x = Selector(text="")
         weakref.ref(x)
-        assert not hasattr(
-            x, "__dict__"
-        ), f"{x.__class__.__name__} does not use __slots__"
+        assert not hasattr(x, "__dict__"), (
+            f"{x.__class__.__name__} does not use __slots__"
+        )
 
     def test_selector_bad_args(self):
         with self.assertRaisesRegex(ValueError, "received both response and text"):

@@ -158,18 +158,18 @@ class ResponseUtilsTest(unittest.TestCase):
         )
 
         assert open_in_browser(r1, _openfunc=check_base_url), "Inject base url"
-        assert open_in_browser(
-            r2, _openfunc=check_base_url
-        ), "Inject base url with argumented head"
-        assert open_in_browser(
-            r3, _openfunc=check_base_url
-        ), "Inject unique base url with misleading tag"
-        assert open_in_browser(
-            r4, _openfunc=check_base_url
-        ), "Inject unique base url with misleading comment"
-        assert open_in_browser(
-            r5, _openfunc=check_base_url
-        ), "Inject unique base url with conditional comment"
+        assert open_in_browser(r2, _openfunc=check_base_url), (
+            "Inject base url with argumented head"
+        )
+        assert open_in_browser(r3, _openfunc=check_base_url), (
+            "Inject unique base url with misleading tag"
+        )
+        assert open_in_browser(r4, _openfunc=check_base_url), (
+            "Inject unique base url with misleading comment"
+        )
+        assert open_in_browser(r5, _openfunc=check_base_url), (
+            "Inject unique base url with conditional comment"
+        )
 
     def test_open_in_browser_redos_comment(self):
         MAX_CPU_TIME = 0.02

@@ -240,6 +240,6 @@ class ResponseUtilsTest(unittest.TestCase):
     ),
 )
 def test_remove_html_comments(input_body, output_body):
-    assert (
-        _remove_html_comments(input_body) == output_body
-    ), f"{_remove_html_comments(input_body)=} == {output_body=}"
+    assert _remove_html_comments(input_body) == output_body, (
+        f"{_remove_html_comments(input_body)=} == {output_body=}"
+    )

@@ -321,9 +321,9 @@ class GuessSchemeTest(unittest.TestCase):
 def create_guess_scheme_t(args):
     def do_expected(self):
         url = guess_scheme(args[0])
-        assert url.startswith(
-            args[1]
-        ), f"Wrong scheme guessed: for `{args[0]}` got `{url}`, expected `{args[1]}...`"
+        assert url.startswith(args[1]), (
+            f"Wrong scheme guessed: for `{args[0]}` got `{url}`, expected `{args[1]}...`"
+        )
 
     return do_expected
 