
scrapy parse: fix the signature of callbacks from the CLI (#6182)

Adrián Chaves 2024-01-15 13:37:03 +01:00 committed by GitHub
parent fa0c598096
commit e8dadb9592
3 changed files with 51 additions and 30 deletions
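
The gist of the fix: scrapy parse wraps the real spider callback in a generic callback(response, **cb_kwargs) closure, so anything inspecting request.callback used to see the wrapper's signature rather than the spider's. Copying the real callback's metadata onto the wrapper with functools.update_wrapper() (which also sets __wrapped__, and inspect.signature() follows __wrapped__ by default) restores the original signature. A minimal standalone sketch of the technique, with parse_item as a hypothetical callback:

    import functools
    import inspect

    def parse_item(response, foo=None, key=None):  # hypothetical spider callback
        return response

    def callback(response, **cb_kwargs):  # generic wrapper, as in prepare_request()
        return parse_item(response, **cb_kwargs)

    print(inspect.signature(callback))  # (response, **cb_kwargs)
    functools.update_wrapper(callback, parse_item)
    print(inspect.signature(callback))  # (response, foo=None, key=None)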

scrapy/commands/parse.py

@@ -1,3 +1,4 @@
+import functools
 import inspect
 import json
 import logging
@@ -251,39 +252,40 @@ class Command(BaseRunSpiderCommand):
         return scraped_data

+    def _get_callback(self, *, spider, opts, response=None):
+        cb = None
+        if response:
+            cb = response.meta["_callback"]
+        if not cb:
+            if opts.callback:
+                cb = opts.callback
+            elif response and opts.rules and self.first_response == response:
+                cb = self.get_callback_from_rules(spider, response)
+                if not cb:
+                    raise ValueError(
+                        f"Cannot find a rule that matches {response.url!r} in spider: "
+                        f"{spider.name}"
+                    )
+            else:
+                cb = "parse"
+        if not callable(cb):
+            cb_method = getattr(spider, cb, None)
+            if callable(cb_method):
+                cb = cb_method
+            else:
+                raise ValueError(
+                    f"Cannot find callback {cb!r} in spider: {spider.name}"
+                )
+        return cb
+
     def prepare_request(self, spider, request, opts):
         def callback(response, **cb_kwargs):
             # memorize first request
             if not self.first_response:
                 self.first_response = response

             # determine real callback
-            cb = response.meta["_callback"]
-            if not cb:
-                if opts.callback:
-                    cb = opts.callback
-                elif opts.rules and self.first_response == response:
-                    cb = self.get_callback_from_rules(spider, response)
-                    if not cb:
-                        logger.error(
-                            "Cannot find a rule that matches %(url)r in spider: %(spider)s",
-                            {"url": response.url, "spider": spider.name},
-                        )
-                        return
-                else:
-                    cb = "parse"
-            if not callable(cb):
-                cb_method = getattr(spider, cb, None)
-                if callable(cb_method):
-                    cb = cb_method
-                else:
-                    logger.error(
-                        "Cannot find callback %(callback)r in spider: %(spider)s",
-                        {"callback": cb, "spider": spider.name},
-                    )
-                    return
+            cb = self._get_callback(spider=spider, opts=opts, response=response)

             # parse items and requests
             depth = response.meta["_depth"]
@@ -303,6 +305,9 @@ class Command(BaseRunSpiderCommand):
         request.meta["_depth"] = 1
         request.meta["_callback"] = request.callback
+        if not request.callback and not opts.rules:
+            cb = self._get_callback(spider=spider, opts=opts)
+            functools.update_wrapper(callback, cb)
         request.callback = callback
         return request
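
With the refactor, callback resolution lives in one place and follows this order: the _callback stored in response.meta, then the -c/--callback command-line option, then a matching CrawlSpider rule when --rules is passed, and finally the spider's parse method; resolution failures now raise ValueError instead of only logging an error mid-crawl. A hypothetical invocation exercising the -c path (spider name, URL, and kwargs invented for illustration):

    scrapy parse --spider=myspider -c parse_item --cbkwargs='{"foo": "bar"}' https://example.com/item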

tests/test_command_check.py

@@ -16,11 +16,11 @@ import scrapy
 class CheckSpider(scrapy.Spider):
     name = '{self.spider_name}'
-    start_urls = ['http://toscrape.com']
+    start_urls = ['data:,']

     def parse(self, response, **cb_kwargs):
         \"\"\"
-        @url http://toscrape.com
+        @url data:,
         {contracts}
         \"\"\"
         {parse_def}
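
The contract test now targets data:, instead of a live site, so scrapy check runs without network access; Scrapy resolves data: URIs locally through its data-URI download handler, which builds on w3lib. A small sketch of what such a URI parses to (data:, is the smallest valid data URI):

    from w3lib.url import parse_data_uri

    uri = parse_data_uri("data:,")
    print(uri.media_type)  # text/plain (the RFC 2397 default)
    print(uri.data)        # b''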

tests/test_command_parse.py

@@ -78,9 +78,21 @@ class AsyncDefAsyncioGenExcSpider(scrapy.Spider):
                 if i > 5:
                     raise ValueError("Stopping the processing")

+class CallbackSignatureDownloaderMiddleware:
+    def process_request(self, request, spider):
+        from inspect import signature
+        spider.logger.debug(f"request.callback signature: {{signature(request.callback)}}")
+
+
 class MySpider(scrapy.Spider):
     name = '{self.spider_name}'

+    custom_settings = {{
+        "DOWNLOADER_MIDDLEWARES": {{
+            CallbackSignatureDownloaderMiddleware: 0,
+        }}
+    }}
+
     def parse(self, response):
         if getattr(self, 'test_arg', None):
             self.logger.debug('It Works!')
@@ -220,7 +232,11 @@ ITEM_PIPELINES = {{'{self.project_name}.pipelines.MyPipeline': 1}}
                 self.url("/html"),
             ]
         )
-        self.assertIn("DEBUG: It Works!", _textmode(stderr))
+        log = _textmode(stderr)
+        self.assertIn("DEBUG: It Works!", log)
+        self.assertIn(
+            "DEBUG: request.callback signature: (response, foo=None, key=None)", log
+        )

     @defer.inlineCallbacks
     def test_request_without_meta(self):
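
A detail worth noting in the asserted log line: the expected signature (response, foo=None, key=None) has no self because inspect.signature() on a bound method excludes it, and update_wrapper() makes the CLI wrapper report the bound spider method's signature. A standalone sketch with a hypothetical class:

    import inspect

    class Demo:
        def parse(self, response, foo=None, key=None):
            pass

    print(inspect.signature(Demo().parse))  # (response, foo=None, key=None)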