
Flexible severity of logging level when items are dropped (#6608)

anubhav 2025-01-15 15:38:18 +05:30 committed by GitHub
parent 1c1e83895c
commit ca345a3b73
5 changed files with 105 additions and 1 deletion

docs/topics/settings.rst

@@ -418,6 +418,38 @@ This setting also affects :setting:`DOWNLOAD_DELAY` and
:ref:`topics-autothrottle`: if :setting:`CONCURRENT_REQUESTS_PER_IP`
is non-zero, download delay is enforced per IP, not per domain.

.. setting:: DEFAULT_DROPITEM_LOG_LEVEL

DEFAULT_DROPITEM_LOG_LEVEL
--------------------------

Default: ``"WARNING"``

Default :ref:`log level <levels>` of messages about dropped items.

When an item is dropped by raising :exc:`scrapy.exceptions.DropItem` from the
:meth:`process_item` method of an :ref:`item pipeline <topics-item-pipeline>`,
a message is logged, and by default its log level is the one configured in this
setting.

You may specify this log level as an integer (e.g. ``20``), as a log level
constant (e.g. ``logging.INFO``) or as a string with the name of a log level
constant (e.g. ``"INFO"``).
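
For example, to make dropped-item messages less prominent, you could use any of
the three equivalent forms in your project settings (a minimal sketch of a
``settings.py``):

.. code-block:: python

    # settings.py -- "INFO", logging.INFO and 20 are all equivalent here
    DEFAULT_DROPITEM_LOG_LEVEL = "INFO"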
When writing an item pipeline, you can force a different log level by setting
:attr:`scrapy.exceptions.DropItem.log_level` in your
:exc:`scrapy.exceptions.DropItem` exception. For example:

.. code-block:: python

    from scrapy.exceptions import DropItem


    class MyPipeline:
        def process_item(self, item, spider):
            if not item.get("price"):
                raise DropItem("Missing price data", log_level="INFO")
            return item

.. setting:: DEFAULT_ITEM_CLASS

scrapy/exceptions.py

@@ -5,6 +5,8 @@ These exceptions are documented in docs/topics/exceptions.rst. Please don't add
new exceptions here without documenting them there.
"""

from __future__ import annotations

from typing import Any

# Internal

@@ -58,6 +60,10 @@ class StopDownload(Exception):
class DropItem(Exception):
    """Drop item from the item pipeline"""

    def __init__(self, message: str, log_level: str | None = None):
        super().__init__(message)
        self.log_level = log_level


class NotSupported(Exception):
    """Indicates a feature or method is not supported"""

scrapy/logformatter.py

@@ -120,8 +120,12 @@ class LogFormatter:
        spider: Spider,
    ) -> LogFormatterResult:
        """Logs a message when an item is dropped while it is passing through the item pipeline."""
        if (level := getattr(exception, "log_level", None)) is None:
            level = spider.crawler.settings["DEFAULT_DROPITEM_LOG_LEVEL"]
        if isinstance(level, str):
            level = getattr(logging, level)
        return {
-            "level": logging.WARNING,
+            "level": level,
            "msg": DROPPEDMSG,
            "args": {
                "exception": exception,

scrapy/settings/default_settings.py

@@ -49,6 +49,8 @@ CONCURRENT_REQUESTS_PER_IP = 0
COOKIES_ENABLED = True
COOKIES_DEBUG = False

DEFAULT_DROPITEM_LOG_LEVEL = "WARNING"

DEFAULT_ITEM_CLASS = "scrapy.item.Item"

DEFAULT_REQUEST_HEADERS = {

tests/test_logformatter.py

@@ -1,5 +1,7 @@
import logging
import unittest

import pytest
from testfixtures import LogCapture
from twisted.internet import defer
from twisted.python.failure import Failure

@@ -26,6 +28,7 @@ class LogFormatterTestCase(unittest.TestCase):
    def setUp(self):
        self.formatter = LogFormatter()
        self.spider = Spider("default")
        self.spider.crawler = get_crawler()

    def test_crawled_with_referer(self):
        req = Request("http://www.example.com")

@@ -68,6 +71,62 @@ class LogFormatterTestCase(unittest.TestCase):
        assert all(isinstance(x, str) for x in lines)
        self.assertEqual(lines, ["Dropped: \u2018", "{}"])
    def test_dropitem_default_log_level(self):
        item = {}
        exception = DropItem("Test drop")
        response = Response("http://www.example.com")
        spider = Spider("foo")
        spider.crawler = get_crawler(Spider)
        logkws = self.formatter.dropped(item, exception, response, spider)
        self.assertEqual(logkws["level"], logging.WARNING)

        spider.crawler.settings.frozen = False
        spider.crawler.settings["DEFAULT_DROPITEM_LOG_LEVEL"] = logging.INFO
        spider.crawler.settings.frozen = True
        logkws = self.formatter.dropped(item, exception, response, spider)
        self.assertEqual(logkws["level"], logging.INFO)

        spider.crawler.settings.frozen = False
        spider.crawler.settings["DEFAULT_DROPITEM_LOG_LEVEL"] = "INFO"
        spider.crawler.settings.frozen = True
        logkws = self.formatter.dropped(item, exception, response, spider)
        self.assertEqual(logkws["level"], logging.INFO)

        spider.crawler.settings.frozen = False
        spider.crawler.settings["DEFAULT_DROPITEM_LOG_LEVEL"] = 10
        spider.crawler.settings.frozen = True
        logkws = self.formatter.dropped(item, exception, response, spider)
        self.assertEqual(logkws["level"], logging.DEBUG)

        spider.crawler.settings.frozen = False
        spider.crawler.settings["DEFAULT_DROPITEM_LOG_LEVEL"] = 0
        spider.crawler.settings.frozen = True
        logkws = self.formatter.dropped(item, exception, response, spider)
        self.assertEqual(logkws["level"], logging.NOTSET)

        unsupported_value = object()
        spider.crawler.settings.frozen = False
        spider.crawler.settings["DEFAULT_DROPITEM_LOG_LEVEL"] = unsupported_value
        spider.crawler.settings.frozen = True
        logkws = self.formatter.dropped(item, exception, response, spider)
        self.assertEqual(logkws["level"], unsupported_value)
        with pytest.raises(TypeError):
            logging.log(logkws["level"], "message")

    def test_dropitem_custom_log_level(self):
        item = {}
        response = Response("http://www.example.com")
        exception = DropItem("Test drop", log_level="INFO")
        logkws = self.formatter.dropped(item, exception, response, self.spider)
        self.assertEqual(logkws["level"], logging.INFO)

        exception = DropItem("Test drop", log_level="ERROR")
        logkws = self.formatter.dropped(item, exception, response, self.spider)
        self.assertEqual(logkws["level"], logging.ERROR)
    def test_item_error(self):
        # In practice, the complete traceback is shown by passing the
        # 'exc_info' argument to the logging function

@@ -145,6 +204,7 @@ class LogformatterSubclassTest(LogFormatterTestCase):
    def setUp(self):
        self.formatter = LogFormatterSubclass()
        self.spider = Spider("default")
        self.spider.crawler = get_crawler(Spider)

    def test_crawled_with_referer(self):
        req = Request("http://www.example.com")