From 86601e2b592a1d0e086428aaaac96af1d27b584f Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 19 Jun 2026 18:04:05 +0000 Subject: [PATCH 1/3] ref: Silence transient Sentry errors at the source Several high-volume Snuba errors are operational/lifecycle noise that recover on their own and aren't actionable. Filter them out where the Sentry events are created instead of muting them in the Sentry UI. Python (snuba/environment.py): - ignore_logger("datadog.dogstatsd"): drops metrics-transport warnings like "[Errno 111] Connection refused, dropping the packet ..." emitted when the local statsd agent socket is briefly unavailable (SNUBA-A3N). - before_send now drops "Commit failed" logs from arroyo.backends.kafka.consumer that carry a transient rebalance code (UNKNOWN_MEMBER_ID / REBALANCE_IN_PROGRESS / ILLEGAL_GENERATION). These self-heal once the new consumer generation re-commits (SNUBA-9WM). Rust (rust_snuba/src/logging.rs): - Downgrade ERROR/WARN from sentry_usage_accountant to logs only; the usage accountant is a best-effort billing side channel whose Kafka producer logs "Purged in queue/flight" on shutdown/rebalance (SNUBA-474, SNUBA-475). - Downgrade arroyo run_task_in_threads / reduce task-join timeout WARNs (logged while draining work during shutdown) to logs only (SNUBA-4VF, SNUBA-4WS). All of these remain visible as Sentry logs/breadcrumbs; they just no longer create ongoing issues. Genuine errors from the same modules (non-rebalance commit failures, arroyo ERRORs) are still reported. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01XRxGfhiUKoyuTUQsJrBahM --- rust_snuba/src/logging.rs | 29 ++++++++++++++++++++--- snuba/environment.py | 38 +++++++++++++++++++++++++++++- tests/test_environment.py | 49 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+), 4 deletions(-) create mode 100644 tests/test_environment.py diff --git a/rust_snuba/src/logging.rs b/rust_snuba/src/logging.rs index 49eecfa2e5b..b272a08a174 100644 --- a/rust_snuba/src/logging.rs +++ b/rust_snuba/src/logging.rs @@ -11,12 +11,35 @@ pub fn setup_logging() { // Capture errors & warnings as exceptions, and also send everything at or above INFO as logs // instead of breadcrumbs. - let sentry_layer = - sentry::integrations::tracing::layer().event_filter(|metadata| match *metadata.level() { + // + // A few targets emit high-volume transient operational noise that we keep as + // logs but don't want to surface as Sentry issues. Match on the tracing + // target (the emitting module path) so these stay observable without + // creating ongoing issues. + let sentry_layer = sentry::integrations::tracing::layer().event_filter(|metadata| { + let target = metadata.target(); + match *metadata.level() { + // The usage accountant is a best-effort billing side channel. Its + // Kafka producer logs "Purged in queue/flight" errors whenever the + // producer is flushed during a consumer shutdown/rebalance. These + // are transient and don't affect ingestion (SNUBA-474, SNUBA-475). + Level::ERROR | Level::WARN if target.starts_with("sentry_usage_accountant") => { + EventFilter::Log + } + // Arroyo logs task-join timeouts at WARN while draining in-flight + // work during shutdown. Expected on every deploy/rebalance, so + // they're not actionable (SNUBA-4VF, SNUBA-4WS). 
+ Level::WARN + if target == "sentry_arroyo::processing::strategies::run_task_in_threads" + || target == "sentry_arroyo::processing::strategies::reduce" => + { + EventFilter::Log + } Level::ERROR | Level::WARN => EventFilter::Event | EventFilter::Log, Level::INFO => EventFilter::Log, Level::DEBUG | Level::TRACE => EventFilter::Ignore, - }); + } + }); tracing_subscriber::registry() .with(tracing_subscriber::fmt::layer().json()) diff --git a/snuba/environment.py b/snuba/environment.py index 1850c346a5c..4166f95dc0e 100644 --- a/snuba/environment.py +++ b/snuba/environment.py @@ -8,7 +8,7 @@ import structlog from sentry_sdk.integrations.flask import FlaskIntegration from sentry_sdk.integrations.gnu_backtrace import GnuBacktraceIntegration -from sentry_sdk.integrations.logging import LoggingIntegration +from sentry_sdk.integrations.logging import LoggingIntegration, ignore_logger from sentry_sdk.integrations.redis import RedisIntegration from sentry_sdk.integrations.threading import ThreadingIntegration from sentry_sdk.types import Event, Hint @@ -73,8 +73,37 @@ def setup_logging(level: Optional[str] = None) -> None: ) +# Kafka error codes that indicate the consumer was (briefly) evicted from its +# group during a rebalance. Offset commits that fail with one of these self-heal +# once the new generation re-commits, so the resulting "Commit failed" logs are +# transient operational noise rather than actionable errors (see SNUBA-9WM). 
+_TRANSIENT_REBALANCE_CODES = ( + "UNKNOWN_MEMBER_ID", + "REBALANCE_IN_PROGRESS", + "ILLEGAL_GENERATION", +) + + +def _is_transient_rebalance_commit_log(hint: Hint) -> bool: + record = hint.get("log_record") + if not isinstance(record, logging.LogRecord): + return False + if record.name != "arroyo.backends.kafka.consumer": + return False + message = record.getMessage() + return "Commit failed" in message and any( + code in message for code in _TRANSIENT_REBALANCE_CODES + ) + + def before_send(event: Event, hint: Hint) -> Event | None: """Filter out AllocationPolicyViolations and RPCAllocationPolicyException from being sent to Sentry""" + # Drop transient Kafka offset-commit failures caused by consumer-group + # rebalances. They are expected during deploys/scaling and recover on their + # own, so they should not be surfaced as Sentry issues (SNUBA-9WM). + if _is_transient_rebalance_commit_log(hint): + return None + if "exc_info" in hint: _, exc_value, _ = hint["exc_info"] # Check if it's an AllocationPolicyViolations in the cause chain @@ -117,6 +146,13 @@ def setup_sentry() -> None: }, ) + # The dogstatsd client logs "[Errno 111] Connection refused" (and similar) + # at WARNING whenever the local statsd agent socket is briefly unavailable + # (agent restarts, socket churn). These are metrics-transport failures that + # don't affect query handling, so don't turn them into Sentry issues + # (SNUBA-A3N). 
+ ignore_logger("datadog.dogstatsd") + from snuba.utils.profiler import run_ondemand_profiler if settings.SENTRY_DSN is not None: diff --git a/tests/test_environment.py b/tests/test_environment.py new file mode 100644 index 00000000000..67aac61ace4 --- /dev/null +++ b/tests/test_environment.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +import logging +from typing import cast + +from sentry_sdk.types import Event, Hint + +from snuba.environment import before_send + + +def _log_record(name: str, message: str) -> logging.LogRecord: + return logging.LogRecord( + name=name, + level=logging.ERROR, + pathname=__file__, + lineno=1, + msg=message, + args=(), + exc_info=None, + ) + + +def _hint(record: logging.LogRecord) -> Hint: + return cast(Hint, {"log_record": record}) + + +def test_before_send_drops_transient_rebalance_commit_failures() -> None: + for code in ("UNKNOWN_MEMBER_ID", "REBALANCE_IN_PROGRESS", "ILLEGAL_GENERATION"): + record = _log_record( + "arroyo.backends.kafka.consumer", + f'Commit failed: KafkaError{{code={code},val=25,str="Broker: ..."}}. 
' + "Partitions: ['group-attributes:8']", + ) + assert before_send(cast(Event, {}), _hint(record)) is None + + +def test_before_send_keeps_genuine_kafka_commit_failures() -> None: + record = _log_record( + "arroyo.backends.kafka.consumer", + "Commit failed: KafkaError{code=BROKER_NOT_AVAILABLE}", + ) + event = cast(Event, {"message": "Commit failed"}) + assert before_send(event, _hint(record)) is event + + +def test_before_send_keeps_other_loggers() -> None: + record = _log_record("snuba.something", "Commit failed: UNKNOWN_MEMBER_ID") + event = cast(Event, {"message": "boom"}) + assert before_send(event, _hint(record)) is event From acf87ba57441329bde561d4abcfdc6fa417fa103 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 19 Jun 2026 18:28:49 +0000 Subject: [PATCH 2/3] Stop forwarding WARN logs to Sentry as issues; keep them as logs Switch from the targeted per-source filters to a blanket policy: only ERROR and above become Sentry issues. Warnings are operational noise far more often than they're actionable (consumer rebalance/shutdown timeouts, transient transport failures, etc.) and remain captured as logs/breadcrumbs. This dissolves the fragile message/target string matching the previous approach relied on: - Python (environment.py): LoggingIntegration event_level WARNING -> ERROR. Removes the datadog.dogstatsd ignore_logger (SNUBA-A3N) and the arroyo "Commit failed" rebalance before_send filter (SNUBA-9WM) -- both are WARN, so the policy now covers them. - Rust (logging.rs): WARN -> EventFilter::Log instead of Event. Drops the arroyo run_task_in_threads/reduce target matching (SNUBA-4VF, SNUBA-4WS, also WARN). The usage accountant errors (SNUBA-474, SNUBA-475) are ERROR-level, so they still need an explicit downgrade: sentry_usage_accountant ERROR -> Log is retained. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01XRxGfhiUKoyuTUQsJrBahM --- rust_snuba/src/logging.rs | 32 ++++++++----------------- snuba/environment.py | 44 +++++------------------------------ tests/test_environment.py | 49 --------------------------------------- 3 files changed, 16 insertions(+), 109 deletions(-) delete mode 100644 tests/test_environment.py diff --git a/rust_snuba/src/logging.rs b/rust_snuba/src/logging.rs index b272a08a174..ff562ac1aa1 100644 --- a/rust_snuba/src/logging.rs +++ b/rust_snuba/src/logging.rs @@ -9,34 +9,22 @@ pub fn setup_logging() { .or_else(|_| EnvFilter::try_new("info")) .unwrap(); - // Capture errors & warnings as exceptions, and also send everything at or above INFO as logs - // instead of breadcrumbs. - // - // A few targets emit high-volume transient operational noise that we keep as - // logs but don't want to surface as Sentry issues. Match on the tracing - // target (the emitting module path) so these stay observable without - // creating ongoing issues. + // Only errors are forwarded to Sentry as issues. Warnings are operational + // noise far more often than they're actionable (e.g. consumer + // rebalance/shutdown timeouts), so they're kept as logs alongside + // everything at or above INFO, instead of breadcrumbs. let sentry_layer = sentry::integrations::tracing::layer().event_filter(|metadata| { - let target = metadata.target(); match *metadata.level() { // The usage accountant is a best-effort billing side channel. Its - // Kafka producer logs "Purged in queue/flight" errors whenever the + // Kafka producer logs "Purged in queue/flight" at ERROR whenever the // producer is flushed during a consumer shutdown/rebalance. These - // are transient and don't affect ingestion (SNUBA-474, SNUBA-475). 
- Level::ERROR | Level::WARN if target.starts_with("sentry_usage_accountant") => { + // are transient and don't affect ingestion, so keep them as logs + // rather than issues (SNUBA-474, SNUBA-475). + Level::ERROR if metadata.target().starts_with("sentry_usage_accountant") => { EventFilter::Log } - // Arroyo logs task-join timeouts at WARN while draining in-flight - // work during shutdown. Expected on every deploy/rebalance, so - // they're not actionable (SNUBA-4VF, SNUBA-4WS). - Level::WARN - if target == "sentry_arroyo::processing::strategies::run_task_in_threads" - || target == "sentry_arroyo::processing::strategies::reduce" => - { - EventFilter::Log - } - Level::ERROR | Level::WARN => EventFilter::Event | EventFilter::Log, - Level::INFO => EventFilter::Log, + Level::ERROR => EventFilter::Event | EventFilter::Log, + Level::WARN | Level::INFO => EventFilter::Log, Level::DEBUG | Level::TRACE => EventFilter::Ignore, } }); diff --git a/snuba/environment.py b/snuba/environment.py index 4166f95dc0e..de1ac845ef8 100644 --- a/snuba/environment.py +++ b/snuba/environment.py @@ -8,7 +8,7 @@ import structlog from sentry_sdk.integrations.flask import FlaskIntegration from sentry_sdk.integrations.gnu_backtrace import GnuBacktraceIntegration -from sentry_sdk.integrations.logging import LoggingIntegration, ignore_logger +from sentry_sdk.integrations.logging import LoggingIntegration from sentry_sdk.integrations.redis import RedisIntegration from sentry_sdk.integrations.threading import ThreadingIntegration from sentry_sdk.types import Event, Hint @@ -73,37 +73,8 @@ def setup_logging(level: Optional[str] = None) -> None: ) -# Kafka error codes that indicate the consumer was (briefly) evicted from its -# group during a rebalance. Offset commits that fail with one of these self-heal -# once the new generation re-commits, so the resulting "Commit failed" logs are -# transient operational noise rather than actionable errors (see SNUBA-9WM). 
-_TRANSIENT_REBALANCE_CODES = ( - "UNKNOWN_MEMBER_ID", - "REBALANCE_IN_PROGRESS", - "ILLEGAL_GENERATION", -) - - -def _is_transient_rebalance_commit_log(hint: Hint) -> bool: - record = hint.get("log_record") - if not isinstance(record, logging.LogRecord): - return False - if record.name != "arroyo.backends.kafka.consumer": - return False - message = record.getMessage() - return "Commit failed" in message and any( - code in message for code in _TRANSIENT_REBALANCE_CODES - ) - - def before_send(event: Event, hint: Hint) -> Event | None: """Filter out AllocationPolicyViolations and RPCAllocationPolicyException from being sent to Sentry""" - # Drop transient Kafka offset-commit failures caused by consumer-group - # rebalances. They are expected during deploys/scaling and recover on their - # own, so they should not be surfaced as Sentry issues (SNUBA-9WM). - if _is_transient_rebalance_commit_log(hint): - return None - if "exc_info" in hint: _, exc_value, _ = hint["exc_info"] # Check if it's an AllocationPolicyViolations in the cause chain @@ -129,7 +100,11 @@ def setup_sentry() -> None: integrations=[ FlaskIntegration(), GnuBacktraceIntegration(), - LoggingIntegration(event_level=logging.WARNING), + # Only forward ERROR and above to Sentry as issues. Warnings are + # operational noise far more often than they're actionable (e.g. + # consumer rebalance/shutdown timeouts) and are still captured as + # logs/breadcrumbs. + LoggingIntegration(event_level=logging.ERROR), RedisIntegration(), ThreadingIntegration(propagate_hub=True), ], @@ -146,13 +121,6 @@ def setup_sentry() -> None: }, ) - # The dogstatsd client logs "[Errno 111] Connection refused" (and similar) - # at WARNING whenever the local statsd agent socket is briefly unavailable - # (agent restarts, socket churn). These are metrics-transport failures that - # don't affect query handling, so don't turn them into Sentry issues - # (SNUBA-A3N). 
- ignore_logger("datadog.dogstatsd") - from snuba.utils.profiler import run_ondemand_profiler if settings.SENTRY_DSN is not None: diff --git a/tests/test_environment.py b/tests/test_environment.py deleted file mode 100644 index 67aac61ace4..00000000000 --- a/tests/test_environment.py +++ /dev/null @@ -1,49 +0,0 @@ -from __future__ import annotations - -import logging -from typing import cast - -from sentry_sdk.types import Event, Hint - -from snuba.environment import before_send - - -def _log_record(name: str, message: str) -> logging.LogRecord: - return logging.LogRecord( - name=name, - level=logging.ERROR, - pathname=__file__, - lineno=1, - msg=message, - args=(), - exc_info=None, - ) - - -def _hint(record: logging.LogRecord) -> Hint: - return cast(Hint, {"log_record": record}) - - -def test_before_send_drops_transient_rebalance_commit_failures() -> None: - for code in ("UNKNOWN_MEMBER_ID", "REBALANCE_IN_PROGRESS", "ILLEGAL_GENERATION"): - record = _log_record( - "arroyo.backends.kafka.consumer", - f'Commit failed: KafkaError{{code={code},val=25,str="Broker: ..."}}. 
' - "Partitions: ['group-attributes:8']", - ) - assert before_send(cast(Event, {}), _hint(record)) is None - - -def test_before_send_keeps_genuine_kafka_commit_failures() -> None: - record = _log_record( - "arroyo.backends.kafka.consumer", - "Commit failed: KafkaError{code=BROKER_NOT_AVAILABLE}", - ) - event = cast(Event, {"message": "Commit failed"}) - assert before_send(event, _hint(record)) is event - - -def test_before_send_keeps_other_loggers() -> None: - record = _log_record("snuba.something", "Commit failed: UNKNOWN_MEMBER_ID") - event = cast(Event, {"message": "boom"}) - assert before_send(event, _hint(record)) is event From 4b77bc4b64d623db94da0b77ee29478ee8823715 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 19 Jun 2026 18:33:50 +0000 Subject: [PATCH 3/3] Make WARN->logs policy explicit on the structlog SentryProcessor path LoggingIntegration(event_level=ERROR) only covers the stdlib logging path. structlog logs go through structlog-sentry's SentryProcessor, which is now pinned to event_level=ERROR as well so WARNING-level structlog events stay as logs/breadcrumbs rather than Sentry issues. Note that structlog-sentry's own default event_level is WARNING, so this is a behavior change on the structlog path rather than a no-op; setting it explicitly keeps the policy consistent across both logging paths and robust to upstream default changes. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01XRxGfhiUKoyuTUQsJrBahM --- snuba/environment.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/snuba/environment.py b/snuba/environment.py index de1ac845ef8..b8311e27540 100644 --- a/snuba/environment.py +++ b/snuba/environment.py @@ -66,7 +66,11 @@ def setup_logging(level: Optional[str] = None) -> None: structlog.processors.StackInfoRenderer(), structlog.processors.format_exc_info, structlog.processors.TimeStamper(fmt="iso", utc=True), - SentryProcessor(), + # Mirror the LoggingIntegration policy on the structlog path: only + # ERROR and above become Sentry issues, WARNING and below stay as + # logs/breadcrumbs. (structlog-sentry's own default is WARNING, so + # this override is what actually enforces the policy on this path.) + SentryProcessor(event_level=logging.ERROR), drop_level, JSONRenderer(), ],