diff --git a/cloud_pipelines_backend/instrumentation/error_normalization.py b/cloud_pipelines_backend/instrumentation/error_normalization.py
index 41d2977..06b74b7 100644
--- a/cloud_pipelines_backend/instrumentation/error_normalization.py
+++ b/cloud_pipelines_backend/instrumentation/error_normalization.py
@@ -16,9 +16,14 @@
     r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", re.IGNORECASE
 )
 _LONG_ALNUM_ID_PATTERN = re.compile(r"\b[a-zA-Z0-9]{16,}\b")
+# Matches from the first `{"` or `{'` (an embedded JSON object or Python dict
+# literal) through the END of the message, so any text after the dict is dropped
+# too. Such dumps (e.g. full Kubernetes pod specs) would fragment error groups.
+_JSON_OBJECT_PATTERN = re.compile(r"\{['\"].*", re.DOTALL)
 
 
 def _strip_generic(*, message: str) -> str:
+    message = _JSON_OBJECT_PATTERN.sub("{...}", message)
     message = _OBJECT_REPR_PATTERN.sub("{object}", message)
     message = _HEX_ADDRESS_PATTERN.sub("{addr}", message)
     message = _UUID_PATTERN.sub("{uuid}", message)
@@ -85,6 +90,28 @@ def _normalize_orchestrator_error(*, exception: BaseException) -> str | None:
     return f"OrchestratorError: {message}"
 
 
+def _normalize_launcher_error(*, exception: BaseException) -> str | None:
+    """Return a grouping string for a LauncherError, or None for anything else.
+
+    Returns None when ``exception`` is not a ``LauncherError`` or when the
+    launchers package cannot be imported, so the caller can fall through to
+    the next normalizer.
+    """
+    try:
+        from ..launchers.interfaces import LauncherError
+    except ImportError:
+        return None
+    if not isinstance(exception, LauncherError):
+        return None
+    # Take only the verb phrase before the first colon to drop any embedded
+    # serialized data (e.g. the full Kubernetes pod spec appended after ": ").
+    head, _, _ = str(exception).partition(":")
+    # The head can still carry run-specific tokens (UUIDs, addresses, long ids),
+    # so pass it through the same generic scrubbing as other messages.
+    head = _strip_generic(message=head.strip())
+    return f"LauncherError: {head}"
+
+
 def normalize_error_message(*, exception: BaseException) -> str:
     """Return a stable normalized string for error grouping."""
     for normalizer in (
@@ -92,6 +119,7 @@ def normalize_error_message(*, exception: BaseException) -> str:
         _normalize_max_retry_error,
         _normalize_unicode_decode_error,
         _normalize_orchestrator_error,
+        _normalize_launcher_error,
     ):
         result = normalizer(exception=exception)
         if result is not None:
diff --git a/tests/instrumentation/test_error_normalization.py b/tests/instrumentation/test_error_normalization.py
index dc01b55..f12489f 100644
--- a/tests/instrumentation/test_error_normalization.py
+++ b/tests/instrumentation/test_error_normalization.py
@@ -184,6 +184,50 @@ def test_strips_object_repr(self):
         )
 
 
+class TestNormalizeLauncherError:
+    def _make_launcher_error(
+        self, message: str, cause: BaseException | None = None
+    ) -> Exception:
+        """Build a LauncherError, chained to ``cause`` when one is given."""
+        try:
+            from cloud_pipelines_backend.launchers.interfaces import LauncherError
+        except ImportError:
+            pytest.skip("LauncherError not importable")
+        if cause is not None:
+            # Raise and catch so the exception carries __cause__ and a traceback.
+            try:
+                raise LauncherError(message) from cause
+            except LauncherError as exc:
+                return exc
+        return LauncherError(message)
+
+    def test_strips_pod_spec_json(self):
+        pod_spec = (
+            "{'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'name': 'task-abc-xyz'}}"
+        )
+        exc = self._make_launcher_error(f"Failed to create pod: {pod_spec}")
+        result = error_normalization.normalize_error_message(exception=exc)
+        assert result == "LauncherError: Failed to create pod"
+
+    def test_with_timeout_cause(self):
+        cause = TimeoutError("The read operation timed out")
+        exc = self._make_launcher_error("Failed to create pod: {big spec}", cause=cause)
+        result = error_normalization.normalize_error_message(exception=exc)
+        assert result == "LauncherError: Failed to create pod"
+
+    def test_no_colon_in_message(self):
+        exc = self._make_launcher_error("launch failed")
+        result = error_normalization.normalize_error_message(exception=exc)
+        assert result == "LauncherError: launch failed"
+
+    def test_scrubs_uuid_in_head(self):
+        exc = self._make_launcher_error(
+            "Failed to create pod 123e4567-e89b-12d3-a456-426614174000: {spec}"
+        )
+        result = error_normalization.normalize_error_message(exception=exc)
+        assert result == "LauncherError: Failed to create pod {uuid}"
+
+
 class TestFallback:
     def test_strips_hex_address(self):
         exc = ValueError("object at 0xdeadbeef failed")
@@ -204,3 +248,13 @@ def test_stable_message_unchanged(self):
         exc = AttributeError("'NoneType' object has no attribute 'encode'")
         result = error_normalization.normalize_error_message(exception=exc)
         assert result == "AttributeError: 'NoneType' object has no attribute 'encode'"
+
+    def test_strips_json_object(self):
+        exc = RuntimeError("operation failed: {'key': 'value', 'nested': {'a': 1}}")
+        result = error_normalization.normalize_error_message(exception=exc)
+        assert result == "RuntimeError: operation failed: {...}"
+
+    def test_strips_json_object_double_quotes(self):
+        exc = RuntimeError('operation failed: {"key": "value"}')
+        result = error_normalization.normalize_error_message(exception=exc)
+        assert result == "RuntimeError: operation failed: {...}"