Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 25 additions & 2 deletions cloud_pipelines_backend/instrumentation/execution_tracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from .. import backend_types_sql as bts
from ..launchers import common_annotations
from ..launchers import kubernetes_launchers

# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)
# Tracer used to start the spans emitted by this module.
# NOTE(review): `trace` is presumably `opentelemetry.trace`; its import is not visible in this hunk — confirm.
_tracer = trace.get_tracer("tangle.orchestrator")
Expand Down Expand Up @@ -144,6 +145,23 @@ def _pipeline_attrs(*, execution: bts.ExecutionNode) -> dict[str, object]:
return attrs


def _resource_attrs(*, execution: bts.ExecutionNode, status: str) -> dict[str, object]:
    """Return CPU, memory, and accelerator requests for the PENDING span.

    Args:
        execution: Execution node whose ``task_spec["annotations"]`` is read.
        status: Status of the span being built; anything other than PENDING
            yields an empty dict.

    Returns:
        A dict holding ``execution.resources.cpu``, ``execution.resources.memory``
        and ``execution.resources.accelerators`` for each annotation that is
        present with a truthy value. Empty when *status* is not PENDING or no
        resource annotation is set.
    """
    if status != bts.ContainerExecutionStatus.PENDING:
        return {}
    # ``.get("annotations", {})`` only covers a missing key; if the key is
    # present with a None value the lookups below would raise AttributeError.
    # ``or {}`` handles both cases, mirroring the guard on ``task_spec``.
    annotations: dict = (execution.task_spec or {}).get("annotations") or {}
    annotation_key_by_attr = {
        "execution.resources.cpu": kubernetes_launchers.RESOURCES_CPU_ANNOTATION_KEY,
        "execution.resources.memory": kubernetes_launchers.RESOURCES_MEMORY_ANNOTATION_KEY,
        "execution.resources.accelerators": (
            kubernetes_launchers.RESOURCES_ACCELERATORS_ANNOTATION_KEY
        ),
    }
    # Falsy values (missing key, empty string) are skipped, as before.
    return {
        attr_name: value
        for attr_name, annotation_key in annotation_key_by_attr.items()
        if (value := annotations.get(annotation_key))
    }


def _ns(*, dt: datetime.datetime) -> int:
"""Return *dt* as nanoseconds since the Unix epoch (required by OTel SDK)."""
if dt.tzinfo is None:
Expand Down Expand Up @@ -189,13 +207,18 @@ def emit_execution_trace(*, execution: bts.ExecutionNode) -> None:
"execution.status": entry["status"],
**_error_attrs(execution=execution, status=entry["status"]),
**_launcher_pod_attrs(execution=execution, status=entry["status"]),
**_resource_attrs(execution=execution, status=entry["status"]),
}
start_ns = _ns(dt=t_start)
end_ns = _ns(dt=t_end)
if end_ns <= start_ns:
end_ns = start_ns + 1
_tracer.start_span(
f"execution.status {entry['status']}",
context=root_ctx,
attributes=attrs,
start_time=_ns(dt=t_start),
).end(end_time=_ns(dt=t_end))
start_time=start_ns,
).end(end_time=end_ns)

if history[-1]["status"] in _ERROR_TERMINAL_STATUSES:
root.set_status(status=StatusCode.ERROR)
Expand Down
2 changes: 1 addition & 1 deletion cloud_pipelines_backend/instrumentation/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,4 +103,4 @@ def _handle_before_commit(session: orm.Session) -> None:
exc_info=True,
)
obj._status_changed = False
execution_tracing.try_emit_execution_trace(execution=obj)
execution_tracing.emit_execution_trace(execution=obj)
65 changes: 61 additions & 4 deletions tests/instrumentation/test_execution_tracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ def test_cache_miss_sets_hit_false(
self, span_exporter: InMemorySpanExporter
) -> None:
execution = _make_execution(statuses=["QUEUED", "SUCCEEDED"])
execution_tracing.try_emit_execution_trace(execution=execution)
execution_tracing.emit_execution_trace(execution=execution)

root = next(
s for s in span_exporter.get_finished_spans() if s.name == "execution"
Expand All @@ -360,7 +360,7 @@ def test_cache_hit_sets_hit_true_and_reused_from_id(
statuses=["QUEUED", "SUCCEEDED"],
extra={"reused_from_execution_node_id": "source-execution-id"},
)
execution_tracing.try_emit_execution_trace(execution=execution)
execution_tracing.emit_execution_trace(execution=execution)

root = next(
s for s in span_exporter.get_finished_spans() if s.name == "execution"
Expand All @@ -378,7 +378,7 @@ def test_root_span_carries_parent_and_task_id(
execution = _make_execution(statuses=["QUEUED", "SUCCEEDED"])
execution.parent_execution_id = "parent-exec-id"
execution.task_id_in_parent_execution = "my-task"
execution_tracing.try_emit_execution_trace(execution=execution)
execution_tracing.emit_execution_trace(execution=execution)

root = next(
s for s in span_exporter.get_finished_spans() if s.name == "execution"
Expand All @@ -390,10 +390,67 @@ def test_root_execution_omits_parent_attrs_when_absent(
self, span_exporter: InMemorySpanExporter
) -> None:
execution = _make_execution(statuses=["QUEUED", "SUCCEEDED"])
execution_tracing.try_emit_execution_trace(execution=execution)
execution_tracing.emit_execution_trace(execution=execution)

root = next(
s for s in span_exporter.get_finished_spans() if s.name == "execution"
)
assert "execution.parent_id" not in (root.attributes or {})
assert "execution.task_id" not in (root.attributes or {})


class TestResourceAttrs:
    """Resource-request annotations are surfaced on the PENDING span only."""

    def test_pending_span_carries_cpu_and_memory(
        self, span_exporter: InMemorySpanExporter
    ) -> None:
        """CPU and memory annotations appear as attributes on the PENDING span."""
        from cloud_pipelines_backend.launchers import kubernetes_launchers

        execution = _make_execution(
            statuses=["QUEUED", "PENDING", "RUNNING", "SUCCEEDED"]
        )
        execution.task_spec = {
            "annotations": {
                kubernetes_launchers.RESOURCES_CPU_ANNOTATION_KEY: "4",
                kubernetes_launchers.RESOURCES_MEMORY_ANNOTATION_KEY: "16Gi",
            }
        }
        execution_tracing.emit_execution_trace(execution=execution)

        pending_span = next(
            s
            for s in span_exporter.get_finished_spans()
            if s.attributes.get("execution.status") == "PENDING"
        )
        assert pending_span.attributes["execution.resources.cpu"] == "4"
        assert pending_span.attributes["execution.resources.memory"] == "16Gi"

    def test_pending_span_carries_accelerators_when_present(
        self, span_exporter: InMemorySpanExporter
    ) -> None:
        """The accelerators annotation is copied verbatim onto the PENDING span."""
        from cloud_pipelines_backend.launchers import kubernetes_launchers

        execution = _make_execution(statuses=["QUEUED", "PENDING", "SUCCEEDED"])
        execution.task_spec = {
            "annotations": {
                kubernetes_launchers.RESOURCES_ACCELERATORS_ANNOTATION_KEY: '{"H100": 1}',
            }
        }
        execution_tracing.emit_execution_trace(execution=execution)

        pending_span = next(
            s
            for s in span_exporter.get_finished_spans()
            if s.attributes.get("execution.status") == "PENDING"
        )
        assert (
            pending_span.attributes["execution.resources.accelerators"] == '{"H100": 1}'
        )

    def test_non_pending_spans_have_no_resource_attrs(
        self, span_exporter: InMemorySpanExporter
    ) -> None:
        """No span gets resource attributes when the history has no PENDING entry."""
        from cloud_pipelines_backend.launchers import kubernetes_launchers

        execution = _make_execution(statuses=["QUEUED", "SUCCEEDED"])
        # Set every resource annotation so the assertions below cannot pass
        # vacuously: the attributes must be absent because no span is PENDING,
        # not because there was nothing to copy.
        execution.task_spec = {
            "annotations": {
                kubernetes_launchers.RESOURCES_CPU_ANNOTATION_KEY: "4",
                kubernetes_launchers.RESOURCES_MEMORY_ANNOTATION_KEY: "16Gi",
                kubernetes_launchers.RESOURCES_ACCELERATORS_ANNOTATION_KEY: '{"H100": 1}',
            }
        }
        execution_tracing.emit_execution_trace(execution=execution)

        resource_attr_names = (
            "execution.resources.cpu",
            "execution.resources.memory",
            "execution.resources.accelerators",
        )
        for span in span_exporter.get_finished_spans():
            span_attrs = span.attributes or {}
            for attr_name in resource_attr_names:
                assert attr_name not in span_attrs
Comment on lines +452 to +456
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be more thorough to set a task_spec with resource annotations here, so that the test proves PENDING is the only span that gets resource attributes.

        execution.task_spec = {

            "annotations": {
                kubernetes_launchers.RESOURCES_ACCELERATORS_ANNOTATION_KEY: '{"H100": 1}',
            }
        }
        execution_tracing.emit_execution_trace(execution=execution)