ServiceNow · lindsaydbrin · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/src/eva/__init__.py b/src/eva/__init__.py
@@ -11,4 +11,4 @@
 
 # Bump metrics_version when changes affect metric computation (metrics code,
 # judge prompts, pricing tables, postprocessor).
-metrics_version = "2.0.0"
+metrics_version = "2.1.0"
diff --git a/src/eva/metrics/aggregation.py b/src/eva/metrics/aggregation.py
@@ -9,7 +9,10 @@
 from dataclasses import dataclass, field
 from typing import Literal
 
+import numpy as np
+
 from eva.models.results import RecordMetrics
+from eva.utils.bootstrap import BASE_SEED, assign_bootstrap_cis, bootstrap_ci
 from eva.utils.pass_at_k import (
     compute_pass_at_k,
     compute_pass_power_k,
@@ -83,6 +86,51 @@ class EVACompositeDefinition:
 ]
 
 
+def _scenario_means_for_metric(
+    all_metrics: dict[str, RecordMetrics],
+    metric_name: str,
+) -> np.ndarray:
+    """Average ``metric_name`` over trials, giving one value per scenario.
+
+    Records are bucketed by the base id that ``parse_trial_record_id`` returns,
+    and each bucket is averaged over its non-``None`` ``get_score`` results.
+    Scenarios with no usable trial are omitted; with none at all the result is
+    a zero-length float array.
+    """
+    per_scenario: dict[str, list[float]] = {}
+    for rid, metrics in all_metrics.items():
+        scenario_id, _trial = parse_trial_record_id(rid)
+        score = metrics.get_score(metric_name)
+        if score is not None:
+            per_scenario.setdefault(scenario_id, []).append(float(score))
+    # An empty ``per_scenario`` dict produces an empty list, hence an empty array.
+    scenario_means = [sum(scores) / len(scores) for scores in per_scenario.values()]
+    return np.array(scenario_means, dtype=float)
+
+
+def _scenario_values_for_composite(
+    all_metrics: dict[str, RecordMetrics],
+    comp: EVACompositeDefinition,
+) -> np.ndarray:
+    """Average the composite ``comp`` over trials, one value per scenario.
+
+    Each record's value is read from ``aggregate_metrics`` under ``comp.name``;
+    records are bucketed by the base id that ``parse_trial_record_id`` returns
+    and every bucket is averaged over its non-``None`` values. When the
+    per-trial value is 0/1 that average is the scenario pass rate. Scenarios
+    with no usable trial are omitted.
+    """
+    buckets: dict[str, list[float]] = {}
+    for rid, metrics in all_metrics.items():
+        scenario_id, _trial = parse_trial_record_id(rid)
+        composite_value = metrics.aggregate_metrics.get(comp.name)
+        if composite_value is not None:
+            buckets.setdefault(scenario_id, []).append(float(composite_value))
+    # An empty ``buckets`` dict yields a zero-length float array.
+    averaged = [sum(trial_vals) / len(trial_vals) for trial_vals in buckets.values()]
+    return np.array(averaged, dtype=float)
+
+
 def _check_threshold(value: float, operator: str, threshold: float) -> bool:
     """Check whether a value passes the given threshold comparison."""
     if operator == "==":
@@ -159,16 +207,19 @@ def compute_run_level_aggregates(
     all_metrics: dict[str, RecordMetrics],
     num_draws: int = 1,
     composites: list[EVACompositeDefinition] | None = None,
+    seed: int = BASE_SEED,
 ) -> dict:
     """Compute run-level aggregate scores from all records.
 
     Args:
         all_metrics: Dict mapping record ID to RecordMetrics (must have aggregate_metrics populated).
         num_draws: Number of draws (k) for pass@k computation.
         composites: Custom composite definitions. Defaults to EVA_COMPOSITES.
+        seed: Bootstrap seed for CI computation. Defaults to ``BASE_SEED``.
+            Production callers (the metrics runner) pass ``run_seed(run_dir.name)``.
 
     Returns:
-        Dict with per-composite statistics and optional pass@k data.
+        Dict with per-composite statistics, CI fields, and optional pass@k data.
     """
     composites = composites or EVA_COMPOSITES
 
@@ -206,11 +257,23 @@ def compute_run_level_aggregates(
             else:
                 entry["success_rate"] = round(sum(1 for v in values if v >= 0.5) / len(values), 4)
 
+        # Bootstrap CI on the per-scenario mean.
+        scenario_values = _scenario_values_for_composite(all_metrics, comp)
+        if len(scenario_values) == 0:
+            entry["mean_ci_lower"] = None
+            entry["mean_ci_upper"] = None
+            entry["mean_ci_n_scenarios"] = 0
+        else:
+            lower, upper = bootstrap_ci(scenario_values, seed=seed)
+            entry["mean_ci_lower"] = round(lower, 4)
+            entry["mean_ci_upper"] = round(upper, 4)
+            entry["mean_ci_n_scenarios"] = len(scenario_values)
+
         result[comp.name] = entry
 
     # pass_k for aggregate metrics if multi-trial
     if num_draws > 1:
-        pass_k_data = _compute_aggregate_pass_k(all_metrics, num_draws, composites)
+        pass_k_data = _compute_aggregate_pass_k(all_metrics, num_draws, composites, seed=seed)
         if pass_k_data:
             result["pass_k"] = pass_k_data
 
@@ -221,6 +284,7 @@ def _compute_aggregate_pass_k(
     all_metrics: dict[str, RecordMetrics],
     num_draws: int,
     composites: list[EVACompositeDefinition] | None = None,
+    seed: int = BASE_SEED,
 ) -> dict:
     """Compute pass@1, pass@k, pass^k (observed), and pass^k (theoretical) for aggregate metrics across trials."""
     composites = composites or EVA_COMPOSITES
@@ -264,13 +328,23 @@ def _compute_aggregate_pass_k(
 
         if pass_at_k_values:
             count = len(pass_at_k_values)
-            result[comp.name] = {
+            entry = {
                 "pass_at_1": round(sum(pass_at_1_values) / count, 4),
                 "pass_at_k": round(sum(pass_at_k_values) / count, 4),
                 "pass_power_k_observed": round(sum(pass_power_k_observed_values) / count, 4),
                 "pass_power_k_theoretical": round(sum(pass_power_k_theoretical_values) / count, 4),
                 "k": num_draws,
                 "count": count,
             }
+            assign_bootstrap_cis(
+                entry,
+                {
+                    "pass_at_1": pass_at_1_values,
+                    "pass_at_k": pass_at_k_values,
+                    "pass_power_k_observed": pass_power_k_observed_values,
+                },
+                seed=seed,
+            )
+            result[comp.name] = entry
 
     return result
diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py
@@ -10,7 +10,11 @@
 import yaml
 
 from eva.metrics.accuracy.agent_speech_fidelity_s2s import AgentSpeechFidelityS2SMetric
-from eva.metrics.aggregation import compute_record_aggregates, compute_run_level_aggregates
+from eva.metrics.aggregation import (
+    _scenario_means_for_metric,
+    compute_record_aggregates,
+    compute_run_level_aggregates,
+)
 from eva.metrics.base import BaseMetric, MetricContext
 from eva.metrics.legacy_aliases import rename_metric_keys
 from eva.metrics.processor import MetricsContextProcessor
@@ -20,6 +24,7 @@
 from eva.models.config import PipelineType, get_pipeline_type
 from eva.models.record import EvaluationRecord
 from eva.models.results import ConversationResult, MetricScore, PassAtKResult, RecordMetrics
+from eva.utils.bootstrap import BASE_SEED, assign_bootstrap_cis, bootstrap_ci, run_seed
 from eva.utils.hash_utils import get_dict_hash
 from eva.utils.logging import get_logger
 from eva.utils.pass_at_k import (
@@ -632,6 +637,7 @@ def _build_per_metric_aggregates(
         metric_names: list[str],
         pass_at_k_results: dict[str, dict[str, PassAtKResult]] | None = None,
         num_draws: int = 1,
+        seed: int = BASE_SEED,
     ) -> dict[str, dict[str, Any]]:
         """Build per-metric aggregate stats including pass_k.
 
@@ -640,6 +646,8 @@ def _build_per_metric_aggregates(
             metric_names: List of metric names to aggregate.
             pass_at_k_results: Per-record pass@k results (if multi-trial).
             num_draws: Number of draws (k) for pass@k.
+            seed: Bootstrap seed for CI computation. Defaults to ``BASE_SEED``;
+                production callers pass ``run_seed(run_dir.name)``.
 
         Returns:
             Dict mapping metric name to aggregate stats.
@@ -698,6 +706,18 @@ def _build_per_metric_aggregates(
                         coverage["not_applicable_turns"] = total_not_applicable_across_records
                     entry["per_turn_coverage"] = coverage
 
+                # Bootstrap CI on the per-scenario mean.
+                scenario_values = _scenario_means_for_metric(all_metrics, name)
+                if len(scenario_values) == 0:
+                    entry["mean_ci_lower"] = None
+                    entry["mean_ci_upper"] = None
+                    entry["mean_ci_n_scenarios"] = 0
+                else:
+                    lower, upper = bootstrap_ci(scenario_values, seed=seed)
+                    entry["mean_ci_lower"] = round(lower, 4)
+                    entry["mean_ci_upper"] = round(upper, 4)
+                    entry["mean_ci_n_scenarios"] = len(scenario_values)
+
                 entry["higher_is_better"] = _metric_higher_is_better(name)
                 metric_aggregates[name] = entry
 
@@ -720,14 +740,24 @@ def _build_per_metric_aggregates(
 
                 if pass_at_k_values:
                     count = len(pass_at_k_values)
-                    metric_aggregates[name]["pass_k"] = {
+                    pass_k_block: dict[str, Any] = {
                         "pass_at_1": round(sum(pass_at_1_values) / count, 4),
                         "pass_at_k": round(sum(pass_at_k_values) / count, 4),
                         "pass_power_k_observed": round(sum(pass_power_k_obs_values) / count, 4),
                         "pass_power_k_theoretical": round(sum(pass_power_k_theo_values) / count, 4),
                         "k": num_draws,
                         "count": count,
                     }
+                    assign_bootstrap_cis(
+                        pass_k_block,
+                        {
+                            "pass_at_1": pass_at_1_values,
+                            "pass_at_k": pass_at_k_values,
+                            "pass_power_k_observed": pass_power_k_obs_values,
+                        },
+                        seed=seed,
+                    )
+                    metric_aggregates[name]["pass_k"] = pass_k_block
 
         # Generic sub-metric aggregation.
         # Sub-keys are collected in first-seen insertion order so each metric controls
@@ -920,8 +950,13 @@ async def _save_summary(
         # Aggregate per_metric for ALL metrics present across records (not just those just run),
         # so that a partial re-run (e.g. --metrics response_speed) preserves other metrics.
         all_metric_names = sorted({name for rm in all_metrics.values() for name in rm.metrics})
+        seed = run_seed(self.run_dir.name)
         metric_aggregates = self._build_per_metric_aggregates(
-            all_metrics, all_metric_names, pass_at_k_results, self.num_draws
+            all_metrics,
+            all_metric_names,
+            pass_at_k_results,
+            self.num_draws,
+            seed=seed,
         )
 
         # Compute metric failures for MetricsRunResult (only for metrics just run)
@@ -934,7 +969,7 @@ async def _save_summary(
                         metric_failures.setdefault(name, []).append(record_id)
 
         # Compute EVA composite run-level aggregates
-        overall_scores = compute_run_level_aggregates(all_metrics, self.num_draws)
+        overall_scores = compute_run_level_aggregates(all_metrics, self.num_draws, seed=seed)
 
         # Load existing summary to preserve fields for metrics not being re-run
         summary_path = self.run_dir / "metrics_summary.json"
@@ -1038,12 +1073,17 @@ async def run_aggregate_only(cls, run_dir: Path, num_draws: int = 1) -> None:
         all_metric_names = sorted({name for rm in all_metrics.values() for name in rm.metrics})
 
         # Compute per-metric aggregates (including pass_k)
+        seed = run_seed(run_dir.name)
         metric_aggregates = cls._build_per_metric_aggregates(
-            all_metrics, all_metric_names, pass_at_k_results or None, num_draws
+            all_metrics,
+            all_metric_names,
+            pass_at_k_results or None,
+            num_draws,
+            seed=seed,
         )
 
         # Compute run-level aggregates
-        overall_scores = compute_run_level_aggregates(all_metrics, num_draws)
+        overall_scores = compute_run_level_aggregates(all_metrics, num_draws, seed=seed)
 
         # Update metrics_summary.json (preserve existing fields, replace computed sections)
         summary_path = run_dir / "metrics_summary.json"

diff --git a/src/eva/utils/bootstrap.py b/src/eva/utils/bootstrap.py
@@ -0,0 +1,74 @@
+"""Percentile bootstrap primitives for sample-mean confidence intervals.
+
+This module is pure: numpy in, numpy/floats out. It has no eva imports and
+is safe to use from anywhere in the package.
+"""
+
+from __future__ import annotations
+
+import hashlib
+from collections.abc import Sequence
+from typing import Any
+
+import numpy as np
+
+N_BOOT = 2000
+ALPHA = 0.05
+BASE_SEED = 42
+
+
+def run_seed(run_id: str) -> int:
+    """Derive a reproducible seed in ``[0, 2**31)`` from a run identifier.
+
+    SHA-256 is used instead of the built-in ``hash()``, whose string hashing
+    is salted per interpreter process and would let repeated invocations on
+    the same run produce slightly different CI bounds.
+    """
+    leading_bytes = hashlib.sha256(run_id.encode()).digest()[:4]
+    # Mask to 31 bits: same result as ``% (2**31)`` for a non-negative int.
+    return int.from_bytes(leading_bytes, "big") & 0x7FFFFFFF
+
+
+def bootstrap_resample(values: np.ndarray, n_boot: int, seed: int) -> np.ndarray:
+    """Return the means of ``n_boot`` with-replacement resamples of ``values``.
+
+    Index draws come from ``default_rng(seed)``; empty input gives an empty array.
+    """
+    sample = np.asarray(values, dtype=float)
+    n_obs = len(sample)
+    if n_obs == 0:
+        return np.array([], dtype=float)
+    draws = np.random.default_rng(seed).integers(0, n_obs, size=(n_boot, n_obs))
+    return sample[draws].mean(axis=1)
+
+
+def bootstrap_ci(
+    values: np.ndarray,
+    n_boot: int = N_BOOT,
+    seed: int = BASE_SEED,
+    alpha: float = ALPHA,
+) -> tuple[float, float]:
+    """Percentile bootstrap CI on the mean at level ``1 - alpha`` (95% by default).
+
+    Returns ``(lower, upper)``; ``(nan, nan)`` if there are no resampled means.
+    """
+    resampled_means = bootstrap_resample(values, n_boot=n_boot, seed=seed)
+    if len(resampled_means) == 0:
+        return float("nan"), float("nan")
+    tail_pcts = [100 * alpha / 2, 100 * (1 - alpha / 2)]
+    lo, hi = np.percentile(resampled_means, tail_pcts)
+    return float(lo), float(hi)
+
+
+def assign_bootstrap_cis(
+    target: dict[str, Any],
+    samples: dict[str, Sequence[float]],
+    *,
+    seed: int,
+    decimals: int = 4,
+) -> None:
+    """Write rounded ``{name}_ci_lower`` / ``{name}_ci_upper`` bootstrap bounds to ``target`` (``None`` if empty)."""
+    for name, sample in samples.items():
+        lower, upper = bootstrap_ci(sample, seed=seed) if len(sample) else (None, None)  # None, not NaN
+        target[f"{name}_ci_lower"] = None if lower is None else round(lower, decimals)
+        target[f"{name}_ci_upper"] = None if upper is None else round(upper, decimals)