Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/eva/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@

# Bump metrics_version when changes affect metric computation (metrics code,
# judge prompts, pricing tables, postprocessor).
metrics_version = "2.0.0"
metrics_version = "2.1.0"
80 changes: 77 additions & 3 deletions src/eva/metrics/aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@
from dataclasses import dataclass, field
from typing import Literal

import numpy as np

from eva.models.results import RecordMetrics
from eva.utils.bootstrap import BASE_SEED, assign_bootstrap_cis, bootstrap_ci
from eva.utils.pass_at_k import (
compute_pass_at_k,
compute_pass_power_k,
Expand Down Expand Up @@ -83,6 +86,51 @@ class EVACompositeDefinition:
]


def _scenario_means_for_metric(
all_metrics: dict[str, RecordMetrics],
metric_name: str,
) -> np.ndarray:
"""Collapse trials → one value per scenario for a single metric.

Per-scenario value = mean over trials of ``normalized_score`` (falling back
to ``score``). Scenarios where all trials are missing/errored are dropped.
For k=1 runs each record is its own scenario.
"""
grouped: dict[str, list[float]] = {}
for record_id, record_metrics in all_metrics.items():
base_id, _ = parse_trial_record_id(record_id)
val = record_metrics.get_score(metric_name)
if val is None:
continue
grouped.setdefault(base_id, []).append(float(val))
if not grouped:
return np.array([], dtype=float)
return np.array([sum(vs) / len(vs) for vs in grouped.values()], dtype=float)


def _scenario_values_for_composite(
all_metrics: dict[str, RecordMetrics],
comp: EVACompositeDefinition,
) -> np.ndarray:
"""Collapse trials → one value per scenario for a composite.

Per-scenario value = mean over trials of the per-trial composite value
stored in ``aggregate_metrics``. For pass/derived composites this is the
scenario pass rate. Scenarios where all trials have ``None`` for this
composite are dropped.
"""
grouped: dict[str, list[float]] = {}
for record_id, record_metrics in all_metrics.items():
base_id, _ = parse_trial_record_id(record_id)
val = record_metrics.aggregate_metrics.get(comp.name)
if val is None:
continue
grouped.setdefault(base_id, []).append(float(val))
if not grouped:
return np.array([], dtype=float)
return np.array([sum(vs) / len(vs) for vs in grouped.values()], dtype=float)


def _check_threshold(value: float, operator: str, threshold: float) -> bool:
"""Check whether a value passes the given threshold comparison."""
if operator == "==":
Expand Down Expand Up @@ -159,16 +207,19 @@ def compute_run_level_aggregates(
all_metrics: dict[str, RecordMetrics],
num_draws: int = 1,
composites: list[EVACompositeDefinition] | None = None,
seed: int = BASE_SEED,
) -> dict:
"""Compute run-level aggregate scores from all records.

Args:
all_metrics: Dict mapping record ID to RecordMetrics (must have aggregate_metrics populated).
num_draws: Number of draws (k) for pass@k computation.
composites: Custom composite definitions. Defaults to EVA_COMPOSITES.
seed: Bootstrap seed for CI computation. Defaults to ``BASE_SEED``.
Production callers (the metrics runner) pass ``run_seed(run_dir.name)``.

Returns:
Dict with per-composite statistics and optional pass@k data.
Dict with per-composite statistics, CI fields, and optional pass@k data.
"""
composites = composites or EVA_COMPOSITES

Expand Down Expand Up @@ -206,11 +257,23 @@ def compute_run_level_aggregates(
else:
entry["success_rate"] = round(sum(1 for v in values if v >= 0.5) / len(values), 4)

# Bootstrap CI on the per-scenario mean.
scenario_values = _scenario_values_for_composite(all_metrics, comp)
if len(scenario_values) == 0:
entry["mean_ci_lower"] = None
entry["mean_ci_upper"] = None
entry["mean_ci_n_scenarios"] = 0
else:
lower, upper = bootstrap_ci(scenario_values, seed=seed)
entry["mean_ci_lower"] = round(lower, 4)
entry["mean_ci_upper"] = round(upper, 4)
entry["mean_ci_n_scenarios"] = len(scenario_values)

result[comp.name] = entry

# pass_k for aggregate metrics if multi-trial
if num_draws > 1:
pass_k_data = _compute_aggregate_pass_k(all_metrics, num_draws, composites)
pass_k_data = _compute_aggregate_pass_k(all_metrics, num_draws, composites, seed=seed)
if pass_k_data:
result["pass_k"] = pass_k_data

Expand All @@ -221,6 +284,7 @@ def _compute_aggregate_pass_k(
all_metrics: dict[str, RecordMetrics],
num_draws: int,
composites: list[EVACompositeDefinition] | None = None,
seed: int = BASE_SEED,
) -> dict:
"""Compute pass@1, pass@k, pass^k (observed), and pass^k (theoretical) for aggregate metrics across trials."""
composites = composites or EVA_COMPOSITES
Expand Down Expand Up @@ -264,13 +328,23 @@ def _compute_aggregate_pass_k(

if pass_at_k_values:
count = len(pass_at_k_values)
result[comp.name] = {
entry = {
"pass_at_1": round(sum(pass_at_1_values) / count, 4),
"pass_at_k": round(sum(pass_at_k_values) / count, 4),
"pass_power_k_observed": round(sum(pass_power_k_observed_values) / count, 4),
"pass_power_k_theoretical": round(sum(pass_power_k_theoretical_values) / count, 4),
"k": num_draws,
"count": count,
}
assign_bootstrap_cis(
entry,
{
"pass_at_1": pass_at_1_values,
"pass_at_k": pass_at_k_values,
"pass_power_k_observed": pass_power_k_observed_values,
},
seed=seed,
)
result[comp.name] = entry

return result
52 changes: 46 additions & 6 deletions src/eva/metrics/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@
import yaml

from eva.metrics.accuracy.agent_speech_fidelity_s2s import AgentSpeechFidelityS2SMetric
from eva.metrics.aggregation import compute_record_aggregates, compute_run_level_aggregates
from eva.metrics.aggregation import (
_scenario_means_for_metric,
compute_record_aggregates,
compute_run_level_aggregates,
)
from eva.metrics.base import BaseMetric, MetricContext
from eva.metrics.legacy_aliases import rename_metric_keys
from eva.metrics.processor import MetricsContextProcessor
Expand All @@ -20,6 +24,7 @@
from eva.models.config import PipelineType, get_pipeline_type
from eva.models.record import EvaluationRecord
from eva.models.results import ConversationResult, MetricScore, PassAtKResult, RecordMetrics
from eva.utils.bootstrap import BASE_SEED, assign_bootstrap_cis, bootstrap_ci, run_seed
from eva.utils.hash_utils import get_dict_hash
from eva.utils.logging import get_logger
from eva.utils.pass_at_k import (
Expand Down Expand Up @@ -632,6 +637,7 @@ def _build_per_metric_aggregates(
metric_names: list[str],
pass_at_k_results: dict[str, dict[str, PassAtKResult]] | None = None,
num_draws: int = 1,
seed: int = BASE_SEED,
) -> dict[str, dict[str, Any]]:
"""Build per-metric aggregate stats including pass_k.

Expand All @@ -640,6 +646,8 @@ def _build_per_metric_aggregates(
metric_names: List of metric names to aggregate.
pass_at_k_results: Per-record pass@k results (if multi-trial).
num_draws: Number of draws (k) for pass@k.
seed: Bootstrap seed for CI computation. Defaults to ``BASE_SEED``;
production callers pass ``run_seed(run_dir.name)``.

Returns:
Dict mapping metric name to aggregate stats.
Expand Down Expand Up @@ -698,6 +706,18 @@ def _build_per_metric_aggregates(
coverage["not_applicable_turns"] = total_not_applicable_across_records
entry["per_turn_coverage"] = coverage

# Bootstrap CI on the per-scenario mean.
scenario_values = _scenario_means_for_metric(all_metrics, name)
if len(scenario_values) == 0:
entry["mean_ci_lower"] = None
entry["mean_ci_upper"] = None
entry["mean_ci_n_scenarios"] = 0
else:
lower, upper = bootstrap_ci(scenario_values, seed=seed)
entry["mean_ci_lower"] = round(lower, 4)
entry["mean_ci_upper"] = round(upper, 4)
entry["mean_ci_n_scenarios"] = len(scenario_values)

entry["higher_is_better"] = _metric_higher_is_better(name)
metric_aggregates[name] = entry

Expand All @@ -720,14 +740,24 @@ def _build_per_metric_aggregates(

if pass_at_k_values:
count = len(pass_at_k_values)
metric_aggregates[name]["pass_k"] = {
pass_k_block: dict[str, Any] = {
"pass_at_1": round(sum(pass_at_1_values) / count, 4),
"pass_at_k": round(sum(pass_at_k_values) / count, 4),
"pass_power_k_observed": round(sum(pass_power_k_obs_values) / count, 4),
"pass_power_k_theoretical": round(sum(pass_power_k_theo_values) / count, 4),
"k": num_draws,
"count": count,
}
assign_bootstrap_cis(
pass_k_block,
{
"pass_at_1": pass_at_1_values,
"pass_at_k": pass_at_k_values,
"pass_power_k_observed": pass_power_k_obs_values,
},
seed=seed,
)
metric_aggregates[name]["pass_k"] = pass_k_block

# Generic sub-metric aggregation.
# Sub-keys are collected in first-seen insertion order so each metric controls
Expand Down Expand Up @@ -920,8 +950,13 @@ async def _save_summary(
# Aggregate per_metric for ALL metrics present across records (not just those just run),
# so that a partial re-run (e.g. --metrics response_speed) preserves other metrics.
all_metric_names = sorted({name for rm in all_metrics.values() for name in rm.metrics})
seed = run_seed(self.run_dir.name)
metric_aggregates = self._build_per_metric_aggregates(
all_metrics, all_metric_names, pass_at_k_results, self.num_draws
all_metrics,
all_metric_names,
pass_at_k_results,
self.num_draws,
seed=seed,
)

# Compute metric failures for MetricsRunResult (only for metrics just run)
Expand All @@ -934,7 +969,7 @@ async def _save_summary(
metric_failures.setdefault(name, []).append(record_id)

# Compute EVA composite run-level aggregates
overall_scores = compute_run_level_aggregates(all_metrics, self.num_draws)
overall_scores = compute_run_level_aggregates(all_metrics, self.num_draws, seed=seed)

# Load existing summary to preserve fields for metrics not being re-run
summary_path = self.run_dir / "metrics_summary.json"
Expand Down Expand Up @@ -1038,12 +1073,17 @@ async def run_aggregate_only(cls, run_dir: Path, num_draws: int = 1) -> None:
all_metric_names = sorted({name for rm in all_metrics.values() for name in rm.metrics})

# Compute per-metric aggregates (including pass_k)
seed = run_seed(run_dir.name)
metric_aggregates = cls._build_per_metric_aggregates(
all_metrics, all_metric_names, pass_at_k_results or None, num_draws
all_metrics,
all_metric_names,
pass_at_k_results or None,
num_draws,
seed=seed,
)

# Compute run-level aggregates
overall_scores = compute_run_level_aggregates(all_metrics, num_draws)
overall_scores = compute_run_level_aggregates(all_metrics, num_draws, seed=seed)

# Update metrics_summary.json (preserve existing fields, replace computed sections)
summary_path = run_dir / "metrics_summary.json"
Expand Down
74 changes: 74 additions & 0 deletions src/eva/utils/bootstrap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""Percentile bootstrap primitives for sample-mean confidence intervals.

This module is pure: numpy in, numpy/floats out. It has no eva imports and
is safe to use from anywhere in the package.
"""

from __future__ import annotations

import hashlib
from collections.abc import Sequence
from typing import Any

import numpy as np

# Default number of bootstrap resamples (the ``n_boot`` default of ``bootstrap_ci``).
N_BOOT = 2000
# Default two-sided significance level (the ``alpha`` default of ``bootstrap_ci``);
# 0.05 corresponds to a 95% interval.
ALPHA = 0.05
# Default RNG seed used when a caller does not supply its own.
BASE_SEED = 42


def run_seed(run_id: str) -> int:
    """Derive a reproducible bootstrap seed from a run identifier.

    The seed is taken from a SHA-256 digest instead of the built-in ``hash()``
    because ``hash()`` on strings is salted per interpreter process; a salted
    seed would make repeated invocations on the same run produce slightly
    different CI bounds. The digest is byte-stable across processes.

    Args:
        run_id: Run identifier (callers pass the run directory name).

    Returns:
        An integer in ``[0, 2**31)`` that depends only on ``run_id``.
    """
    digest = hashlib.sha256(run_id.encode()).digest()
    # Read the first four digest bytes as a big-endian unsigned integer, then
    # keep the low 31 bits (same as reducing modulo 2**31 for a non-negative int).
    prefix = int.from_bytes(digest[:4], byteorder="big")
    return prefix & 0x7FFFFFFF


def bootstrap_resample(values: np.ndarray, n_boot: int, seed: int) -> np.ndarray:
    """Draw ``n_boot`` bootstrap resamples of ``values`` and return their means.

    Each resample has the same length as ``values`` and is drawn with
    replacement using a generator seeded with ``seed``, so the output is
    reproducible for a fixed seed.

    Args:
        values: Observations to resample; converted to a float array.
        n_boot: Number of resamples to draw.
        seed: Seed for ``np.random.default_rng``.

    Returns:
        Array holding the mean of each resample; a zero-length float array
        when ``values`` is empty.
    """
    sample = np.asarray(values, dtype=float)
    n_obs = len(sample)
    if n_obs == 0:
        return np.array([], dtype=float)

    generator = np.random.default_rng(seed)
    # One row of indices per resample; every row picks n_obs positions with replacement.
    draws = generator.integers(0, n_obs, size=(n_boot, n_obs))
    resampled = sample[draws]
    return resampled.mean(axis=1)


def bootstrap_ci(
    values: np.ndarray,
    n_boot: int = N_BOOT,
    seed: int = BASE_SEED,
    alpha: float = ALPHA,
) -> tuple[float, float]:
    """Percentile bootstrap confidence interval for the mean of ``values``.

    The interval spans the ``alpha / 2`` and ``1 - alpha / 2`` quantiles of
    the resampled means produced by ``bootstrap_resample``; with the default
    ``alpha`` of 0.05 this is a 95% interval.

    Args:
        values: Observations whose mean is being bounded.
        n_boot: Number of bootstrap resamples.
        seed: Seed forwarded to ``bootstrap_resample``.
        alpha: Two-sided significance level.

    Returns:
        ``(lower, upper)`` as plain floats; ``(nan, nan)`` when the resampler
        yields no means (e.g. empty input).
    """
    resampled_means = bootstrap_resample(values, n_boot=n_boot, seed=seed)
    if len(resampled_means) == 0:
        nan = float("nan")
        return nan, nan

    # Both tail quantiles are evaluated in a single percentile call.
    tail_percentiles = [100 * alpha / 2, 100 * (1 - alpha / 2)]
    lower, upper = np.percentile(resampled_means, tail_percentiles)
    return float(lower), float(upper)


def assign_bootstrap_cis(
    target: dict[str, Any],
    samples: dict[str, Sequence[float]],
    *,
    seed: int,
    decimals: int = 4,
) -> None:
    """Bootstrap each named sample and store its CI bounds in ``target``.

    For every ``(name, sample)`` pair, ``{name}_ci_lower`` and
    ``{name}_ci_upper`` are written into ``target`` in place.

    An empty sample has no defined interval. ``bootstrap_ci`` reports that
    case as ``(nan, nan)``, but NaN is not representable in strict JSON, so
    both bounds are stored as ``None`` instead.
    NOTE(review): this assumes ``target`` ends up in a JSON summary and that
    consumers expect ``None`` for a missing CI — confirm against the callers.

    Args:
        target: Dict that receives the CI fields; mutated in place.
        samples: Mapping from statistic name to its per-scenario values.
        seed: Seed forwarded to ``bootstrap_ci`` (the same seed is used for
            every sample).
        decimals: Number of decimal places the bounds are rounded to.

    Returns:
        None. Results are written into ``target``.
    """
    for name, sample in samples.items():
        if len(sample) == 0:
            # Nothing to resample: record "no interval" rather than NaN.
            lower = upper = None
        else:
            raw_lower, raw_upper = bootstrap_ci(sample, seed=seed)
            lower = round(raw_lower, decimals)
            upper = round(raw_upper, decimals)
        target[f"{name}_ci_lower"] = lower
        target[f"{name}_ci_upper"] = upper
Loading
Loading