From 11b6bcd94997f36fea32fb63d6d8262294d219c2 Mon Sep 17 00:00:00 2001 From: Barrett Pyke Date: Fri, 12 Jun 2026 11:41:09 -0500 Subject: [PATCH] fix(experiments): pass base_experiment_id to summarize --- py/src/braintrust/framework.py | 5 ++- py/src/braintrust/logger.py | 6 +++ py/src/braintrust/test_framework.py | 58 ++++++++++++++++++++++++++++- 3 files changed, 67 insertions(+), 2 deletions(-) diff --git a/py/src/braintrust/framework.py b/py/src/braintrust/framework.py index 49eee975..a56a818a 100644 --- a/py/src/braintrust/framework.py +++ b/py/src/braintrust/framework.py @@ -1367,7 +1367,10 @@ async def run_evaluator( ) if experiment: - summary = experiment.summarize(summarize_scores=evaluator.summarize_scores) + summary = experiment.summarize( + summarize_scores=evaluator.summarize_scores, + comparison_experiment_id=evaluator.base_experiment_id, + ) else: summary = build_local_summary(evaluator, results) diff --git a/py/src/braintrust/logger.py b/py/src/braintrust/logger.py index 460f9232..dea4b504 100644 --- a/py/src/braintrust/logger.py +++ b/py/src/braintrust/logger.py @@ -3969,6 +3969,12 @@ def summarize( if base_experiment: comparison_experiment_id = base_experiment.id comparison_experiment_name = base_experiment.name + else: + try: + comparison_experiment = state.api_conn().get_json(f"v1/experiment/{comparison_experiment_id}") + comparison_experiment_name = comparison_experiment.get("name") + except Exception: + pass try: summary_items = state.api_conn().get_json( diff --git a/py/src/braintrust/test_framework.py b/py/src/braintrust/test_framework.py index 71168a38..4ac32c81 100644 --- a/py/src/braintrust/test_framework.py +++ b/py/src/braintrust/test_framework.py @@ -1,7 +1,7 @@ import importlib.util import re import sys -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest from braintrust.logger import BraintrustState @@ -78,6 +78,62 @@ def exact_match(input_value, output, expected): assert result.summary.scores["exact_match"].score == 1.0 +@pytest.mark.asyncio +async def test_run_evaluator_forwards_base_experiment_id_to_summary(with_memory_logger, with_simulate_login): + def exact_match(input_value, output, expected): + return 1.0 if output == expected else 0.0 + + evaluator = Evaluator( + project_name="test-project", + eval_name="test-evaluator", + data=[EvalCase(input=1, expected=1)], + task=lambda input_value: input_value, + scores=[exact_match], + experiment_name=None, + metadata=None, + base_experiment_id="base-exp-id", + ) + + exp = init_test_exp("test-evaluator", "test-project") + expected_summary = MagicMock() + exp.summarize = MagicMock(return_value=expected_summary) + + result = await run_evaluator(experiment=exp, evaluator=evaluator, position=None, filters=[]) + + assert result.summary is expected_summary + exp.summarize.assert_called_once_with( + summarize_scores=True, + comparison_experiment_id="base-exp-id", + ) + + +def test_experiment_summarize_resolves_explicit_comparison_name(with_memory_logger, with_simulate_login): + exp = init_test_exp("test-evaluator", "test-project") + mock_conn = MagicMock() + + def get_json(path, args=None): + if path == "v1/experiment/base-exp-id": + return {"name": "base-exp"} + if path == "experiment-comparison2": + return {"scores": {}, "metrics": {}} + raise AssertionError(f"Unexpected get_json call: {path}, {args}") + + mock_conn.get_json.side_effect = get_json + + with patch.object(exp.state, "api_conn", return_value=mock_conn): + summary = exp.summarize(comparison_experiment_id="base-exp-id") + + assert summary.comparison_experiment_name == "base-exp" + mock_conn.get_json.assert_any_call("v1/experiment/base-exp-id") + mock_conn.get_json.assert_any_call( + "experiment-comparison2", + args={ + "experiment_id": "test-evaluator", + "base_experiment_id": "base-exp-id", + }, + ) + + @pytest.mark.asyncio @pytest.mark.skipif(not HAS_PYDANTIC, reason="pydantic not installed") async def test_run_evaluator_exposes_validated_parameter_values_to_hooks():