From 7cbdb8a7b76944baf14a13780cd006a597bd50ca Mon Sep 17 00:00:00 2001 From: eldwin-easynet-world Date: Fri, 12 Jun 2026 07:47:21 -0700 Subject: [PATCH] Add custom trajectory helpers for BixBench --- README.md | 38 ++++++- bixbench/custom_trajectories.py | 161 ++++++++++++++++++++++++++++++ bixbench/postprocessing_utils.py | 4 +- tests/test_custom_trajectories.py | 62 ++++++++++++ 4 files changed, 262 insertions(+), 3 deletions(-) create mode 100644 bixbench/custom_trajectories.py create mode 100644 tests/test_custom_trajectories.py diff --git a/README.md b/README.md index e03b52e..63eb80e 100644 --- a/README.md +++ b/README.md @@ -143,7 +143,43 @@ Edit or create a new YAML file to modify: ### Using Your Own Agent -To use your own agent, use the `generate_trajectories.py` script by editing the [`custom_rollout`](https://github.com/Future-House/BixBench/blob/6c28217959d5d7dd6f48c59894534fced7c6c040/bixbench/generate_trajectories.py#L239) function to generate trajectories in the same format as the BixBench trajectories, then use the `postprocessing.py` script to evaluate your agent's performance. +To use your own agent, write BixBench-compatible trajectory JSON files and then +run the standard `postprocessing.py` script on the output directory. 
The helper +below validates the required fields before you spend credits on a full benchmark +run: + +```python +from bixbench.custom_trajectories import minimal_notebook, write_custom_trajectory + +record = { + "problem_id": "capsule-001", + "agent_answer": "The treatment increased expression by 2x.", + "ideal_answer": "The treatment increased expression by 2x.", + "problem": "Compare expression in control and treatment samples.", + "mcq_options": [], + "mcq_question": "Which conclusion is best supported?", + "notebook_stats": {"num_cells": 1}, + "num_actions": 3, + "question_format": "open", + "refusal_option": True, + "metadata": {"distractors": ["No change.", "Expression decreased."]}, + "refusal_options": None, + "nb": minimal_notebook("print('analysis complete')"), + "run_name": "my-custom-agent", +} + +write_custom_trajectory(record, "data/trajectories/my-custom-agent") +``` + +The writer stores files as `<problem_id>_replica_<replica_id>.json`, matching the +directory loader used by postprocessing. For open-answer trajectories, +`metadata["distractors"]` is required by default because postprocessing expands +open answers into multiple-choice grading rows unless MCQ expansion is disabled. + +You can still subclass the rollout path and implement +[`custom_rollout`](https://github.com/Future-House/BixBench/blob/6c28217959d5d7dd6f48c59894534fced7c6c040/bixbench/generate_trajectories.py#L239), +but external agents no longer need to edit core rollout code just to validate +and postprocess already-generated trajectories. 
"""Helpers for writing BixBench-compatible custom agent trajectories."""

from __future__ import annotations

import json
import operator
from collections.abc import Mapping
from pathlib import Path
from typing import Any


class CustomTrajectoryValidationError(ValueError):
    """Raised when a custom trajectory cannot be postprocessed by BixBench."""


# Top-level keys every record must contain; validate_custom_trajectory()
# reports any that are absent.
REQUIRED_TRAJECTORY_FIELDS = frozenset(
    {
        "problem_id",
        "agent_answer",
        "ideal_answer",
        "problem",
        "mcq_options",
        "mcq_question",
        "notebook_stats",
        "num_actions",
        "question_format",
        "refusal_option",
        "metadata",
        "refusal_options",
        "nb",
        "run_name",
    }
)

# Accepted ``question_format`` values.  Kept as a tuple rather than a set so an
# unhashable value (e.g. a list) simply fails the membership test and gets the
# validation error instead of a bare TypeError from hashing.
_QUESTION_FORMATS = ("open", "mcq", "hypothesis")

_MCQ_QUESTION_ERROR = "mcq_question must be a string or a non-empty list of strings"


def minimal_notebook(source: str = "", language: str = "python") -> dict[str, Any]:
    """Build a notebook dict (nbformat 4.5) holding a single code cell.

    Args:
        source: Text stored as the ``source`` of the only cell.
        language: Value used for the kernelspec ``display_name``, ``language``
            and ``name`` entries and for ``language_info.name``.

    Returns:
        A new dict with ``cells``, ``metadata``, ``nbformat`` and
        ``nbformat_minor`` keys; the cell has no outputs and no execution count.
    """
    return {
        "cells": [
            {
                "cell_type": "code",
                "execution_count": None,
                "metadata": {},
                "outputs": [],
                "source": source,
            }
        ],
        "metadata": {
            "kernelspec": {
                "display_name": language,
                "language": language,
                "name": language,
            },
            "language_info": {"name": language},
        },
        "nbformat": 4,
        "nbformat_minor": 5,
    }


def validate_custom_trajectory(
    record: Mapping[str, Any], *, require_open_distractors: bool = True
) -> dict[str, Any]:
    """Validate a custom trajectory record and return it as a plain dict.

    Args:
        record: Mapping that must contain every key in
            ``REQUIRED_TRAJECTORY_FIELDS``. Extra keys are kept untouched.
        require_open_distractors: When true (the default), a record whose
            ``question_format`` is ``"open"`` must carry a non-empty list under
            ``metadata["distractors"]``.

    Returns:
        A shallow ``dict`` copy of ``record``; nested values are shared with
        the input, not copied.

    Raises:
        CustomTrajectoryValidationError: If a required field is missing or any
            field fails its check. The first failing check wins.
    """
    missing = sorted(REQUIRED_TRAJECTORY_FIELDS.difference(record))
    if missing:
        raise CustomTrajectoryValidationError(
            f"Custom trajectory is missing required fields: {', '.join(missing)}"
        )

    normalized = dict(record)
    for field in ("problem_id", "run_name", "problem"):
        _require_non_empty_string(normalized, field)

    question_format = normalized["question_format"]
    if question_format not in _QUESTION_FORMATS:
        raise CustomTrajectoryValidationError(
            "question_format must be one of: open, mcq, hypothesis"
        )

    if not normalized["agent_answer"]:
        raise CustomTrajectoryValidationError(
            "agent_answer must be truthy; postprocessing drops empty answers"
        )
    if not normalized["ideal_answer"]:
        raise CustomTrajectoryValidationError("ideal_answer must be truthy")

    _require_mcq_question(normalized["mcq_question"])

    if not isinstance(normalized["mcq_options"], list):
        raise CustomTrajectoryValidationError("mcq_options must be a list")

    for field in ("metadata", "notebook_stats", "nb"):
        _require_mapping(normalized, field)

    if not isinstance(normalized["nb"].get("cells"), list):
        raise CustomTrajectoryValidationError("nb must contain a cells list")

    num_actions = normalized["num_actions"]
    # bool is a subclass of int, so it has to be excluded explicitly; otherwise
    # ``True`` would be accepted as an action count.
    if (
        isinstance(num_actions, bool)
        or not isinstance(num_actions, int)
        or num_actions < 0
    ):
        raise CustomTrajectoryValidationError("num_actions must be a non-negative int")

    if require_open_distractors and question_format == "open":
        distractors = normalized["metadata"].get("distractors")
        if not isinstance(distractors, list) or not distractors:
            raise CustomTrajectoryValidationError(
                "open trajectories need metadata['distractors'] for MCQ expansion"
            )

    return normalized


def write_custom_trajectory(
    record: Mapping[str, Any],
    output_dir: str | Path,
    *,
    replica_id: int = 0,
    require_open_distractors: bool = True,
) -> Path:
    """Validate one trajectory and write it as a JSON file under ``output_dir``.

    The file is named ``<problem_id>_replica_<replica_id>.json``. Missing
    parent directories are created. The record is serialized before anything
    is created on disk, so a record that cannot be encoded as JSON leaves the
    filesystem untouched.

    Args:
        record: Trajectory record, checked with ``validate_custom_trajectory``.
        output_dir: Directory that receives the file.
        replica_id: Non-negative integer embedded in the filename.
        require_open_distractors: Forwarded to ``validate_custom_trajectory``.

    Returns:
        Path of the written file.

    Raises:
        CustomTrajectoryValidationError: If ``replica_id`` is negative or not
            an integer, if the record is invalid, or if ``problem_id`` would
            place the file outside ``output_dir``.
        TypeError: Propagated from ``json.dumps`` when the record holds values
            that are not JSON serializable.
    """
    try:
        # Accepts any integer-like value but rejects floats and strings, which
        # would otherwise yield names such as "replica_1.5" that a
        # ``replica_(\d+)`` pattern reads back as replica 1.
        replica_index = operator.index(replica_id)
    except TypeError as err:
        raise CustomTrajectoryValidationError("replica_id must be an integer") from err
    if replica_index < 0:
        raise CustomTrajectoryValidationError("replica_id must be non-negative")

    validated = validate_custom_trajectory(
        record, require_open_distractors=require_open_distractors
    )

    relative_path = Path(f"{validated['problem_id']}_replica_{replica_index}.json")
    # An anchored (absolute) name would replace output_dir when joined, and a
    # ".." component would climb out of it; nested relative names still work.
    if relative_path.anchor or ".." in relative_path.parts:
        raise CustomTrajectoryValidationError(
            "problem_id must not point outside the output directory"
        )

    payload = json.dumps(validated, indent=2) + "\n"
    output_path = Path(output_dir) / relative_path
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(payload, encoding="utf-8")
    return output_path


def _require_non_empty_string(record: Mapping[str, Any], field: str) -> None:
    """Raise unless ``record[field]`` is a string with non-whitespace content."""
    if not isinstance(record[field], str) or not record[field].strip():
        raise CustomTrajectoryValidationError(f"{field} must be a non-empty string")


def _require_mapping(record: Mapping[str, Any], field: str) -> None:
    """Raise unless ``record[field]`` is a mapping."""
    if not isinstance(record[field], Mapping):
        raise CustomTrajectoryValidationError(f"{field} must be an object")


def _require_mcq_question(value: Any) -> None:
    """Raise unless ``value`` is a string or a non-empty list of strings."""
    if isinstance(value, str):
        return
    if (
        isinstance(value, list)
        and value
        and all(isinstance(item, str) for item in value)
    ):
        return
    raise CustomTrajectoryValidationError(_MCQ_QUESTION_ERROR)
import json

import pytest

from bixbench.custom_trajectories import (
    CustomTrajectoryValidationError,
    minimal_notebook,
    validate_custom_trajectory,
    write_custom_trajectory,
)
from bixbench.postprocessing_utils import load_dataframe_from_json_directory

# Source text placed in the single notebook cell of the sample record.
_NOTEBOOK_SOURCE = "print('analysis complete')"


def valid_open_record():
    """Return a fresh open-format record that carries every required field."""
    return dict(
        problem_id="capsule-001",
        agent_answer="The treatment increased expression by 2x.",
        ideal_answer="The treatment increased expression by 2x.",
        problem="Compare expression in control and treatment samples.",
        mcq_options=[],
        mcq_question="Which conclusion is best supported?",
        notebook_stats={"num_cells": 1},
        num_actions=3,
        question_format="open",
        refusal_option=True,
        metadata={"distractors": ["No change.", "Expression decreased."]},
        refusal_options=None,
        nb=minimal_notebook(_NOTEBOOK_SOURCE),
        run_name="custom-agent-smoke",
    )


def test_validate_custom_open_trajectory():
    """A complete open record validates and keeps its notebook source."""
    validated = validate_custom_trajectory(valid_open_record())

    first_cell = validated["nb"]["cells"][0]
    assert validated["problem_id"] == "capsule-001"
    assert first_cell["source"] == "print('analysis complete')"


def test_validate_custom_open_trajectory_requires_distractors():
    """An open record with empty metadata is rejected for lacking distractors."""
    without_distractors = {**valid_open_record(), "metadata": {}}

    with pytest.raises(CustomTrajectoryValidationError, match="distractors"):
        validate_custom_trajectory(without_distractors)


def test_write_custom_trajectory_uses_replica_filename(tmp_path):
    """The default replica id of 0 appears in the written filename."""
    written = write_custom_trajectory(valid_open_record(), tmp_path)
    stored = json.loads(written.read_text())

    assert written.name == "capsule-001_replica_0.json"
    assert stored["run_name"] == "custom-agent-smoke"


def test_directory_loader_accepts_custom_file_without_replica(tmp_path):
    """A file whose name has no replica suffix loads with replica 0."""
    target = tmp_path / "capsule-001.json"
    target.write_text(json.dumps(valid_open_record()), encoding="utf-8")

    frame = load_dataframe_from_json_directory(str(tmp_path))

    assert frame.loc[0, "replica"] == 0
    assert frame.loc[0, "problem_id"] == "capsule-001"