From 7cbdb8a7b76944baf14a13780cd006a597bd50ca Mon Sep 17 00:00:00 2001
From: eldwin-easynet-world <eldwin.easynet.world@gmail.com>
Date: Fri, 12 Jun 2026 07:47:21 -0700
Subject: [PATCH] Add custom trajectory helpers for BixBench

---
 README.md                         |  38 ++++++-
 bixbench/custom_trajectories.py   | 161 ++++++++++++++++++++++++++++++
 bixbench/postprocessing_utils.py  |   4 +-
 tests/test_custom_trajectories.py |  62 ++++++++++++
 4 files changed, 262 insertions(+), 3 deletions(-)
 create mode 100644 bixbench/custom_trajectories.py
 create mode 100644 tests/test_custom_trajectories.py
diff --git a/README.md b/README.md
index e03b52e..63eb80e 100644
--- a/README.md
+++ b/README.md
@@ -143,7 +143,43 @@ Edit or create a new YAML file to modify:
 
 ### Using Your Own Agent
 
-To use your own agent, use the `generate_trajectories.py` script by editing the [`custom_rollout`](https://github.com/Future-House/BixBench/blob/6c28217959d5d7dd6f48c59894534fced7c6c040/bixbench/generate_trajectories.py#L239) function to generate trajectories in the same format as the BixBench trajectories, then use the `postprocessing.py` script to evaluate your agent's performance.
+To use your own agent, write BixBench-compatible trajectory JSON files and then
+run the standard `postprocessing.py` script on the output directory. The helper
+below validates the required fields before you spend credits on a full benchmark
+run:
+
+```python
+from bixbench.custom_trajectories import minimal_notebook, write_custom_trajectory
+
+record = {
+    "problem_id": "capsule-001",
+    "agent_answer": "The treatment increased expression by 2x.",
+    "ideal_answer": "The treatment increased expression by 2x.",
+    "problem": "Compare expression in control and treatment samples.",
+    "mcq_options": [],
+    "mcq_question": "Which conclusion is best supported?",
+    "notebook_stats": {"num_cells": 1},
+    "num_actions": 3,
+    "question_format": "open",
+    "refusal_option": True,
+    "metadata": {"distractors": ["No change.", "Expression decreased."]},
+    "refusal_options": None,
+    "nb": minimal_notebook("print('analysis complete')"),
+    "run_name": "my-custom-agent",
+}
+
+write_custom_trajectory(record, "data/trajectories/my-custom-agent")
+```
+
+The writer stores files as `<problem_id>_replica_<id>.json`, matching the
+directory loader used by postprocessing. For open-answer trajectories,
+`metadata["distractors"]` is required by default because postprocessing expands
+open answers into multiple-choice grading rows unless MCQ expansion is disabled.
+
+You can still edit the
+[`custom_rollout`](https://github.com/Future-House/BixBench/blob/6c28217959d5d7dd6f48c59894534fced7c6c040/bixbench/generate_trajectories.py#L239) function,
+but external agents no longer need to edit core rollout code just to validate
+and postprocess already-generated trajectories.
 
 ### Evaluate trajectories
 
diff --git a/bixbench/custom_trajectories.py b/bixbench/custom_trajectories.py
new file mode 100644
index 0000000..544fcb8
--- /dev/null
+++ b/bixbench/custom_trajectories.py
@@ -0,0 +1,161 @@
+"""Helpers for writing BixBench-compatible custom agent trajectories."""
+
+from __future__ import annotations
+
+import json
+from collections.abc import Mapping
+from pathlib import Path
+from typing import Any
+
+
+class CustomTrajectoryValidationError(ValueError):
+    """Validation failure for a custom trajectory; catchable as ``ValueError``."""
+
+
+REQUIRED_TRAJECTORY_FIELDS = frozenset(
+    (  # top-level keys every trajectory record must define (alphabetical)
+        "agent_answer",
+        "ideal_answer",
+        "mcq_options",
+        "mcq_question",
+        "metadata",
+        "nb",
+        "notebook_stats",
+        "num_actions",
+        "problem",
+        "problem_id",
+        "question_format",
+        "refusal_option",
+        "refusal_options",
+        "run_name",
+    )
+)
+
+
+def minimal_notebook(source: str = "", language: str = "python") -> dict[str, Any]:
+    """Build a one-cell nbformat 4.5 notebook dict whose code cell is ``source``."""
+    code_cell = {
+        "cell_type": "code",
+        "execution_count": None,
+        "metadata": {},
+        "outputs": [],
+        "source": source,
+    }
+    kernelspec = {
+        "display_name": language,
+        "language": language,
+        "name": language,
+    }
+    return {
+        "cells": [code_cell],
+        "metadata": {
+            "kernelspec": kernelspec,
+            "language_info": {"name": language},
+        },
+        "nbformat": 4,
+        "nbformat_minor": 5,
+    }
+
+
+def validate_custom_trajectory(
+    record: Mapping[str, Any], *, require_open_distractors: bool = True
+) -> dict[str, Any]:
+    """Check a custom trajectory and return it as a shallow ``dict`` copy.
+
+    Raises ``CustomTrajectoryValidationError`` when a required field is missing
+    or malformed; ``require_open_distractors`` gates the distractor check.
+    """
+    missing = sorted(REQUIRED_TRAJECTORY_FIELDS.difference(record))
+    if missing:
+        raise CustomTrajectoryValidationError(
+            f"Custom trajectory is missing required fields: {', '.join(missing)}"
+        )
+
+    normalized = dict(record)
+    for field in ("problem_id", "run_name", "problem"):
+        _require_non_empty_string(normalized, field)
+
+    question_format = normalized["question_format"]
+    if question_format not in {"open", "mcq", "hypothesis"}:
+        raise CustomTrajectoryValidationError(
+            "question_format must be one of: open, mcq, hypothesis"
+        )
+
+    if not normalized["agent_answer"]:
+        raise CustomTrajectoryValidationError(
+            "agent_answer must be truthy; postprocessing drops empty answers"
+        )
+    if not normalized["ideal_answer"]:
+        raise CustomTrajectoryValidationError("ideal_answer must be truthy")
+
+    mcq_question = normalized["mcq_question"]
+    if isinstance(mcq_question, list):
+        valid_question = bool(mcq_question) and all(
+            isinstance(item, str) for item in mcq_question
+        )
+    else:
+        valid_question = isinstance(mcq_question, str)
+    if not valid_question:
+        raise CustomTrajectoryValidationError(
+            "mcq_question must be a string or a non-empty list of strings"
+        )
+
+    if not isinstance(normalized["mcq_options"], list):
+        raise CustomTrajectoryValidationError("mcq_options must be a list")
+
+    for field in ("metadata", "notebook_stats", "nb"):
+        _require_mapping(normalized, field)
+    if not isinstance(normalized["nb"].get("cells"), list):
+        raise CustomTrajectoryValidationError("nb must contain a cells list")
+
+    num_actions = normalized["num_actions"]
+    # bool is an int subclass, so True/False must be rejected explicitly.
+    is_count = isinstance(num_actions, int) and not isinstance(num_actions, bool)
+    if not is_count or num_actions < 0:
+        raise CustomTrajectoryValidationError("num_actions must be a non-negative int")
+
+    if require_open_distractors and question_format == "open":
+        distractors = normalized["metadata"].get("distractors")
+        if not isinstance(distractors, list) or not distractors:
+            raise CustomTrajectoryValidationError(
+                "open trajectories need metadata['distractors'] for MCQ expansion"
+            )
+
+    return normalized
+
+
+def write_custom_trajectory(
+    record: Mapping[str, Any],
+    output_dir: str | Path,
+    *,
+    replica_id: int = 0,
+    require_open_distractors: bool = True,
+) -> Path:
+    """Validate ``record`` and write it to ``<problem_id>_replica_<id>.json``.
+
+    Returns the written path. Raises ``CustomTrajectoryValidationError`` for a
+    negative ``replica_id``, an invalid record, or an unsafe ``problem_id``.
+    """
+    if replica_id < 0:
+        raise CustomTrajectoryValidationError("replica_id must be non-negative")
+
+    validated = validate_custom_trajectory(
+        record, require_open_distractors=require_open_distractors
+    )
+    file_name = f"{validated['problem_id']}_replica_{replica_id}.json"
+    if Path(file_name).name != file_name:  # separators would escape output_dir
+        raise CustomTrajectoryValidationError("problem_id must be a plain file name")
+    output_path = Path(output_dir) / file_name
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(json.dumps(validated, indent=2) + "\n", encoding="utf-8")
+    return output_path
+
+
+def _require_non_empty_string(record: Mapping[str, Any], field: str) -> None:
+    if not (isinstance(record[field], str) and record[field].strip()):
+        raise CustomTrajectoryValidationError(f"{field} must be a non-empty string")
+
+
+def _require_mapping(record: Mapping[str, Any], field: str) -> None:
+    if not isinstance(record[field], Mapping):  # any Mapping passes, not only dict
+        raise CustomTrajectoryValidationError(f"{field} must be an object")
diff --git a/bixbench/postprocessing_utils.py b/bixbench/postprocessing_utils.py
index 893b4f5..4030122 100644
--- a/bixbench/postprocessing_utils.py
+++ b/bixbench/postprocessing_utils.py
@@ -21,8 +21,8 @@ def load_dataframe_from_json_directory(path: str) -> pd.DataFrame:
     """Load a dataframe from a json directory."""
     all_data = []
     for file in list(Path(path).glob("**/*.json")):
-        replica = re.search(r"replica_(\d+)", file.name)[1]
-        replica = int(replica) if replica is not None else 0
+        replica_match = re.search(r"replica_(\d+)", file.name)
+        replica = int(replica_match[1]) if replica_match else 0
         with open(file, encoding="utf-8") as f:
             data = json.load(f)
             data["replica"] = replica
diff --git a/tests/test_custom_trajectories.py b/tests/test_custom_trajectories.py
new file mode 100644
index 0000000..3f23760
--- /dev/null
+++ b/tests/test_custom_trajectories.py
@@ -0,0 +1,62 @@
+import json
+
+import pytest
+
+from bixbench.custom_trajectories import (
+    CustomTrajectoryValidationError,
+    minimal_notebook,
+    validate_custom_trajectory,
+    write_custom_trajectory,
+)
+from bixbench.postprocessing_utils import load_dataframe_from_json_directory
+
+
+def valid_open_record():
+    return {
+        "problem_id": "capsule-001",
+        "agent_answer": "The treatment increased expression by 2x.",
+        "ideal_answer": "The treatment increased expression by 2x.",
+        "problem": "Compare expression in control and treatment samples.",
+        "mcq_options": [],
+        "mcq_question": "Which conclusion is best supported?",
+        "notebook_stats": {"num_cells": 1},
+        "num_actions": 3,
+        "question_format": "open",  # open records need metadata distractors
+        "refusal_option": True,
+        "metadata": {"distractors": ["No change.", "Expression decreased."]},
+        "refusal_options": None,
+        "nb": minimal_notebook("print('analysis complete')"),  # one code cell
+        "run_name": "custom-agent-smoke",
+    }
+
+
+def test_validate_custom_open_trajectory():
+    validated = validate_custom_trajectory(valid_open_record())
+    first_cell = validated["nb"]["cells"][0]  # notebook survives validation
+    assert validated["problem_id"] == "capsule-001"
+    assert first_cell["source"] == "print('analysis complete')"
+
+
+def test_validate_custom_open_trajectory_requires_distractors():
+    # Replace the metadata so the open record carries no distractors.
+    record = {**valid_open_record(), "metadata": {}}
+
+    with pytest.raises(CustomTrajectoryValidationError, match="distractors"):
+        validate_custom_trajectory(record)
+
+
+def test_write_custom_trajectory_uses_replica_filename(tmp_path):
+    written = write_custom_trajectory(valid_open_record(), tmp_path)
+    payload = json.loads(written.read_text(encoding="utf-8"))  # writer emits UTF-8
+    assert written.name == "capsule-001_replica_0.json"
+    assert payload["run_name"] == "custom-agent-smoke"
+
+
+def test_directory_loader_accepts_custom_file_without_replica(tmp_path):
+    # The file name has no "replica_<n>" part, so the loader should report 0.
+    payload = json.dumps(valid_open_record())
+    (tmp_path / "capsule-001.json").write_text(payload, encoding="utf-8")
+
+    frame = load_dataframe_from_json_directory(str(tmp_path))
+    assert frame.loc[0, "replica"] == 0
+    assert frame.loc[0, "problem_id"] == "capsule-001"