Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 37 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,43 @@ Edit or create a new YAML file to modify:

### Using Your Own Agent

To use your own agent, use the `generate_trajectories.py` script by editing the [`custom_rollout`](https://github.com/Future-House/BixBench/blob/6c28217959d5d7dd6f48c59894534fced7c6c040/bixbench/generate_trajectories.py#L239) function to generate trajectories in the same format as the BixBench trajectories, then use the `postprocessing.py` script to evaluate your agent's performance.
To use your own agent, write BixBench-compatible trajectory JSON files and then
run the standard `postprocessing.py` script on the output directory. The helper
below validates the required fields before you spend credits on a full benchmark
run:

```python
from bixbench.custom_trajectories import minimal_notebook, write_custom_trajectory

record = {
"problem_id": "capsule-001",
"agent_answer": "The treatment increased expression by 2x.",
"ideal_answer": "The treatment increased expression by 2x.",
"problem": "Compare expression in control and treatment samples.",
"mcq_options": [],
"mcq_question": "Which conclusion is best supported?",
"notebook_stats": {"num_cells": 1},
"num_actions": 3,
"question_format": "open",
"refusal_option": True,
"metadata": {"distractors": ["No change.", "Expression decreased."]},
"refusal_options": None,
"nb": minimal_notebook("print('analysis complete')"),
"run_name": "my-custom-agent",
}

write_custom_trajectory(record, "data/trajectories/my-custom-agent")
```

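The writer stores each file as `<problem_id>_replica_<id>.json`, where `<id>` is
the `replica_id` keyword argument (default `0`); this matches the file naming the
directory loader used by postprocessing reads the replica number from. For
open-answer trajectories, `metadata["distractors"]` must be a non-empty list by
default, because postprocessing expands open answers into multiple-choice
grading rows unless MCQ expansion is disabled. Pass
`require_open_distractors=False` to skip that check.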

You can still subclass the rollout path and implement
[`custom_rollout`](https://github.com/Future-House/BixBench/blob/6c28217959d5d7dd6f48c59894534fced7c6c040/bixbench/generate_trajectories.py#L239),
but external agents no longer need to edit core rollout code just to validate
and postprocess already-generated trajectories.

### Evaluate trajectories

Expand Down
161 changes: 161 additions & 0 deletions bixbench/custom_trajectories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
"""Helpers for writing BixBench-compatible custom agent trajectories."""

from __future__ import annotations

import json
from collections.abc import Mapping
from pathlib import Path
from typing import Any


class CustomTrajectoryValidationError(ValueError):
    """Signal that a custom trajectory record is unfit for BixBench postprocessing.

    Derives from ``ValueError`` so handlers written for generic value errors
    also catch it.
    """


# Top-level keys that every custom trajectory record must carry. The set is
# immutable so importing code cannot alter the validation contract.
REQUIRED_TRAJECTORY_FIELDS = frozenset(
    (
        # Identity of the run and the problem it answers.
        "problem_id",
        "run_name",
        "problem",
        "question_format",
        # Answers being graded.
        "agent_answer",
        "ideal_answer",
        # Multiple-choice and refusal configuration.
        "mcq_options",
        "mcq_question",
        "refusal_option",
        "refusal_options",
        # Notebook payload and bookkeeping.
        "nb",
        "notebook_stats",
        "num_actions",
        "metadata",
    )
)


def minimal_notebook(source: str = "", language: str = "python") -> dict[str, Any]:
    """Build a one-cell notebook dictionary for a trajectory's ``nb`` field.

    Args:
        source: Text stored as the ``source`` of the single code cell.
        language: Name used for every kernelspec entry and for
            ``language_info["name"]``.

    Returns:
        A notebook dictionary (``nbformat`` 4, ``nbformat_minor`` 5) holding
        one unexecuted code cell with no outputs.
    """
    code_cell = {
        "cell_type": "code",
        "execution_count": None,
        "metadata": {},
        "outputs": [],
        "source": source,
    }
    # All three kernelspec entries carry the same language name.
    kernelspec = dict.fromkeys(("display_name", "language", "name"), language)
    notebook_metadata = {
        "kernelspec": kernelspec,
        "language_info": {"name": language},
    }
    return {
        "cells": [code_cell],
        "metadata": notebook_metadata,
        "nbformat": 4,
        "nbformat_minor": 5,
    }


def validate_custom_trajectory(
    record: Mapping[str, Any], *, require_open_distractors: bool = True
) -> dict[str, Any]:
    """Validate a custom trajectory record and return it as a plain ``dict``.

    The record must contain every key in ``REQUIRED_TRAJECTORY_FIELDS``. The
    returned dictionary is a shallow copy of ``record``; values are checked
    but not modified.

    Args:
        record: Mapping holding the trajectory's top-level fields.
        require_open_distractors: When true (the default), a record whose
            ``question_format`` is ``"open"`` must provide a non-empty list
            under ``metadata["distractors"]``.

    Returns:
        A shallow ``dict`` copy of ``record``.

    Raises:
        CustomTrajectoryValidationError: If a required field is missing or
            any field fails its type or value check.
    """
    missing = sorted(REQUIRED_TRAJECTORY_FIELDS.difference(record))
    if missing:
        raise CustomTrajectoryValidationError(
            f"Custom trajectory is missing required fields: {', '.join(missing)}"
        )

    normalized = dict(record)
    _require_non_empty_string(normalized, "problem_id")
    _require_non_empty_string(normalized, "run_name")
    _require_non_empty_string(normalized, "problem")

    question_format = normalized["question_format"]
    # Check the type first: an unhashable value (list, dict) would otherwise
    # make the set-membership test raise TypeError instead of a validation
    # error.
    if not isinstance(question_format, str) or question_format not in {
        "open",
        "mcq",
        "hypothesis",
    }:
        raise CustomTrajectoryValidationError(
            "question_format must be one of: open, mcq, hypothesis"
        )

    if not normalized["agent_answer"]:
        raise CustomTrajectoryValidationError(
            "agent_answer must be truthy; postprocessing drops empty answers"
        )
    if not normalized["ideal_answer"]:
        raise CustomTrajectoryValidationError("ideal_answer must be truthy")

    mcq_question = normalized["mcq_question"]
    if isinstance(mcq_question, list):
        if not mcq_question or not all(isinstance(item, str) for item in mcq_question):
            raise CustomTrajectoryValidationError(
                "mcq_question must be a string or a non-empty list of strings"
            )
    elif not isinstance(mcq_question, str):
        raise CustomTrajectoryValidationError(
            "mcq_question must be a string or a non-empty list of strings"
        )

    if not isinstance(normalized["mcq_options"], list):
        raise CustomTrajectoryValidationError("mcq_options must be a list")

    _require_mapping(normalized, "metadata")
    _require_mapping(normalized, "notebook_stats")
    _require_mapping(normalized, "nb")

    nb = normalized["nb"]
    if not isinstance(nb.get("cells"), list):
        raise CustomTrajectoryValidationError("nb must contain a cells list")

    num_actions = normalized["num_actions"]
    # bool is a subclass of int, so True/False must be rejected explicitly.
    if (
        isinstance(num_actions, bool)
        or not isinstance(num_actions, int)
        or num_actions < 0
    ):
        raise CustomTrajectoryValidationError("num_actions must be a non-negative int")

    if require_open_distractors and question_format == "open":
        distractors = normalized["metadata"].get("distractors")
        if not isinstance(distractors, list) or not distractors:
            raise CustomTrajectoryValidationError(
                "open trajectories need metadata['distractors'] for MCQ expansion"
            )

    return normalized


def write_custom_trajectory(
    record: Mapping[str, Any],
    output_dir: str | Path,
    *,
    replica_id: int = 0,
    require_open_distractors: bool = True,
) -> Path:
    """Validate one trajectory and write it as a JSON file in ``output_dir``.

    The file is named ``<problem_id>_replica_<replica_id>.json``. The record is
    validated and serialized before the filesystem is touched, so a rejected
    or unserializable record leaves no directory behind.

    Args:
        record: Trajectory fields, checked with ``validate_custom_trajectory``.
        output_dir: Directory that receives the file; created if missing.
        replica_id: Non-negative integer embedded in the file name.
        require_open_distractors: Forwarded to ``validate_custom_trajectory``.

    Returns:
        Path of the written JSON file.

    Raises:
        CustomTrajectoryValidationError: If ``replica_id`` is not a
            non-negative integer, the record fails validation, or
            ``problem_id`` contains a path separator.
    """
    import operator

    # A bool would be formatted as "True"/"False" in the file name, which does
    # not look like a replica number; other non-integers cannot be compared
    # with 0 at all.
    if isinstance(replica_id, bool):
        raise CustomTrajectoryValidationError("replica_id must be an integer")
    try:
        replica_number = operator.index(replica_id)
    except TypeError:
        raise CustomTrajectoryValidationError(
            "replica_id must be an integer"
        ) from None
    if replica_number < 0:
        raise CustomTrajectoryValidationError("replica_id must be non-negative")

    validated = validate_custom_trajectory(
        record, require_open_distractors=require_open_distractors
    )

    problem_id = validated["problem_id"]
    # problem_id becomes part of the file name; a separator would let the
    # file land outside output_dir (for example "../other/name").
    if "/" in problem_id or "\\" in problem_id:
        raise CustomTrajectoryValidationError(
            "problem_id must not contain path separators"
        )

    # Serialize before creating anything so a failure has no side effects.
    payload = json.dumps(validated, indent=2) + "\n"

    output_path = Path(output_dir) / f"{problem_id}_replica_{replica_number}.json"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(payload, encoding="utf-8")
    return output_path


def _require_non_empty_string(record: Mapping[str, Any], field: str) -> None:
if not isinstance(record[field], str) or not record[field].strip():
raise CustomTrajectoryValidationError(f"{field} must be a non-empty string")


def _require_mapping(record: Mapping[str, Any], field: str) -> None:
if not isinstance(record[field], Mapping):
raise CustomTrajectoryValidationError(f"{field} must be an object")
4 changes: 2 additions & 2 deletions bixbench/postprocessing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ def load_dataframe_from_json_directory(path: str) -> pd.DataFrame:
"""Load a dataframe from a json directory."""
all_data = []
for file in list(Path(path).glob("**/*.json")):
replica = re.search(r"replica_(\d+)", file.name)[1]
replica = int(replica) if replica is not None else 0
replica_match = re.search(r"replica_(\d+)", file.name)
replica = int(replica_match[1]) if replica_match else 0
with open(file, encoding="utf-8") as f:
data = json.load(f)
data["replica"] = replica
Expand Down
62 changes: 62 additions & 0 deletions tests/test_custom_trajectories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import json

import pytest

from bixbench.custom_trajectories import (
CustomTrajectoryValidationError,
minimal_notebook,
validate_custom_trajectory,
write_custom_trajectory,
)
from bixbench.postprocessing_utils import load_dataframe_from_json_directory


def valid_open_record():
    """Return a fresh open-format trajectory record that passes validation."""
    # The agent and ideal answers are deliberately identical in this fixture.
    answer = "The treatment increased expression by 2x."
    distractors = ["No change.", "Expression decreased."]
    notebook = minimal_notebook("print('analysis complete')")
    return {
        "problem_id": "capsule-001",
        "agent_answer": answer,
        "ideal_answer": answer,
        "problem": "Compare expression in control and treatment samples.",
        "mcq_options": [],
        "mcq_question": "Which conclusion is best supported?",
        "notebook_stats": {"num_cells": 1},
        "num_actions": 3,
        "question_format": "open",
        "refusal_option": True,
        "metadata": {"distractors": distractors},
        "refusal_options": None,
        "nb": notebook,
        "run_name": "custom-agent-smoke",
    }


def test_validate_custom_open_trajectory():
    """A complete open-format record validates and keeps its payload intact."""
    validated = validate_custom_trajectory(valid_open_record())
    first_cell = validated["nb"]["cells"][0]

    assert validated["problem_id"] == "capsule-001"
    assert first_cell["source"] == "print('analysis complete')"


def test_validate_custom_open_trajectory_requires_distractors():
    """An open-format record with empty metadata is rejected for lacking distractors."""
    without_distractors = {**valid_open_record(), "metadata": {}}

    with pytest.raises(CustomTrajectoryValidationError, match="distractors"):
        validate_custom_trajectory(without_distractors)


def test_write_custom_trajectory_uses_replica_filename(tmp_path):
    """A default write produces ``<problem_id>_replica_0.json`` holding the record."""
    path = write_custom_trajectory(valid_open_record(), tmp_path)

    assert path.name == "capsule-001_replica_0.json"
    # Read back with an explicit encoding so the test does not depend on the
    # platform's default locale encoding.
    written = json.loads(path.read_text(encoding="utf-8"))
    assert written["run_name"] == "custom-agent-smoke"


def test_directory_loader_accepts_custom_file_without_replica(tmp_path):
    """A JSON file whose name has no ``replica_<n>`` part loads with replica 0."""
    bare_file = tmp_path / "capsule-001.json"
    serialized = json.dumps(valid_open_record())
    bare_file.write_text(serialized, encoding="utf-8")

    loaded = load_dataframe_from_json_directory(str(tmp_path))

    assert loaded.loc[0, "replica"] == 0
    assert loaded.loc[0, "problem_id"] == "capsule-001"
assert df.loc[0, "problem_id"] == "capsule-001"