Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -787,6 +787,16 @@ async function runSingleEvalFile(params: {
console.log(`${targetMessage}`);
}

// Hint about pipeline for CLI agent targets
const targetKind = resolvedTargetSelection.resolvedTarget.kind;
if ((targetKind === 'claude-cli' || targetKind === 'copilot-cli') && !options.dryRun) {
console.log('');
console.log(' TIP: For subagent-mode evals, use `agentv pipeline` instead of `eval run`.');
console.log(' The agent orchestrates executor + grader subagents directly.');
console.log(' Run: agentv pipeline --help');
console.log('');
}

const agentTimeoutMs =
options.agentTimeoutSeconds != null
? Math.max(0, options.agentTimeoutSeconds) * 1000
Expand Down
4 changes: 3 additions & 1 deletion apps/cli/src/commands/init/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@ export interface InitCommandOptions {

/**
 * Prints the recommended "skills-first" setup instructions to stdout,
 * followed by the Claude Code plugin alternative. Purely informational;
 * no return value and no side effects beyond console output.
 */
function printSkillFirstInstructions(): void {
  const instructionLines = [
    '\nAI-skills-first setup (recommended):',
    '  agentv skills get agentv-bench',
    '  Then ask your agent: "Set up AgentV in this repo."',
    '\nFor Claude Code users, the agentv-dev plugin also provides skill discovery:',
    '  npx allagents plugin marketplace add EntityProcess/agentv',
    '  npx allagents plugin install agentv-dev@agentv',
    '  Then ask your agent: "Set up AgentV in this repo."',
  ];
  for (const line of instructionLines) {
    console.log(line);
  }
}

async function promptYesNo(message: string): Promise<boolean> {
Expand Down
2 changes: 1 addition & 1 deletion apps/cli/src/commands/pipeline/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import { evalRunCommand } from './run.js';

export const pipelineCommand = subcommands({
name: 'pipeline',
description: 'Agent-mode eval pipeline commands (input → grade → bench)',
description: 'Subagent-mode eval pipeline (input → executor subagents → grade → bench) — use this for agent targets',
cmds: {
input: evalInputCommand,
grade: evalGradeCommand,
Expand Down
41 changes: 40 additions & 1 deletion apps/cli/src/commands/pipeline/input.ts
Original file line number Diff line number Diff line change
Expand Up @@ -210,14 +210,39 @@ export const evalInputCommand = command({
});

console.log(`Extracted ${testIds.length} test(s) to ${outDir}`);

// --- Subagent mode guidance ---
if (targetKind === 'agent') {
console.log(`
Target: ${targetName} (subagent-as-target mode)`);
console.log(` Tests: ${testIds.join(', ')}`);
console.log('');
console.log(' Next steps for the orchestrating agent:');
console.log(' 1. Dispatch executor subagents — one per test case (all in parallel):');
console.log(' - Each reads <run-dir>/<test-id>/input.json');
console.log(' - Executes the task, writes <run-dir>/<test-id>/response.md');
console.log(' 2. Run code graders: agentv pipeline grade <run-dir>');
console.log(' 3. Dispatch grader subagents — one per (test × LLM grader) pair (all in parallel):');
console.log(' - Read agents/grader.md and embed its content as system instructions in each subagent prompt');
console.log(' - Each subagent reads llm_graders/<name>.json + response.md for its test');
console.log(' - Each writes llm_grader_results/<name>.json');
console.log(' 4. Merge scores: agentv pipeline bench <run-dir>');
console.log('');
console.log(' For the full procedure:');
console.log(' agentv skills get agentv-bench --ref subagent-pipeline');
console.log('');
}
},
});

interface GraderCounts { codeGraders: number; llmGraders: number; builtinAssertions: number; }

async function writeGraderConfigs(
testDir: string,
assertions: readonly GraderConfig[],
evalDir: string,
): Promise<void> {
): Promise<GraderCounts> {
const counts: GraderCounts = { codeGraders: 0, llmGraders: 0, builtinAssertions: 0 };
const codeGradersDir = join(testDir, 'code_graders');
const llmGradersDir = join(testDir, 'llm_graders');

Expand Down Expand Up @@ -257,9 +282,22 @@ async function writeGraderConfigs(
promptContent = config.prompt;
}

// For rubrics assertions, include the criteria array directly
// so grader subagents can evaluate without needing a prompt file.
const rubrics = (config as LlmGraderConfig).rubrics;
const rubricsData = rubrics?.map((r) => ({
id: r.id,
outcome: r.outcome,
weight: r.weight ?? 1.0,
...(r.score_ranges ? { score_range: r.score_ranges } : {}),
...(r.required !== undefined ? { required: r.required } : {}),
...(r.required_min_score !== undefined ? { required_min_score: r.required_min_score } : {}),
}));

await writeJson(join(llmGradersDir, `${config.name}.json`), {
name: config.name,
prompt_content: promptContent,
...(rubricsData && rubricsData.length > 0 ? { rubrics: rubricsData } : {}),
weight: config.weight ?? 1.0,
threshold: 0.5,
config: {},
Expand All @@ -280,6 +318,7 @@ async function writeGraderConfigs(
});
}
}
return counts;
}

async function writeJson(filePath: string, data: unknown): Promise<void> {
Expand Down
56 changes: 50 additions & 6 deletions apps/cli/src/commands/pipeline/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ function loadEnvFile(dir: string): Record<string, string> {

export const evalRunCommand = command({
name: 'run',
description: 'Extract inputs, invoke CLI targets, and run code graders in one step',
description: 'Extract inputs, invoke CLI targets, and run code graders (for agent targets, use pipeline input + subagents)',
args: {
evalPath: positional({
type: string,
Expand Down Expand Up @@ -341,15 +341,41 @@ export const evalRunCommand = command({
await Promise.all(pending);
process.stderr.write('\n');
} else {
console.log('Subagent-as-target mode — skipping CLI invocation.');
console.log('Subagent-as-target mode — the agent IS the target.');
console.log('');
console.log(' What happened: pipeline extracted inputs but did NOT invoke a CLI target.');
console.log(' The orchestrating agent must dispatch executor subagents to process each test.');
console.log('');
console.log(' Next steps:');
console.log(' 1. Dispatch executor subagents — one per test case (all in parallel):');
console.log(' - Each reads <run-dir>/<test-id>/input.json');
console.log(' - Executes the task, writes <run-dir>/<test-id>/response.md');
console.log(' 2. Run code graders: agentv pipeline grade <run-dir>');
console.log(' 3. Dispatch grader subagents — one per (test x LLM grader) pair (all in parallel):');
console.log(' - Read agents/grader.md and embed its content as system instructions in each subagent prompt');
console.log(' - Each subagent reads llm_graders/<name>.json + response.md');
console.log(' - Each writes llm_grader_results/<name>.json');
console.log(' 4. Merge scores: agentv pipeline bench <run-dir>');
console.log('');
console.log(' For the full procedure:');
console.log(' agentv skills get agentv-bench --ref subagent-pipeline');
console.log('');
}

// ── Step 3: Run code graders (only when explicitly requested) ─────
if (graderType !== 'code') {
console.log(`\nDone. Results in ${outDir}`);
console.log(
'To run code graders: agentv pipeline grade <run-dir> (or re-run with --grader-type code)',
);
console.log('');
if (targetKind === 'agent') {
console.log(' The agent must now:');
console.log(' 1. Dispatch executor subagents to generate response.md files');
console.log(' 2. Run code graders: agentv pipeline grade <run-dir>');
console.log(' 3. Dispatch grader subagents for llm_graders/ configs');
console.log(' 4. Merge scores: agentv pipeline bench <run-dir>');
} else {
console.log(' To run code graders: agentv pipeline grade <run-dir>');
console.log(' Or re-run with --grader-type code to grade inline.');
}
return;
}

Expand Down Expand Up @@ -382,7 +408,13 @@ export const evalRunCommand = command({
const graderConcurrency = workers ?? 10;
const { totalGraders, totalPassed } = await runCodeGraders(graderTasks, graderConcurrency);
console.log(`Graded ${totalGraders} code-grader(s): ${totalPassed} passed`);
console.log(`\nDone. Agent can now perform LLM grading on responses in ${outDir}`);
console.log('');
console.log(`Results in ${outDir}`);
console.log('');
console.log(' Remaining steps:');
console.log(' 1. If llm_graders/ configs exist, dispatch grader subagents');
console.log(' - Read agents/grader.md, embed as system instructions in each subagent prompt');
console.log(' 2. Merge all scores: agentv pipeline bench <run-dir>');
},
});

Expand Down Expand Up @@ -433,9 +465,21 @@ async function writeGraderConfigs(
} else if (typeof config.prompt === 'string') {
promptContent = config.prompt;
}
// For rubrics assertions, include the criteria array directly
const rubrics = (config as LlmGraderConfig).rubrics;
const rubricsData = rubrics?.map((r) => ({
id: r.id,
outcome: r.outcome,
weight: r.weight ?? 1.0,
...(r.score_ranges ? { score_range: r.score_ranges } : {}),
...(r.required !== undefined ? { required: r.required } : {}),
...(r.required_min_score !== undefined ? { required_min_score: r.required_min_score } : {}),
}));

await writeJson(join(llmGradersDir, `${config.name}.json`), {
name: config.name,
prompt_content: promptContent,
...(rubricsData && rubricsData.length > 0 ? { rubrics: rubricsData } : {}),
weight: config.weight ?? 1.0,
threshold: 0.5,
config: {},
Expand Down
Loading
Loading