diff --git a/apps/cli/src/commands/results/eval-runner.ts b/apps/cli/src/commands/results/eval-runner.ts index ab601e7f..d244fbb9 100644 --- a/apps/cli/src/commands/results/eval-runner.ts +++ b/apps/cli/src/commands/results/eval-runner.ts @@ -12,10 +12,15 @@ * All handlers accept a `cwd` (project root) to resolve paths against. * The module spawns `bun apps/cli/src/cli.ts eval run ...` and tracks * process state in memory. + * + * Stdout/stderr are also persisted to `<outputDir>/console.log` so that + * RunDetail can show the full captured log after the in-memory buffers are + * pruned. The static log file is served by the run-log routes registered in + * `serve.ts` via `getActiveRunStatus`/`getActiveRunTarget` cross-referencing. */ import { type ChildProcess, execFileSync, spawn } from 'node:child_process'; -import { existsSync } from 'node:fs'; +import { type WriteStream, createWriteStream, existsSync, mkdirSync } from 'node:fs'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; import { listTargetNames, readTargetDefinitions } from '@agentv/core'; @@ -80,6 +85,21 @@ export function getActiveRunTarget(indexJsonlPath: string): string | undefined { return undefined; } +/** + * Look up the in-memory status for a Studio-launched run by its index.jsonl path. + * Returns 'starting' | 'running' | 'finished' | 'failed' if the run is tracked, + * else undefined. Used by handleRuns to render a spinner for active runs in the + * RunList instead of a misleading red ✗ derived from a 0 pass-rate. 
+ */ +export function getActiveRunStatus(indexJsonlPath: string): StudioRun['status'] | undefined { + for (const run of activeRuns.values()) { + if (run.outputDir && path.join(run.outputDir, 'index.jsonl') === indexJsonlPath) { + return run.status; + } + } + return undefined; +} + // ── Discover targets file from project root ────────────────────────────── async function discoverTargetsInProject(cwd: string): Promise { @@ -259,6 +279,33 @@ function isCommandAvailable(cmd: string): boolean { } } +/** + * Open a writable stream to `<outputDir>/console.log` for persisting the + * spawned eval process's combined stdout/stderr. Returns `undefined` when the + * directory cannot be created or the file cannot be opened — callers fall back + * to the in-memory buffer in that case. + * + * The log file is the source of truth shown by the RunDetail "Console Log" + * section after the run completes. The in-memory `stdout`/`stderr` buffers on + * `StudioRun` remain capped for live status polling. + * + * Stream `error` events (e.g. the output dir was removed underneath us by a + * test teardown) are swallowed so they don't surface as unhandled errors and + * fail unrelated tests. 
+ */ +function openConsoleLogStream(outputDir: string): WriteStream | undefined { + try { + mkdirSync(outputDir, { recursive: true }); + const stream = createWriteStream(path.join(outputDir, 'console.log'), { flags: 'w' }); + stream.on('error', () => { + /* best-effort log capture; ignore filesystem errors */ + }); + return stream; + } catch { + return undefined; + } +} + // ── Route registration ─────────────────────────────────────────────────── // biome-ignore lint/suspicious/noExplicitAny: Hono Context generic varies by route @@ -366,7 +413,10 @@ export function registerEvalRoutes( run.process = child; run.status = 'running'; + const logStream = openConsoleLogStream(outputDir); + child.stdout?.on('data', (chunk: Buffer) => { + logStream?.write(chunk); run.stdout += chunk.toString(); // Cap buffer at 100KB if (run.stdout.length > 100_000) { @@ -375,6 +425,7 @@ export function registerEvalRoutes( }); child.stderr?.on('data', (chunk: Buffer) => { + logStream?.write(chunk); run.stderr += chunk.toString(); if (run.stderr.length > 100_000) { run.stderr = run.stderr.slice(-80_000); @@ -386,6 +437,7 @@ export function registerEvalRoutes( run.status = code === 0 ? 
'finished' : 'failed'; run.finishedAt = new Date().toISOString(); run.process = undefined; + logStream?.end(); pruneFinishedRuns(); }); @@ -394,6 +446,8 @@ export function registerEvalRoutes( run.stderr += `\nProcess error: ${err.message}`; run.finishedAt = new Date().toISOString(); run.process = undefined; + logStream?.write(`\nProcess error: ${err.message}\n`); + logStream?.end(); }); return c.json( @@ -574,11 +628,15 @@ export function registerEvalRoutes( run.process = child; run.status = 'running'; + const logStream = openConsoleLogStream(outputDir); + child.stdout?.on('data', (chunk: Buffer) => { + logStream?.write(chunk); run.stdout += chunk.toString(); if (run.stdout.length > 100_000) run.stdout = run.stdout.slice(-80_000); }); child.stderr?.on('data', (chunk: Buffer) => { + logStream?.write(chunk); run.stderr += chunk.toString(); if (run.stderr.length > 100_000) run.stderr = run.stderr.slice(-80_000); }); @@ -587,6 +645,7 @@ export function registerEvalRoutes( run.status = code === 0 ? 
'finished' : 'failed'; run.finishedAt = new Date().toISOString(); run.process = undefined; + logStream?.end(); pruneFinishedRuns(); }); child.on('error', (err) => { @@ -594,6 +653,8 @@ export function registerEvalRoutes( run.stderr += `\nProcess error: ${err.message}`; run.finishedAt = new Date().toISOString(); run.process = undefined; + logStream?.write(`\nProcess error: ${err.message}\n`); + logStream?.end(); }); return c.json({ id: runId, status: run.status, command }, 202); diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 1c00f1f3..ea679a85 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -9,6 +9,7 @@ * - GET / — Studio SPA (React app) * - GET /api/runs — list available run workspaces with metadata * - GET /api/runs/:filename — load results from a specific run workspace + * - GET /api/runs/:filename/log — stream the captured console.log for a run * - GET /api/feedback — read feedback reviews * - POST /api/feedback — write feedback reviews * - GET /api/benchmarks — list registered benchmarks @@ -55,7 +56,7 @@ import { resolveRunManifestPath } from '../eval/result-layout.js'; import { loadRunCache, resolveRunCacheFile } from '../eval/run-cache.js'; import { findRepoRoot } from '../eval/shared.js'; import { listResultFiles } from '../inspect/utils.js'; -import { getActiveRunTarget, registerEvalRoutes } from './eval-runner.js'; +import { getActiveRunStatus, getActiveRunTarget, registerEvalRoutes } from './eval-runner.js'; import { loadLightweightResults, loadManifestResults, @@ -299,6 +300,10 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) { } catch { // ignore enrichment errors } + // Surface live status for Studio-launched runs that are still starting + // or running so the RunList can render a spinner instead of the + // pass/fail dot derived from a 0% pass rate. 
+ const liveStatus = getActiveRunStatus(m.path); const tagsEntry = readRunTags(m.path); return { filename: m.filename, @@ -313,11 +318,31 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) { ...(target && { target }), ...(experiment && { experiment }), ...(tagsEntry && { tags: tagsEntry.tags }), + ...(liveStatus && { status: liveStatus }), }; }), }); } +async function handleRunLog(c: C, { searchDir }: DataContext) { + const filename = c.req.param('filename') ?? ''; + const meta = await findRunById(searchDir, filename); + if (!meta) return c.json({ error: 'Run not found' }, 404); + if (meta.source === 'remote') { + return c.json({ error: 'Console log is not available for remote runs' }, 404); + } + const logPath = path.join(path.dirname(meta.path), 'console.log'); + if (!existsSync(logPath)) { + return c.json({ error: 'Console log not found for this run' }, 404); + } + try { + const content = readFileSync(logPath, 'utf8'); + return c.text(content); + } catch { + return c.json({ error: 'Failed to read console log' }, 500); + } +} + async function handleRunDetail(c: C, { searchDir }: DataContext) { const filename = c.req.param('filename') ?? 
''; const meta = await findRunById(searchDir, filename); @@ -1169,6 +1194,7 @@ export function createApp( return handleRunTagsDelete(c, defaultCtx); }); app.get('/api/runs/:filename', (c) => handleRunDetail(c, defaultCtx)); + app.get('/api/runs/:filename/log', (c) => handleRunLog(c, defaultCtx)); app.get('/api/runs/:filename/suites', (c) => handleRunSuites(c, defaultCtx)); app.get('/api/runs/:filename/categories', (c) => handleRunCategories(c, defaultCtx)); app.get('/api/runs/:filename/categories/:category/suites', (c) => @@ -1293,6 +1319,7 @@ export function createApp( return withBenchmark(c, handleRunTagsDelete); }); app.get('/api/benchmarks/:benchmarkId/runs/:filename', (c) => withBenchmark(c, handleRunDetail)); + app.get('/api/benchmarks/:benchmarkId/runs/:filename/log', (c) => withBenchmark(c, handleRunLog)); app.get('/api/benchmarks/:benchmarkId/runs/:filename/suites', (c) => withBenchmark(c, handleRunSuites), ); diff --git a/apps/studio/src/components/RunDetail.tsx b/apps/studio/src/components/RunDetail.tsx index 019885ee..78facded 100644 --- a/apps/studio/src/components/RunDetail.tsx +++ b/apps/studio/src/components/RunDetail.tsx @@ -4,13 +4,20 @@ * Groups results by category, then by suite within each category. * Category Breakdown is shown as a clean table with coloured pass-rate pills. * The All Evals table shows ERR badge instead of 0% for execution errors. + * + * Also renders a collapsible "Console Log" section sourced from the run's + * captured `console.log` file (served by `/api/runs/:id/log`). Hidden when no + * log is available — e.g. for remote runs or local runs that completed before + * the console-log capture feature shipped. 
*/ +import { useState } from 'react'; + import { Link } from '@tanstack/react-router'; import type { EvalResult } from '~/lib/types'; -import { isPassing, useStudioConfig } from '~/lib/api'; +import { isPassing, useRunLog, useStudioConfig } from '~/lib/api'; import { PassRatePill } from './PassRatePill'; import { StatsCards } from './StatsCards'; @@ -234,6 +241,50 @@ export function RunDetail({ results, runId, benchmarkId }: RunDetailProps) { + + + + ); +} + +function ConsoleLogSection({ runId, benchmarkId }: { runId: string; benchmarkId?: string }) { + const [open, setOpen] = useState(false); + const { data: log, isLoading, error } = useRunLog(runId, benchmarkId); + + // Hide the section entirely when no log was captured (remote runs, or + // local runs from before this feature shipped). The 404 path resolves + // to `null` in fetchText, distinct from `undefined` (loading). + if (!isLoading && !error && log == null) return null; + + return ( +
+ + {open && ( +
+ {error ? ( +
+ Failed to load console log: {(error as Error).message} +
+ ) : ( +
+              {log ?? ''}
+            
+ )} +
+ )}
); } diff --git a/apps/studio/src/components/RunList.tsx b/apps/studio/src/components/RunList.tsx index 03f90779..ca17f477 100644 --- a/apps/studio/src/components/RunList.tsx +++ b/apps/studio/src/components/RunList.tsx @@ -4,6 +4,12 @@ * Displays all available runs with a pass/fail status dot, human-readable name, * source badge, date, test count, and coloured pass-rate pill. * Clicking a row navigates to the run detail view. + * + * In-progress runs (status `starting` / `running`, surfaced by the backend + * via the RunMeta `status` field while a Studio-launched run is still + * tracked in-memory) render a pulsing cyan dot instead of the pass/fail + * dot — otherwise a 0% pass rate during the warm-up window would show as + * a misleading red ✗. */ import type React from 'react'; @@ -82,18 +88,27 @@ export function RunList({ runs, benchmarkId, emptyMessage }: RunListProps) { {runs.map((run) => { const ts = formatDate(run.timestamp); const passing = run.pass_rate >= passThreshold; + const isActive = run.status === 'starting' || run.status === 'running'; const label = formatRunLabel(run); const passedCount = Math.round(run.pass_rate * run.test_count); const failedCount = run.test_count - passedCount; return ( - {/* Status dot */} + {/* Status dot — spinner for active runs, otherwise pass/fail */} - - {passing ? '✓' : '✗'} - + {isActive ? ( + + ) : ( + + {passing ? '✓' : '✗'} + + )} {/* Run name */} diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts index eb4b9501..212069f7 100644 --- a/apps/studio/src/lib/api.ts +++ b/apps/studio/src/lib/api.ts @@ -42,6 +42,21 @@ async function fetchJson(url: string): Promise { return res.json() as Promise; } +/** + * Fetch a text/plain endpoint. Treats 404 as `null` so callers can model + * "log not yet captured" without throwing — used by the RunDetail console log + * viewer for runs that finished before this feature shipped (no console.log + * on disk) and for remote runs. 
+ */ +async function fetchText(url: string): Promise { + const res = await fetch(url); + if (res.status === 404) return null; + if (!res.ok) { + throw new Error(`API error: ${res.status} ${res.statusText}`); + } + return res.text(); +} + // ── Query option factories ────────────────────────────────────────────── export const runListOptions = queryOptions({ @@ -58,6 +73,23 @@ export function runDetailOptions(filename: string) { }); } +export function runLogOptions(filename: string, benchmarkId?: string) { + const url = benchmarkId + ? `${benchmarkApiBase(benchmarkId)}/runs/${encodeURIComponent(filename)}/log` + : `/api/runs/${encodeURIComponent(filename)}/log`; + return queryOptions({ + queryKey: ['runs', filename, 'log', benchmarkId ?? ''], + queryFn: () => fetchText(url), + enabled: !!filename, + // Re-fetch while a run is still capturing output so the viewer streams in. + refetchInterval: 3_000, + }); +} + +export function useRunLog(filename: string, benchmarkId?: string) { + return useQuery(runLogOptions(filename, benchmarkId)); +} + export function runSuitesOptions(runId: string) { return queryOptions({ queryKey: ['runs', runId, 'suites'], diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index 6ef4dc1b..466e648b 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -21,6 +21,13 @@ export interface RunMeta { benchmark_name?: string; /** Optional user-assigned tags from the run's sidecar tags.json. */ tags?: string[]; + /** + * Live execution status. Only present for Studio-launched runs that are + * still being tracked in-memory — used to render a spinner in RunList + * instead of the pass/fail dot when pass_rate is 0 simply because no + * results have been written yet. + */ + status?: 'starting' | 'running' | 'finished' | 'failed'; } export interface RunListResponse {