From 6b4a88b238d8d92fa4379dc7e9f74bb0d41dddf3 Mon Sep 17 00:00:00 2001 From: Dzianis Vashchuk <2119348+dzianisv@users.noreply.github.com> Date: Wed, 3 Jun 2026 01:07:41 +0000 Subject: [PATCH 01/14] docs(supervisor): add design spec and implementation plan Spec + plan for /supervisor:goal, /supervisor:retry, and a configurable rubric layered on the reflection-3 judge loop. Co-Authored-By: Claude Opus 4.8 --- .../plans/2026-06-03-supervisor-mode.md | 268 ++++++++++++++++++ .../2026-06-03-supervisor-mode-design.md | 244 ++++++++++++++++ 2 files changed, 512 insertions(+) create mode 100644 docs/superpowers/plans/2026-06-03-supervisor-mode.md create mode 100644 docs/superpowers/specs/2026-06-03-supervisor-mode-design.md diff --git a/docs/superpowers/plans/2026-06-03-supervisor-mode.md b/docs/superpowers/plans/2026-06-03-supervisor-mode.md new file mode 100644 index 0000000..178ecc7 --- /dev/null +++ b/docs/superpowers/plans/2026-06-03-supervisor-mode.md @@ -0,0 +1,268 @@ +# Supervisor controls for `reflection-3` — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a configurable "supervisor" control surface over the always-on `reflection-3` judge: a user-editable rubric, a configurable retry budget (default 16), and a session-scoped `/supervisor:goal` whose completion requires the condition **and** all applicable workflow gates. + +**Architecture:** All new logic is added as **exported functions in `reflection-3.ts`** (single source file; `packages/reflection/reflection-3.ts` is a symlink to it). New functions are imported **directly** in tests from `../reflection-3.ts` (the `detectPlanningLoop` pattern), bypassing the `reflection-3.test-helpers.ts` duplication. The default rubric is an **embedded constant** (preserves the single-file `cp` install). 
Continuation stays a user turn via `client.session.promptAsync` (provider-safe). Spec: `docs/superpowers/specs/2026-06-03-supervisor-mode-design.md`. + +**Tech Stack:** TypeScript (ESM), `@opencode-ai/plugin`, Jest + ts-jest (ESM preset), promptfoo evals. + +--- + +## File structure + +| File | Responsibility | Change | +| --- | --- | --- | +| `reflection-3.ts` | plugin runtime; hosts new `DEFAULT_RUBRIC`, `loadRubric`, `supervisorStore`, `buildGoalRequirementSection`, cap resolution, command capture, loop integration | Modify | +| `test/supervisor.unit.test.ts` | unit tests for rubric loader, store, cap resolution, goal requirement section | Create | +| `test/supervisor.integration.test.ts` | idle-loop integration (mocked client): goal continues → achieves → clears; retry cap | Create | +| `.opencode/command/supervisor/goal.md`, `…/retry.md` (or `opencode.json` entries) | the `/supervisor:*` command surface | Create (per Phase 0 finding) | +| `evals/prompts/task-verification.txt` + `evals/datasets/*` | add goal/verification-theater fixtures | Modify | +| `README.md` | document supervisor commands, rubric override, retry | Modify | + +--- + +## Phase 0 — Spikes (resolve OpenCode API unknowns) + +These gate the command-capture code only. Rubric/retry/store phases do **not** depend on them and can proceed in parallel. + +### Task 0.1: Spike command namespacing + arg capture + +- [ ] **Step 1:** Create a throwaway probe plugin `/tmp/probe/probe.ts` that logs every `event.type` and full `event.properties` to a file, and registers a `command.executed` log. +- [ ] **Step 2:** Add commands two ways and see which produces `/supervisor:goal`: (a) `.opencode/command/supervisor/goal.md`; (b) `opencode.json` `command["supervisor:goal"]`. Run `opencode` and invoke each. +- [ ] **Step 3:** Record in the issue: does `command.executed` carry `{name, arguments}` for the invoked command? Does a `supervisor`-namespaced command appear as `/supervisor:goal`? 
+- [ ] **Step 4:** Decide capture mechanism: **A)** `command.executed` event (preferred, deterministic) or **B)** control-marker in the command template parsed from the user message. Document the choice + payload shape in the issue before Phase 4. + +**Acceptance:** issue comment states the namespacing mechanism, the `command.executed` payload (or its absence), and the chosen capture path with a concrete example. + +--- + +## Phase 1 — Configurable rubric (no API dependency) + +### Task 1.1: Extract inline antipatterns into `DEFAULT_RUBRIC` + +**Files:** Modify `reflection-3.ts` (near `:25`); source text from `:1140`–`:1143` (self-assessment) and `:1400`–`:1403` (judge). + +- [ ] **Step 1: Write failing test** — `test/supervisor.unit.test.ts`: +```ts +import assert from "node:assert" +import { describe, it } from "@jest/globals" +import { DEFAULT_RUBRIC, parseRubric } from "../reflection-3.ts" + +describe("rubric", () => { + it("DEFAULT_RUBRIC has both sections and the mined antipatterns", () => { + const r = parseRubric(DEFAULT_RUBRIC) + assert.ok(r.patterns.length > 0, "patterns section present") + assert.match(r.antipatterns, /PERMISSION-SEEKING/) + assert.match(r.antipatterns, /STOPPED-WITH-TODOS/) + assert.match(r.antipatterns, /FALSE-COMPLETE/) + }) +}) +``` +- [ ] **Step 2: Run, verify fail** — `npx jest test/supervisor.unit.test.ts -t rubric` → FAIL (`DEFAULT_RUBRIC`/`parseRubric` not exported). +- [ ] **Step 3: Implement** — add to `reflection-3.ts`: +```ts +export const DEFAULT_RUBRIC = `## Patterns + + +## Antipatterns +` + +export function parseRubric(md: string): { patterns: string; antipatterns: string } { + const section = (name: string) => { + const re = new RegExp(`##\\s+${name}\\s*\\n([\\s\\S]*?)(?=\\n##\\s|$)`, "i") + return (md.match(re)?.[1] ?? 
"").trim() + } + return { patterns: section("Patterns"), antipatterns: section("Antipatterns") } +} +``` +Copy the antipattern text **verbatim** from the two existing inline blocks (use the more complete `:1140` wording). +- [ ] **Step 4: Run, verify pass.** +- [ ] **Step 5: Commit** — `git commit -m "feat(supervisor): extract default rubric into embedded constant"` + +### Task 1.2: `loadRubric(directory)` with override precedence + +- [ ] **Step 1: Failing test:** +```ts +import { loadRubric } from "../reflection-3.ts" +import { mkdtempSync, writeFileSync, mkdirSync } from "node:fs" +import { tmpdir } from "node:os"; import { join } from "node:path" + +it("project .reflection/rubric.md overrides default", async () => { + const dir = mkdtempSync(join(tmpdir(), "rub-")) + mkdirSync(join(dir, ".reflection"), { recursive: true }) + writeFileSync(join(dir, ".reflection/rubric.md"), "## Patterns\nP\n## Antipatterns\nMY-RULE") + const r = await loadRubric(dir) + assert.strictEqual(r.source, "project") + assert.match(r.antipatterns, /MY-RULE/) +}) +it("falls back to default when no override / empty file", async () => { + const dir = mkdtempSync(join(tmpdir(), "rub-")) + const r = await loadRubric(dir) + assert.strictEqual(r.source, "default") + assert.match(r.antipatterns, /PERMISSION-SEEKING/) +}) +``` +- [ ] **Step 2: Run, verify fail.** +- [ ] **Step 3: Implement** `loadRubric(directory)`: try `${directory}/.reflection/rubric.md` (source `project`) → `~/.config/opencode/supervisor/rubric.md` (source `global`) → `DEFAULT_RUBRIC` (source `default`). `parseRubric` each; if either section empty, fall through to default. Return `{ patterns, antipatterns, source }`. +- [ ] **Step 4: Run, verify pass.** +- [ ] **Step 5: Commit.** + +### Task 1.3: Wire `loadRubric` into both prompt builders + +**Files:** `reflection-3.ts` `buildSelfAssessmentPrompt:1053`, `analyzeSelfAssessmentWithLLM:1350`, call site `runReflection:1717`. 
+ +- [ ] **Step 1: Failing test** — `buildSelfAssessmentPrompt` accepts a `rubric` arg and interpolates `rubric.antipatterns`: +```ts +const prompt = buildSelfAssessmentPrompt(ctx, "AGENTS", "last", 0, { patterns: "PP", antipatterns: "ZZ-RULE" }) +assert.match(prompt, /ZZ-RULE/) +``` +- [ ] **Step 2: Run, verify fail.** +- [ ] **Step 3: Implement** — add optional `rubric` param to both builders, replace the inline antipattern literals with `${rubric.antipatterns}` / `${rubric.patterns}`; default the param to `parseRubric(DEFAULT_RUBRIC)` for back-compat. In `runReflection`, call `const rubric = await loadRubric(directory)` once and thread it into both builders (and the judge path `:1717`). +- [ ] **Step 4: Run** full unit suite + `npm run typecheck`; verify pass. +- [ ] **Step 5: Commit** — `feat(supervisor): load rubric from file with default fallback` + +--- + +## Phase 2 — Configurable retry budget + +### Task 2.1: `DEFAULT_MAX_ATTEMPTS = 16` + cap resolver + +**Files:** `reflection-3.ts:25` (`MAX_ATTEMPTS`), `reflection.yaml` loader (`loadRoutingConfig:765` area). + +- [ ] **Step 1: Failing test:** +```ts +import { resolveMaxAttempts } from "../reflection-3.ts" +it("session override > config > default 16", () => { + assert.strictEqual(resolveMaxAttempts({ sessionOverride: 5, config: 30 }), 5) + assert.strictEqual(resolveMaxAttempts({ sessionOverride: undefined, config: 30 }), 30) + assert.strictEqual(resolveMaxAttempts({}), 16) +}) +it("clamps to 1..100", () => { + assert.strictEqual(resolveMaxAttempts({ sessionOverride: 0 }), 1) + assert.strictEqual(resolveMaxAttempts({ sessionOverride: 999 }), 100) +}) +``` +- [ ] **Step 2: Run, verify fail.** +- [ ] **Step 3: Implement** — rename const to `DEFAULT_MAX_ATTEMPTS = 16`; add `resolveMaxAttempts({sessionOverride?, config?})` clamped to `[1,100]`. Read `maxAttempts` from `reflection.yaml` in the config loader. 
+- [ ] **Step 4:** Replace the hardcoded `MAX_ATTEMPTS` use at `:1927`/`:1929`/`:1080` with an `effectiveMaxAttempts` resolved per session (computed in `runReflection`, passed where needed). +- [ ] **Step 5: Run, verify pass + typecheck. Commit** — `feat(supervisor): make retry budget configurable, default 16` + +--- + +## Phase 3 — `supervisorStore` (per-session state) + +### Task 3.1: Store round-trip + +**Files:** `reflection-3.ts`; state at `${directory}/.reflection/supervisor/.json`. + +- [ ] **Step 1: Failing test:** +```ts +import { supervisorStore } from "../reflection-3.ts" +it("saves and loads goal + retry, clears goal", async () => { + const dir = mkdtempSync(join(tmpdir(), "sup-")) + await supervisorStore.setRetry(dir, "s1", 12) + await supervisorStore.setGoal(dir, "s1", "tests pass") + let st = await supervisorStore.load(dir, "s1") + assert.strictEqual(st.maxAttempts, 12) + assert.strictEqual(st.goal?.status, "active") + await supervisorStore.clearGoal(dir, "s1") + st = await supervisorStore.load(dir, "s1") + assert.strictEqual(st.goal, undefined) + assert.strictEqual(st.maxAttempts, 12) // retry survives goal clear +}) +``` +- [ ] **Step 2: Run, verify fail.** +- [ ] **Step 3: Implement** `supervisorStore` object: `load`, `save`, `setGoal` (init `{condition,status:"active",attempts:0,tokenBaseline:0,startedAt:Date.now(),deadline:Date.now()+maxDurationMs,lastReason:""}`), `clearGoal`, `setRetry`, `list`. Files `0600`; corrupt JSON → `{}`. Mkdir `.reflection/supervisor` on save. +- [ ] **Step 4: Run, verify pass. Commit** — `feat(supervisor): per-session goal+retry store` + +--- + +## Phase 4 — `/supervisor:*` command capture (after Phase 0) + +### Task 4.1: Ship the commands + +- [ ] **Step 1:** Per Phase-0 finding, create `.opencode/command/supervisor/goal.md` and `retry.md` (or `opencode.json` entries). Template carries `$ARGUMENTS`; if capture path B, prefix a control marker (e.g. ``). +- [ ] **Step 2:** Document install in README. 
+- [ ] **Step 3: Commit** — `feat(supervisor): add /supervisor:goal and /supervisor:retry commands` + +### Task 4.2: Capture handler + +**Files:** `reflection-3.ts` `event` handler (`:1990`), parser `parseSupervisorCommand`. + +- [ ] **Step 1: Failing test** for the pure parser: +```ts +import { parseSupervisorCommand } from "../reflection-3.ts" +assert.deepStrictEqual(parseSupervisorCommand("goal", "tests pass"), { kind: "goal-set", condition: "tests pass" }) +assert.deepStrictEqual(parseSupervisorCommand("goal", ""), { kind: "goal-status" }) +assert.deepStrictEqual(parseSupervisorCommand("goal", "clear"), { kind: "goal-clear" }) +assert.deepStrictEqual(parseSupervisorCommand("retry", "12"), { kind: "retry-set", n: 12 }) +assert.deepStrictEqual(parseSupervisorCommand("retry", ""), { kind: "retry-status" }) +``` +Aliases for clear: `stop|off|reset|none|cancel`. +- [ ] **Step 2: Run, verify fail.** +- [ ] **Step 3: Implement** `parseSupervisorCommand(name, args)`; then in the `event` handler, on the captured command (path A: `command.executed`; path B: scan latest user message for the marker), call `supervisorStore.setGoal/clearGoal/setRetry` and `showToast` the status. Condition clamped to 4000 chars. +- [ ] **Step 4: Run, verify pass. Commit** — `feat(supervisor): capture /supervisor commands into store` + +--- + +## Phase 5 — Goal loop integration + +### Task 5.1: `buildGoalRequirementSection` + +- [ ] **Step 1: Failing test:** +```ts +import { buildGoalRequirementSection } from "../reflection-3.ts" +const s = buildGoalRequirementSection("all tests in test/auth pass") +assert.match(s, /MANDATORY/i) +assert.match(s, /all tests in test\/auth pass/) +assert.match(s, /evidence/i) // reinforces no-false-complete +``` +- [ ] **Step 2: Run, verify fail.** +- [ ] **Step 3: Implement** — returns a prompt fragment marking the condition as a mandatory completion requirement, restating that claims need transcript evidence. 
Appended after `rubric.antipatterns` in both builders when a goal is active. +- [ ] **Step 4: Run, verify pass. Commit.** + +### Task 5.2: Integrate into `runReflection` + +**Files:** `reflection-3.ts` `runReflection:1667`, budget gate before judge, completion + continuation at `:1925`–`:1976`. + +- [ ] **Step 1: Failing integration test** — `test/supervisor.integration.test.ts` with a mocked `client` (mirror `test/reflection.test.ts` mock): a session with an active goal whose judge verdict is `complete:false` triggers `client.session.promptAsync`; verdict `complete:true` sets goal `status:"achieved"` (assert via `supervisorStore.load`) and injects **no** continuation; `attempts >= effectiveMaxAttempts` sets `status:"exhausted"` and injects nothing. +- [ ] **Step 2: Run, verify fail.** +- [ ] **Step 3: Implement** in `runReflection`: + - load `supervisorState`; `effectiveMaxAttempts = resolveMaxAttempts({sessionOverride: state.maxAttempts, config})`. + - if `state.goal?.status === "active"`: **budget gate first** — if `goal.attempts >= effectiveMaxAttempts` || tokens/deadline exceeded → set `status:"exhausted"`, save, toast, `return`. + - thread `buildGoalRequirementSection(goal.condition)` into the prompt builders (Task 5.1). + - on `analysis.complete` with active goal → set `status:"achieved"`, save, `✓` toast, `return` (no continuation). + - on continuation, increment `goal.attempts` alongside the existing `attempts` map and persist; reuse existing `promptAsync` block. +- [ ] **Step 4: Run** integration + unit + `npm test` + typecheck; verify pass. +- [ ] **Step 5: Commit** — `feat(supervisor): goal loop — gates AND condition, auto-clear on achieve` + +### Task 5.3: Resume active + +- [ ] **Step 1: Failing test** — a persisted `active` goal loaded fresh stays `active` with `attempts` reset to 0 (unless `supervisorResumePaused`). +- [ ] **Step 2–4:** Implement reset-on-load (attempts/deadline/tokenBaseline) honoring `supervisorResumePaused` (default false). 
Run, verify, commit. + +--- + +## Phase 6 — Evals & docs + +### Task 6.1: Verification-theater fixtures +- [ ] Add promptfoo cases to `evals/` (or a new `evals/supervisor-goal.yaml`): (a) condition met **with** test evidence → judge `complete:true`; (b) bare "done" claim, no evidence → `complete:false`; (c) editing the `## Antipatterns` section of a fixture `rubric.md` flips the verdict. +- [ ] Run `npm run eval:judge` (or the new config); record pass rate in the issue. Commit. + +### Task 6.2: README +- [ ] Document `/supervisor:goal`, `/supervisor:retry`, rubric override (`rubric.md` resolution order), resume behavior, and the `anthropic`-provider recommendation for long unattended runs. Commit. + +--- + +## Self-review (spec coverage) + +- Feature 1 (configurable rubric) → Phase 1 ✓ (embedded default, file override, both builders). +- Feature 2 (retry 16 + `/supervisor:retry`) → Phase 2 + 4 ✓. +- Feature 3 (`/supervisor:goal`, gates AND condition, auto-clear, resume active) → Phases 3–5 ✓. +- Independent-evaluator reuse → Phase 5 uses the existing throwaway-session judge (no new model stack) ✓. +- C5 / verification-theater eval → Phase 6 ✓. +- Provider-safe continuation (user turn) → reuses existing `promptAsync` at `:1957` ✓. + +**Open items intentionally deferred to Phase 0 spike (not placeholders):** exact command namespacing + `command.executed` payload. All other steps are concrete and code-complete. 
diff --git a/docs/superpowers/specs/2026-06-03-supervisor-mode-design.md b/docs/superpowers/specs/2026-06-03-supervisor-mode-design.md new file mode 100644 index 0000000..2a9571d --- /dev/null +++ b/docs/superpowers/specs/2026-06-03-supervisor-mode-design.md @@ -0,0 +1,244 @@ +# Supervisor controls for `reflection-3` (OpenCode) + +**Date:** 2026-06-03 +**Status:** Design — awaiting review +**Target:** `dzianisv/opencode-plugins` → `reflection-3.ts` (published as `opencode-reflection`) + +## Context + +Claude Code's [`/goal`](https://code.claude.com/docs/en/goal) sets a session-scoped completion +condition. After every turn a small fast model judges the transcript ("is the condition met?"). If +not, it injects another turn with the reason as guidance; when met, the goal clears. Under the hood +it is a **session-scoped, prompt-based Stop hook** whose evaluator is a *fresh, independent model* +that judges only what the worker surfaced in the transcript. + +This repo's `reflection-3.ts` already implements that independent-evaluator loop (idle → judge → +re-prompt). The community OpenCode goal plugins (`willytop8/OpenCode-goal-plugin`, +`DraconDev/opencode-auto-continue`) do **not** — the former is sentinel-based and its README admits +"there is no independent evaluator," the latter is todo-driven. So we build on `reflection-3`. + +This spec adds a **supervisor control surface** over the always-on reflection engine, in three parts: + +1. **Configurable rubric** — move the judge's hardcoded patterns/antipatterns into user-editable files. +2. **`/supervisor:retry `** — make the retry budget configurable; raise the default from 3 to **16**. +3. **`/supervisor:goal …`** — a faithful, session-scoped `/goal`. + +"Reflection" = the always-on judge. "Supervisor" = the interactive control surface (rubric files + +commands) layered on it. 
+ +### What already exists in `reflection-3.ts` (reused, not rebuilt) + +| Capability | Location | +| --- | --- | +| `session.idle` → `runReflection(sessionId)` loop | `reflection-3.ts:2002`, `:1667` | +| Independent judge on a throwaway session (`create`→`promptAsync`→`waitForResponse`→`delete`) | `:1747`–`:1783`, `analyzeSelfAssessmentWithLLM` `:1350` | +| Continuation injection into the main session via `client.session.promptAsync` | `:1957` | +| 3-source prompt precedence: `.reflection.md` file > `toolReflectionPrompt` > default rubric | `resolveReflectionPromptPrecedence` `:213` | +| Runtime guidance setter tool (`set-reflection`), incl. clear/read + 4000-char note | `executeSetReflection` `:1595` | +| Workflow gate inference (`requiresTests/Build/PR/CI`) by task type & repo signals | `WorkflowRequirements` `:46`, `buildTaskContext` `:972` | +| Structured judge verdict `{complete, shouldContinue, reason, severity, …}` | `ReflectionAnalysis` `:98` | +| **Hardcoded** antipatterns, duplicated in two prompt builders | `buildSelfAssessmentPrompt:1140`, `analyzeSelfAssessmentWithLLM:1400` | +| `MAX_ATTEMPTS = 3` retry cap, referenced throughout | `:25` | +| Toasts, per-session disable, attempt tracking, debug log | `showToast:541`, `.reflection/disabled:1570` | + +## Goal / Non-goals + +**Goal:** A configurable supervisor over `reflection-3` — user-editable rubric, a configurable retry +budget (default 16), and a session-scoped `/supervisor:goal` whose completion requires the condition +**and** all applicable workflow gates. + +**Non-goals (YAGNI):** sentinels; a separate notification channel (Telegram/TTS already fire); +multiple simultaneous goals per session; a bespoke evaluator model stack (reuse the existing judge); +the Claude Code side (`claude/`). 
+ +--- + +## Feature 1 — Configurable rubric files + +### Problem +The premature-stop antipatterns (PERMISSION-SEEKING, STOPPED-WITH-TODOS, FALSE-COMPLETE) are +hardcoded as inline template strings in **two** builders (`:1140`, `:1400`) with slightly divergent +wording. Users cannot tune them, and the duplication risks drift. + +### Design +Extract the rubric into editable Markdown files with a single load path used by both builders. + +- **Single file, two sections.** One `rubric.md` with `## Patterns` (positive "what 'done' looks + like" criteria) and `## Antipatterns` (the mined premature-stop rules: PERMISSION-SEEKING, + STOPPED-WITH-TODOS, FALSE-COMPLETE). Managed/overridden as a unit. +- **Packaged default = embedded constant.** The default lives as a `DEFAULT_RUBRIC` string constant + (seeded verbatim from today's inline text) so behavior is preserved out of the box **and** the + single-file `cp reflection-3.ts` install path keeps working (no shipped rubric file to lose). + Externalizing also consolidates the two divergent inline copies into one source of truth. The + user-facing `rubric.md` is the *override*, not a shipped artifact. +- **Override resolution** (first found wins): + 1. project: `.reflection/rubric.md` + 2. global: `~/.config/opencode/supervisor/rubric.md` + 3. embedded `DEFAULT_RUBRIC` constant +- New loader `loadRubric(directory)` → `{ patterns: string, antipatterns: string, source }`, cached + per `runReflection` pass. It splits the file on the two `##` headings. Both + `buildSelfAssessmentPrompt` and `analyzeSelfAssessmentWithLLM` interpolate `rubric.patterns` / + `rubric.antipatterns` instead of inline literals. +- Malformed/empty override, or a missing section → log + fall back to the packaged default for the + whole file (never run with an empty rubric). + +This is file-based config only; no command is added for the rubric in v1 (a `/supervisor:rubric` +print/reset helper is a possible later add, deferred under YAGNI). 
+ +--- + +## Feature 2 — Configurable retry budget (`/supervisor:retry`) + +### Problem +`MAX_ATTEMPTS = 3` (`:25`) is too low for substantial autonomous work and is not configurable. + +### Design +- Rename the constant to `DEFAULT_MAX_ATTEMPTS` and **default it to 16**. +- Resolve the effective cap per session: `supervisorState.maxAttempts ?? configMaxAttempts ?? 16`, + where `configMaxAttempts` comes from `~/.config/opencode/reflection.yaml` (existing file, `:132`). +- **`/supervisor:retry <n>`** sets the per-session override (clamped to a sane range, e.g. 1–100); + `/supervisor:retry` with no arg reports the current effective value via toast. +- This single cap governs **all** reflection re-prompts (with or without a goal). The goal loop uses + the same budget — no second turn counter. `maxTokens` / `maxDurationMs` remain as optional + secondary safety caps (config-only) so a runaway loop still terminates on spend/time. + +--- + +## Feature 3 — `/supervisor:goal` (session-scoped goal) + +### What it adds over the existing `set-reflection` tool +1. A **user-typed command** (`/supervisor:goal …`) rather than an agent-only tool. +2. **Session-scoped persistence + resume** (vs the in-memory `toolReflectionPrompt` `let`). +3. **Budget**: reuses the Feature-2 retry budget + secondary token/time caps. +4. **Auto-clear on achieve + status**, with the condition enforced as a mandatory completion + requirement. + +### Completion semantics (decision: "goal AND all applicable gates") +One judge call: when a goal is active, the condition is injected into the prompt as a top-priority +*mandatory completion requirement*, on top of the default rubric (which already encodes the +applicable gates). The resulting `ReflectionAnalysis.complete` means "applicable gates pass **and** +the condition is met." Gates are the *applicable* ones the engine already infers — a docs-only goal +won't require a PR/CI.
If `complete` is false, the loop continues with the existing feedback +(`analysis.reason` + `analysis.missing`). + +### Precedence (additive, not replacement) +A goal does not swap out the judge prompt (that would drop the gates). While a goal is active the +prompt is `rubric (patterns + antipatterns) + buildGoalRequirementSection(condition)`, and the +`.reflection.md` / `toolReflectionPrompt` overrides are bypassed (the goal is the strongest expressed +intent and must compose with the gates). With no goal active, existing precedence is unchanged. + +### Command surface (mirrors CC) +- `/supervisor:goal <condition>` — set/replace the session goal and start working (≤ 4000 chars) +- `/supervisor:goal` — status toast: condition, status, attempts used / budget, last reason +- `/supervisor:goal clear` (aliases `stop`/`off`/`reset`/`none`/`cancel`) — clear the active goal + +--- + +## Components + +### A. `supervisorStore` (new, isolated) +Per-session state at `.reflection/supervisor/<sessionId>.json` (`0600`): + +```jsonc +{ + "maxAttempts": 16, // /supervisor:retry override (optional; else config/default) + "goal": { + "condition": "all tests in test/auth pass and lint is clean", + "status": "active", // active | paused | achieved | cleared | exhausted + "attempts": 0, // shared with the reflection retry counter + "tokenBaseline": 0, + "startedAt": 0, + "deadline": 0, + "lastReason": "" + } +} +``` +API: `load(sid)`, `save(sid, state)`, `setGoal(sid, condition)`, `clearGoal(sid)`, `setRetry(sid, n)`, `list()`. + +### B. Rubric loader (new) — Feature 1, as above. + +### C. `/supervisor:*` command capture (new) +Shipped as OpenCode commands under the `supervisor` namespace (mapping `supervisor:goal` / +`supervisor:retry` to OpenCode's command-namespacing convention — **to verify**: subdirectory +`.opencode/command/supervisor/{goal,retry}.md` vs a colon-named file vs `opencode.json` `command` +entries).
**Capture mechanism — spike first:** prefer the `command.executed` event if it carries the +command name + raw args (deterministic, no transcript parsing); fall back to a control-marker in the +command template that the plugin scans from the user message. + +### D. `buildGoalRequirementSection(condition)` (new) +Prompt fragment appended to the loaded `rubric` (patterns + antipatterns) when a goal is active; states the +condition as mandatory and reinforces the evidence rules. Feeds the **existing** throwaway-session +judge — one call, no new model stack, no separate `goalMet` field (folded into `complete`). + +### E. Loop integration in `runReflection` (modified) +1. Load `supervisorState`. Effective cap = `state.maxAttempts ?? config ?? 16`. +2. If a goal is `active`: **budget gate first** — if `attempts >= cap`, or `spend >= maxTokens`, or + `now >= deadline` → `status = "exhausted"`, persist, toast, **return without continuing**. +3. Build prompt (with goal-requirement section if a goal is active) → run independent judge. +4. **Complete** → if a goal is active, set `status = "achieved"`, persist, `✓` toast; stop. +5. **Not complete** → increment `attempts` (shared counter), persist, inject continuation via the + existing `client.session.promptAsync` (feedback = `reason` + `missing` + remaining budget). + +With no goal, behavior is the prior reflection flow but with the cap now defaulting to 16 and +honoring `/supervisor:retry`. + +### F. Resume behavior +A persisted goal restores **active** on resume (faithful to CC); `attempts`/`deadline`/`tokenBaseline` +reset (matches CC counter reset), so the loop re-enters on the next `session.idle`. A +`supervisorResumePaused` config flag (default false) can opt into restoring paused for the cautious. +`maxAttempts` overrides persist across resume. Achieved/cleared goals are not restored. + +### G. 
Config knobs +`~/.config/opencode/reflection.yaml` (+ env): `maxAttempts` (16), `goalMaxTokens` (400000), +`goalMaxDurationMs` (1800000), `supervisorResumePaused` (false), rubric override path (implicit via +the resolution order above). + +## Data flow + +``` +user: /supervisor:goal <condition> user: /supervisor:retry 16 + → capture → supervisorStore.setGoal → capture → supervisorStore.setRetry(sid,16) + → agent works ... session.idle + → runReflection(sid): + cap = state.maxAttempts ?? config ?? 16 + goal active & budget exceeded? → status "exhausted" + toast → STOP + else → rubric (patterns + antipatterns) [+ goal requirement] → independent judge + → ReflectionAnalysis.complete (gates [AND condition]) + complete? → (goal) achieved + ✓ toast → STOP + else → attempts++, persist → promptAsync(main session, reason+missing+budget) + → loop until complete or budget exhausted +``` + +## Error handling +- **Judge failure/timeout** (`JUDGE_RESPONSE_TIMEOUT`): never clear a goal and never inject on a + failed judge — log + skip this idle (fail safe; never falsely "achieve"). +- **Empty rubric / corrupt state file**: fall back to packaged default rubric / treat as no goal; log. +- **Continuation = a user turn** via `promptAsync` — provider-safe; avoids the `github-copilot` + prefill-400 and the empty-continuation race (opencode issue #15267). README recommends the + `anthropic` provider for long unattended runs. +- **Session deleted mid-loop** (existing guard `:1684`): clear in-memory tracking; leave files for resume. +- Concurrency: reuse the `activeReflections` guard `:1669`.
+ +## Testing & eval +- **Unit:** `supervisorStore` round-trip (goal + retry) and clear; rubric loader precedence + (project > global > packaged) and empty-file fallback; effective-cap resolution + (`/supervisor:retry` > config > 16); budget gate (attempts/tokens/deadline); + `buildGoalRequirementSection` composes onto the rubric and bypasses file/tool overrides while + active; idempotent continuation under `activeReflections`. +- **Behavioral / eval:** reuse the repo's promptfoo harness + labeled CC-stop dataset. Key criterion + is **C5 / verification-theater**: a bare "condition met" claim with no evidence (tests not run) + must yield `complete=false` and the goal must **not** clear. Fixtures: (a) condition met with + evidence → achieves; (b) bare claim → continues; (c) budget exhausted → status `exhausted`, no continuation; + (d) editing the `## Antipatterns` section of `rubric.md` changes the judge's verdict (proves the + rubric is live-configurable). +- **Integration:** existing `test/e2e.test.ts` style — set a goal, drive idle events, assert + continuation until complete, then auto-clear; `/supervisor:retry 1` caps the loop at one attempt. + +## Open items (resolve in implementation, not design) +1. OpenCode command namespacing for `supervisor:goal` / `supervisor:retry` (subdir vs colon vs config entry). +2. `command.executed` payload (args available?) vs control-marker fallback — spike first. +3. Whether per-goal `--max-turns`/`--max-tokens` flags are worth parsing in v1 or config-only. + +## Out of scope +Sentinels; multi-goal stacking; new notification channels; a separate evaluator model; a +`/supervisor:rubric` command (v2); the Claude Code `claude/` runtime.
From 948874dd87d0155fbbc03f0fb85fb9852e7d3874 Mon Sep 17 00:00:00 2001 From: Dzianis Vashchuk <2119348+dzianisv@users.noreply.github.com> Date: Wed, 3 Jun 2026 01:22:00 +0000 Subject: [PATCH 02/14] feat(supervisor): embedded default rubric + loadRubric override Extract the judge's positive completion rules (Patterns) and mined premature-stop rules (Antipatterns) into an embedded DEFAULT_RUBRIC, overridable via .reflection/rubric.md (project) or ~/.config/opencode/supervisor/rubric.md (global). Falls back to the default if an override is missing either section. Refs #143 Co-Authored-By: Claude Opus 4.8 --- reflection-3.ts | 71 ++++++++++++++++++++++++++++++++++++ test/supervisor.unit.test.ts | 46 +++++++++++++++++++++++ 2 files changed, 117 insertions(+) create mode 100644 test/supervisor.unit.test.ts diff --git a/reflection-3.ts b/reflection-3.ts index fe3acab..15bf502 100644 --- a/reflection-3.ts +++ b/reflection-3.ts @@ -24,6 +24,77 @@ const SELF_ASSESSMENT_MARKER = "## Reflection-3 Self-Assessment" const FEEDBACK_MARKER = "## Reflection-3:" const MAX_ATTEMPTS = 3 +// --------------------------------------------------------------------------- +// Supervisor rubric (configurable patterns/antipatterns) +// +// The judge's positive completion rules ("Patterns") and the mined +// premature-stop rules ("Antipatterns") were historically hardcoded inline in +// two prompt builders. They now live in this single embedded default, which +// users can override with a rubric.md file (see loadRubric). The embedded +// default keeps the single-file `cp reflection-3.ts` install working. +// --------------------------------------------------------------------------- + +export const DEFAULT_RUBRIC = `## Patterns +- If coding work is complete, confirm tests ran after the latest changes and passed. +- If local tests are required, provide the exact commands run in this session. +- If PR exists, verify CI checks and report status. 
+- Tests cannot be skipped or marked as flaky/not important. +- Direct pushes to main/master are not allowed; require a PR instead. +- If stuck, propose an alternate approach. +- If you need user action (auth, 2FA, credentials, access requests, uploads, approvals), list it in needs_user_action. +- PLANNING LOOP CHECK: If the task requires code changes (fix, implement, add, create, build, refactor, update) but the tool commands show ONLY read operations (read, glob, grep, git log, git status, git diff, webfetch, task/explore) and NO write operations (edit, write, bash with build/test/commit, github_create_pull_request, etc.), then the task is NOT complete. Set status to "in_progress", set stuck to true, and list "Implement the actual code changes" in remaining_work. Analyzing and recommending changes is not the same as making them. +- If you are repeating the same actions (deploy, test, build) without making progress, set "stuck": true. +- Do not retry the same failing approach more than twice — try something different or report stuck. + +## Antipatterns +PREMATURE-STOP ANTIPATTERNS (mined from 227 real agent stops where the user replied; 78% were premature — the user said "go"/"continue"/"yes do it" or corrected the agent). If the agent's last response matches one of these AND executable work remains, the task is NOT complete — set status "in_progress", and put the concrete next action in remaining_work and next_steps: +- PERMISSION-SEEKING (most common, ~40%): the response ends by asking to do work it can already do — "Want me to…?", "Would you like me to…?", "Should I…?", "Shall I proceed?", or "Try running it now"/"Please run X and confirm" (deferring a check it could run itself). DECISIVE TEST: if the final turn is a yes/no or "want me to X?" question AND X is something the agent can do with its own tools AND X carries no irreversible risk, the stop is premature — it should have just done X. 
Asking is only legitimate before a destructive/irreversible action (delete prod data, force-push, send an irreversible external message). +- STOPPED-WITH-TODOS (~30%): the response lists "Remaining Tasks"/"Next steps"/"Still TODO"/"What I did NOT do" or names a verify/run/check/create-PR step as "next" — then stops without doing it. Listing remaining work does not complete it; a self-contained named step must be DONE before stopping. Set status "in_progress" with that work in remaining_work. +- FALSE-COMPLETE: claims "done"/"complete"/"ready"/"all tasks complete" but the CORE requested action never happened, a required check was skipped, or there is no evidence. An empty/no-text response, or a response with no write/tool evidence on an action task, is NEVER complete. For an "add a " task, writing files is necessary but NOT sufficient — the new code must be WIRED IN (imported/registered/routed, not orphaned modules) AND verified (test/build/run); "ready to use" with no integration is incomplete (status "in_progress"). +- LEGITIMATE STOP (do NOT flag): genuine human-only block (OAuth consent, 2FA code, credential/API-key retrieval, captcha) → status "waiting_for_user" with the item in needs_user_action. Genuine completion WITH evidence (commands+output, tests passing, PR/CI verified) → status "complete"; do not invent missing work. +- SEVERITY/STUCK: a single recoverable technical snag mid-task (knows the fix) is not "stuck". But a policy/process violation — pushing to main when a PR was required, skipping mandated tests — is a real failure: status "in_progress" with the corrective action in remaining_work, never "complete".` + +export interface Rubric { + patterns: string + antipatterns: string +} + +/** Split a rubric markdown doc into its `## Patterns` and `## Antipatterns` sections. 
*/ +export function parseRubric(md: string): Rubric { + const section = (name: string): string => { + const re = new RegExp(`##\\s+${name}\\s*\\n([\\s\\S]*?)(?=\\n##\\s|$)`, "i") + return (md.match(re)?.[1] ?? "").trim() + } + return { patterns: section("Patterns"), antipatterns: section("Antipatterns") } +} + +/** + * Load the active rubric for a project. Precedence (first complete one wins): + * 1. project: <directory>/.reflection/rubric.md → source "project" + * 2. global: ~/.config/opencode/supervisor/rubric.md → source "global" + * 3. embedded DEFAULT_RUBRIC → source "default" + * An override missing either section falls through to the next source, so the + * judge never runs with an empty rubric. + */ +export async function loadRubric( + directory: string +): Promise<Rubric & { source: "project" | "global" | "default" }> { + const candidates: Array<{ path: string; source: "project" | "global" }> = [ + { path: join(directory, ".reflection", "rubric.md"), source: "project" }, + { path: join(homedir(), ".config", "opencode", "supervisor", "rubric.md"), source: "global" }, + ] + for (const { path, source } of candidates) { + try { + const md = await readFile(path, "utf-8") + const r = parseRubric(md) + if (r.patterns && r.antipatterns) return { ...r, source } + } catch { + /* not present / unreadable — try next */ + } + } + return { ...parseRubric(DEFAULT_RUBRIC), source: "default" } +} + const JUDGE_BLOCKED_PATTERNS = [ /\bhaiku\b/i, /\bmini\b/i, diff --git a/test/supervisor.unit.test.ts b/test/supervisor.unit.test.ts new file mode 100644 index 0000000..4a740ce --- /dev/null +++ b/test/supervisor.unit.test.ts @@ -0,0 +1,46 @@ +import assert from "node:assert" +import { mkdtempSync, writeFileSync, mkdirSync } from "node:fs" +import { tmpdir } from "node:os" +import { join } from "node:path" +import { DEFAULT_RUBRIC, parseRubric, loadRubric } from "../reflection-3.ts" + +describe("supervisor: rubric", () => { + it("DEFAULT_RUBRIC has both sections and the mined antipatterns", () => { + const r = parseRubric(DEFAULT_RUBRIC) +
assert.ok(r.patterns.length > 0, "patterns section present") + assert.match(r.antipatterns, /PERMISSION-SEEKING/) + assert.match(r.antipatterns, /STOPPED-WITH-TODOS/) + assert.match(r.antipatterns, /FALSE-COMPLETE/) + }) + + it("parseRubric splits on the two headings", () => { + const r = parseRubric("## Patterns\nP-BODY\n## Antipatterns\nA-BODY") + assert.strictEqual(r.patterns, "P-BODY") + assert.strictEqual(r.antipatterns, "A-BODY") + }) + + it("project .reflection/rubric.md overrides default", async () => { + const dir = mkdtempSync(join(tmpdir(), "rub-")) + mkdirSync(join(dir, ".reflection"), { recursive: true }) + writeFileSync(join(dir, ".reflection/rubric.md"), "## Patterns\nP\n## Antipatterns\nMY-RULE") + const r = await loadRubric(dir) + assert.strictEqual(r.source, "project") + assert.match(r.antipatterns, /MY-RULE/) + }) + + it("falls back to default when no override exists", async () => { + const dir = mkdtempSync(join(tmpdir(), "rub-")) + const r = await loadRubric(dir) + assert.strictEqual(r.source, "default") + assert.match(r.antipatterns, /PERMISSION-SEEKING/) + }) + + it("falls back to default when override is missing a section", async () => { + const dir = mkdtempSync(join(tmpdir(), "rub-")) + mkdirSync(join(dir, ".reflection"), { recursive: true }) + writeFileSync(join(dir, ".reflection/rubric.md"), "## Patterns\nonly patterns, no antipatterns heading") + const r = await loadRubric(dir) + assert.strictEqual(r.source, "default") + assert.match(r.antipatterns, /FALSE-COMPLETE/) + }) +}) From c12b0ff3dbacca8d04540585cc1c9810018d5484 Mon Sep 17 00:00:00 2001 From: Dzianis Vashchuk <2119348+dzianisv@users.noreply.github.com> Date: Wed, 3 Jun 2026 03:07:52 +0000 Subject: [PATCH 03/14] feat(supervisor): load rubric from file in both prompt builders Refs #143 Co-Authored-By: Claude Opus 4.8 --- reflection-3.ts | 53 ++++++++++++------------------------ test/supervisor.unit.test.ts | 17 +++++++++++- 2 files changed, 33 insertions(+), 37 deletions(-) diff 
--git a/reflection-3.ts b/reflection-3.ts index 15bf502..0e909f7 100644 --- a/reflection-3.ts +++ b/reflection-3.ts @@ -1121,11 +1121,12 @@ function extractLastAssistantText(messages: any[]): string { return "" } -function buildSelfAssessmentPrompt( +export function buildSelfAssessmentPrompt( context: TaskContext, agents: string, lastAssistantText?: string, - attemptCount?: number + attemptCount?: number, + rubric?: Rubric ): string { const safeContext = { ...context, @@ -1146,6 +1147,7 @@ function buildSelfAssessmentPrompt( ? `\n## Agent's Last Response\n${lastAssistantText.slice(0, 4000)}\n` : "" + const rb = rubric ?? parseRubric(DEFAULT_RUBRIC) const currentAttempt = attemptCount || 0 const attemptSection = currentAttempt > 0 ? `\n## Reflection History\n- This is reflection attempt ${currentAttempt + 1}/${MAX_ATTEMPTS} for this task.\n- Previous reflections found the task incomplete.\n- If you are repeating the same actions without progress, set "stuck": true and explain what is blocking you.\n` @@ -1197,23 +1199,9 @@ ${agents ? `## Project Instructions\n${agents.slice(0, 800)}\n\n` : ""}Return JS } Rules: -- If coding work is complete, confirm tests ran after the latest changes and passed. -- If local tests are required, provide the exact commands run in this session. -- If PR exists, verify CI checks and report status. -- Tests cannot be skipped or marked as flaky/not important. -- Direct pushes to main/master are not allowed; require a PR instead. -- If stuck, propose an alternate approach. -- If you need user action (auth, 2FA, credentials, access requests, uploads, approvals), list it in needs_user_action. 
-- PLANNING LOOP CHECK: If the task requires code changes (fix, implement, add, create, build, refactor, update) but the "Tool Commands Run" section shows ONLY read operations (read, glob, grep, git log, git status, git diff, webfetch, task/explore) and NO write operations (edit, write, bash with build/test/commit, github_create_pull_request, etc.), then the task is NOT complete. Set status to "in_progress", set stuck to true, and list "Implement the actual code changes" in remaining_work. Analyzing and recommending changes is not the same as making them. -- If you are repeating the same actions (deploy, test, build) without making progress, set "stuck": true. -- Do not retry the same failing approach more than twice — try something different or report stuck. +${rb.patterns} -PREMATURE-STOP ANTIPATTERNS (mined from 227 real agent stops where the user replied; 78% were premature — the user said "go"/"continue"/"yes do it" or corrected the agent). If the agent's last response matches one of these AND executable work remains, the task is NOT complete — set status "in_progress", and put the concrete next action in remaining_work and next_steps: -- PERMISSION-SEEKING (most common, ~40%): the response ends by asking to do work it can already do — "Want me to…?", "Would you like me to…?", "Should I…?", "Shall I proceed?", or "Try running it now"/"Please run X and confirm" (deferring a check it could run itself). DECISIVE TEST: if the final turn is a yes/no or "want me to X?" question AND X is something the agent can do with its own tools AND X carries no irreversible risk, the stop is premature — it should have just done X. Asking is only legitimate before a destructive/irreversible action (delete prod data, force-push, send an irreversible external message). -- STOPPED-WITH-TODOS (~30%): the response lists "Remaining Tasks"/"Next steps"/"Still TODO"/"What I did NOT do" or names a verify/run/check/create-PR step as "next" — then stops without doing it. 
Listing remaining work does not complete it; a self-contained named step must be DONE before stopping. Set status "in_progress" with that work in remaining_work. -- FALSE-COMPLETE: claims "done"/"complete"/"ready"/"all tasks complete" but the CORE requested action never happened, a required check was skipped, or there is no evidence. An empty/no-text response, or a response with no write/tool evidence on an action task, is NEVER complete. For an "add a " task, writing files is necessary but NOT sufficient — the new code must be WIRED IN (imported/registered/routed, not orphaned modules) AND verified (test/build/run); "ready to use" with no integration is incomplete (status "in_progress"). -- LEGITIMATE STOP (do NOT flag): genuine human-only block (OAuth consent, 2FA code, credential/API-key retrieval, captcha) → status "waiting_for_user" with the item in needs_user_action. Genuine completion WITH evidence (commands+output, tests passing, PR/CI verified) → status "complete"; do not invent missing work. -- SEVERITY/STUCK: a single recoverable technical snag mid-task (knows the fix) is not "stuck". But a policy/process violation — pushing to main when a PR was required, skipping mandated tests — is a real failure: status "in_progress" with the corrective action in remaining_work, never "complete".` +${rb.antipatterns}` } function parseSelfAssessmentJson(text: string | null | undefined): SelfAssessment | null { @@ -1424,8 +1412,10 @@ async function analyzeSelfAssessmentWithLLM( context: TaskContext, selfAssessment: string, judgeSessionIds: Set, - toolReflectionPrompt?: string | null + toolReflectionPrompt?: string | null, + rubric?: Rubric ): Promise { + const rb = rubric ?? 
parseRubric(DEFAULT_RUBRIC) const modelList = await loadReflectionModelList() const preferredModel = await loadPreferredModelSpec(directory) const attempts = modelList.length @@ -1459,21 +1449,9 @@ ${selfAssessment.slice(0, 4000)} ${buildToolReflectionGuidanceSection(toolReflectionPrompt || null)} Rules: -- If tests are required, agent must confirm tests ran AFTER latest changes and passed. -- If local test commands are required, agent must list the exact commands run in this session. -- If tests were skipped/flaky/not important, task is incomplete. -- Direct pushes to main/master are not allowed; require PR instead. -- If PR required, agent must provide PR link. -- If PR exists, CI checks must be verified and passing. -- If user action is required (auth/2FA/credentials), set requires_human_action true. -- If agent is stuck, require alternate approach and continued work. -- PLANNING LOOP: If the task requires code changes (fix, implement, add, create, build, refactor) but the Tool Signals show ONLY read operations (read, glob, grep, git log/status/diff, webfetch) and NO write operations (edit, write, bash with build/test/commit, PR creation), set complete to false and add "Implement actual code changes" to missing. Analysis alone does not fulfill an implementation task. -- PREMATURE-STOP ANTIPATTERNS (78% of real agent stops were premature). Set complete false, requires_human_action false, and put the concrete work in next_actions when the agent's response matches: - - PERMISSION-SEEKING: ends asking to do work it can already do ("Want me to…?", "Should I…?", "Try running it now", "Please run X and confirm"). DECISIVE TEST: final-turn yes/no question about something the agent can do with its own tools and no irreversible risk = premature; it should have done it. Includes "finished step N, asking which sub-task to do next" when the task named the work. 
Asking is legitimate only before destructive/irreversible actions, or when the task explicitly scoped the deliverable to just the part already done. - - STOPPED-WITH-TODOS: lists "Remaining Tasks"/"Next steps"/"What I did NOT do" or names a verify/run/check step as next, then stops. Listing ≠ doing. - - FALSE-COMPLETE: claims done/ready/"all tasks complete" but the core action never happened, a required check was skipped, or no evidence. Empty/no-tool response on an action task is never complete. For an "add a " task, written files alone are not enough — code must be wired in (imported/registered/routed) AND verified; "ready to use" with no integration is incomplete. -- LEGITIMATE STOP (do NOT penalize): genuine human-only block (OAuth consent, 2FA, credential/API-key retrieval, captcha) → complete false, requires_human_action true. Genuine completion WITH evidence → complete true; do not invent missing work. -- SEVERITY: a single recoverable technical snag mid-task is LOW/MEDIUM; a repeated retry loop, broken functionality, or red CI is HIGH; a policy/process violation (push to main when a PR was required, skipping mandated tests) is HIGH; a confirmed security/auth/data-loss/prod defect is BLOCKER. 
+${rb.patterns} + +${rb.antipatterns} Return JSON only: { @@ -1784,12 +1762,14 @@ export const Reflection3Plugin: Plugin = async ({ client, directory }) => { const lastAssistantText = extractLastAssistantText(messages) const customPrompt = await loadReflectionPrompt(directory) const agents = await getAgentsFile(directory) + const rubric = await loadRubric(directory) const currentAttemptCount = attempts.get(attemptKey) || 0 const defaultReflectionPrompt = buildSelfAssessmentPrompt( context, agents, lastAssistantText, - currentAttemptCount + currentAttemptCount, + rubric ) const resolvedPrompt = resolveReflectionPromptPrecedence(customPrompt, toolReflectionPrompt, defaultReflectionPrompt) const reflectionPrompt = resolvedPrompt.prompt @@ -1900,7 +1880,8 @@ export const Reflection3Plugin: Plugin = async ({ client, directory }) => { context, selfAssessment, judgeSessionIds, - effectiveToolReflectionPrompt + effectiveToolReflectionPrompt, + rubric ) } diff --git a/test/supervisor.unit.test.ts b/test/supervisor.unit.test.ts index 4a740ce..7f0956a 100644 --- a/test/supervisor.unit.test.ts +++ b/test/supervisor.unit.test.ts @@ -2,7 +2,7 @@ import assert from "node:assert" import { mkdtempSync, writeFileSync, mkdirSync } from "node:fs" import { tmpdir } from "node:os" import { join } from "node:path" -import { DEFAULT_RUBRIC, parseRubric, loadRubric } from "../reflection-3.ts" +import { DEFAULT_RUBRIC, parseRubric, loadRubric, buildSelfAssessmentPrompt } from "../reflection-3.ts" describe("supervisor: rubric", () => { it("DEFAULT_RUBRIC has both sections and the mined antipatterns", () => { @@ -44,3 +44,18 @@ describe("supervisor: rubric", () => { assert.match(r.antipatterns, /FALSE-COMPLETE/) }) }) + +describe("supervisor: buildSelfAssessmentPrompt rubric interpolation", () => { + it("buildSelfAssessmentPrompt interpolates the provided rubric antipatterns", () => { + const ctx = { + taskSummary: "x", taskType: "coding", agentMode: "build", + requiresTests: false, 
requiresBuild: false, requiresPR: false, requiresCI: false, + requiresLocalTests: false, requiresLocalTestsEvidence: false, + humanMessages: [], toolsSummary: "none", detectedSignals: [], recentCommands: [], + pushedToDefaultBranch: false, + } as any + const prompt = buildSelfAssessmentPrompt(ctx, "AGENTS", "last response", 0, { patterns: "PP-RULE", antipatterns: "ZZ-RULE" }) + assert.match(prompt, /ZZ-RULE/) + assert.match(prompt, /PP-RULE/) + }) +}) From f375eb5efbbcc0be652d4c5c6b75b07f7d0953c7 Mon Sep 17 00:00:00 2001 From: Dzianis Vashchuk <2119348+dzianisv@users.noreply.github.com> Date: Wed, 3 Jun 2026 03:19:34 +0000 Subject: [PATCH 04/14] test(supervisor): de-dupe buildSelfAssessmentPrompt helper + cover judge-prompt rubric Refs #143 Co-Authored-By: Claude Opus 4.8 --- reflection-3.test-helpers.ts | 78 +--------------------------------- reflection-3.ts | 37 ++++++++++------ test/reflection-3.unit.test.ts | 2 +- test/supervisor.unit.test.ts | 28 +++++++++++- 4 files changed, 52 insertions(+), 93 deletions(-) diff --git a/reflection-3.test-helpers.ts b/reflection-3.test-helpers.ts index c54547f..6bee44c 100644 --- a/reflection-3.test-helpers.ts +++ b/reflection-3.test-helpers.ts @@ -87,83 +87,7 @@ export function inferTaskType(text: string): TaskType { return "other" } -export function buildSelfAssessmentPrompt(context: TaskContext, agents: string, lastAssistantText?: string, attemptCount?: number): string { - const safeContext = { - ...context, - detectedSignals: Array.isArray(context.detectedSignals) ? 
context.detectedSignals : [] - } - const requirements: string[] = [] - if (safeContext.requiresTests) requirements.push("Tests required (run after latest changes)") - if (safeContext.requiresBuild) requirements.push("Build/compile required") - if (safeContext.requiresPR) requirements.push("PR required (include link)") - if (safeContext.requiresCI) requirements.push("CI checks required (verify status)") - if (safeContext.requiresLocalTests) requirements.push("Local tests required (must run in this session)") - if (safeContext.pushedToDefaultBranch) requirements.push("Detected direct push to default branch (must be avoided)") - if (requirements.length === 0) requirements.push("No explicit workflow gates detected") - - const signalSummary = safeContext.detectedSignals.length ? safeContext.detectedSignals.join(", ") : "none" - - const assistantSection = lastAssistantText - ? `\n## Agent's Last Response\n${lastAssistantText.slice(0, 4000)}\n` - : "" - - const currentAttempt = attemptCount || 0 - const attemptSection = currentAttempt > 0 - ? `\n## Reflection History\n- This is reflection attempt ${currentAttempt + 1}/${MAX_ATTEMPTS} for this task.\n- Previous reflections found the task incomplete.\n- If you are repeating the same actions without progress, set "stuck": true and explain what is blocking you.\n` - : "" - - return `SELF-ASSESS REFLECTION-3 - -You are evaluating an agent's work against workflow requirements. -Analyze the task context, the agent's last response, and the tool signals to determine whether the task is complete. - -## Task Context -- Summary: ${safeContext.taskSummary} -- Type: ${safeContext.taskType} -- Mode: ${safeContext.agentMode} -- Required checks: ${requirements.join("; ")} -- Detected signals: ${signalSummary} - -## Tool Commands Run -${safeContext.toolsSummary} -${assistantSection}${attemptSection} -${agents ? 
`## Project Instructions\n${agents.slice(0, 800)}\n\n` : ""}Return JSON only: -{ - "task_summary": "...", - "task_type": "feature|bugfix|refactor|docs|research|ops|other", - "status": "complete|in_progress|blocked|stuck|waiting_for_user", - "confidence": 0.0, - "evidence": { - "tests": { "ran": true/false, "results": "pass|fail|unknown", "ran_after_changes": true/false, "commands": ["..."] }, - "build": { "ran": true/false, "results": "pass|fail|unknown" }, - "pr": { "created": true/false, "url": "", "ci_status": "pass|fail|unknown", "checked": true/false } - }, - "remaining_work": ["..."], - "next_steps": ["..."], - "needs_user_action": ["..."], - "stuck": true/false, - "alternate_approach": "" -} - -Rules: -- If coding work is complete, confirm tests ran after the latest changes and passed. -- If local tests are required, provide the exact commands run in this session. -- If PR exists, verify CI checks and report status. -- If tests were skipped or marked flaky/not important, the task is incomplete. -- Direct pushes to main/master are not allowed; require a PR instead. -- Provide a PR URL and CI status when a PR is required. -- If stuck, propose an alternate approach. -- If you need user action (auth, 2FA, credentials), list it in needs_user_action. -- If you are repeating the same actions (deploy, test, build) without making progress, set "stuck": true. -- Do not retry the same failing approach more than twice — try something different or report stuck. - -PREMATURE-STOP ANTIPATTERNS (mined from 227 real agent stops where the user replied; 78% were premature — the user said "go"/"continue"/"yes do it" or corrected the agent). 
If the agent's last response matches one of these AND executable work remains, the task is NOT complete — set status "in_progress", and put the concrete next action in remaining_work and next_steps: -- PERMISSION-SEEKING (most common, ~40%): the response ends by asking to do work it can already do — "Want me to…?", "Would you like me to…?", "Should I…?", "Shall I proceed?", or "Try running it now"/"Please run X and confirm" (deferring a check it could run itself). DECISIVE TEST: if the final turn is a yes/no or "want me to X?" question AND X is something the agent can do with its own tools AND X carries no irreversible risk, the stop is premature — it should have just done X. Asking is only legitimate before a destructive/irreversible action (delete prod data, force-push, send an irreversible external message). -- STOPPED-WITH-TODOS (~30%): the response lists "Remaining Tasks"/"Next steps"/"Still TODO"/"What I did NOT do" or names a verify/run/check/create-PR step as "next" — then stops without doing it. Listing remaining work does not complete it; a self-contained named step must be DONE before stopping. Set status "in_progress" with that work in remaining_work. -- FALSE-COMPLETE: claims "done"/"complete"/"ready"/"all tasks complete" but the CORE requested action never happened, a required check was skipped, or there is no evidence. An empty/no-text response, or a response with no write/tool evidence on an action task, is NEVER complete. For an "add a " task, writing files is necessary but NOT sufficient — the new code must be WIRED IN (imported/registered/routed, not orphaned modules) AND verified (test/build/run); "ready to use" with no integration is incomplete (status "in_progress"). -- LEGITIMATE STOP (do NOT flag): genuine human-only block (OAuth consent, 2FA code, credential/API-key retrieval, captcha) → status "waiting_for_user" with the item in needs_user_action. 
Genuine completion WITH evidence (commands+output, tests passing, PR/CI verified) → status "complete"; do not invent missing work. -- SEVERITY/STUCK: a single recoverable technical snag mid-task (knows the fix) is not "stuck". But a policy/process violation — pushing to main when a PR was required, skipping mandated tests — is a real failure: status "in_progress" with the corrective action in remaining_work, never "complete".` -} +export { buildSelfAssessmentPrompt } from "./reflection-3.ts" export function buildToolReflectionGuidanceSection(toolReflectionPrompt: string | null): string { if (!toolReflectionPrompt) return "" diff --git a/reflection-3.ts b/reflection-3.ts index 0e909f7..87e5316 100644 --- a/reflection-3.ts +++ b/reflection-3.ts @@ -1406,25 +1406,14 @@ function evaluateSelfAssessment(assessment: SelfAssessment, context: TaskContext } } -async function analyzeSelfAssessmentWithLLM( - client: any, - directory: string, +export function buildJudgePrompt( context: TaskContext, selfAssessment: string, - judgeSessionIds: Set, toolReflectionPrompt?: string | null, rubric?: Rubric -): Promise { +): string { const rb = rubric ?? parseRubric(DEFAULT_RUBRIC) - const modelList = await loadReflectionModelList() - const preferredModel = await loadPreferredModelSpec(directory) - const attempts = modelList.length - ? modelList - : preferredModel && !isBlockedJudgeModel(preferredModel) - ? [preferredModel] - : [""] - - const prompt = `ANALYZE REFLECTION-3 + return `ANALYZE REFLECTION-3 You are validating an agent's self-assessment against workflow requirements. 
@@ -1462,6 +1451,26 @@ Return JSON only: "next_actions": ["actions to take"], "requires_human_action": true/false }` +} + +async function analyzeSelfAssessmentWithLLM( + client: any, + directory: string, + context: TaskContext, + selfAssessment: string, + judgeSessionIds: Set, + toolReflectionPrompt?: string | null, + rubric?: Rubric +): Promise { + const modelList = await loadReflectionModelList() + const preferredModel = await loadPreferredModelSpec(directory) + const attempts = modelList.length + ? modelList + : preferredModel && !isBlockedJudgeModel(preferredModel) + ? [preferredModel] + : [""] + + const prompt = buildJudgePrompt(context, selfAssessment, toolReflectionPrompt, rubric) for (const modelSpec of attempts) { let judgeSession: any diff --git a/test/reflection-3.unit.test.ts b/test/reflection-3.unit.test.ts index d81d92b..b2c057b 100644 --- a/test/reflection-3.unit.test.ts +++ b/test/reflection-3.unit.test.ts @@ -56,7 +56,7 @@ describe("reflection-3 unit", () => { assert.ok(prompt.includes("Return JSON only")) assert.ok(prompt.includes("Local tests required")) assert.ok(prompt.includes("Direct pushes")) - assert.ok(prompt.includes("Provide a PR URL")) + assert.ok(prompt.includes("verify CI checks")) }) it("self-assessment prompt includes premature-stop antipatterns", () => { diff --git a/test/supervisor.unit.test.ts b/test/supervisor.unit.test.ts index 7f0956a..de872c8 100644 --- a/test/supervisor.unit.test.ts +++ b/test/supervisor.unit.test.ts @@ -2,7 +2,7 @@ import assert from "node:assert" import { mkdtempSync, writeFileSync, mkdirSync } from "node:fs" import { tmpdir } from "node:os" import { join } from "node:path" -import { DEFAULT_RUBRIC, parseRubric, loadRubric, buildSelfAssessmentPrompt } from "../reflection-3.ts" +import { DEFAULT_RUBRIC, parseRubric, loadRubric, buildSelfAssessmentPrompt, buildJudgePrompt } from "../reflection-3.ts" describe("supervisor: rubric", () => { it("DEFAULT_RUBRIC has both sections and the mined antipatterns", () => 
{ @@ -59,3 +59,29 @@ describe("supervisor: buildSelfAssessmentPrompt rubric interpolation", () => { assert.match(prompt, /PP-RULE/) }) }) + +describe("supervisor: buildJudgePrompt rubric interpolation", () => { + it("buildJudgePrompt interpolates a custom rubric's patterns and antipatterns", () => { + const ctx = { + taskSummary: "Fix the login bug", taskType: "coding", + requiresTests: true, requiresBuild: false, requiresPR: false, requiresCI: false, + requiresLocalTests: false, + toolsSummary: "npm test → pass", + } as any + const prompt = buildJudgePrompt(ctx, "assessment text", null, { patterns: "PP-RULE", antipatterns: "ZZ-RULE" }) + assert.match(prompt, /PP-RULE/, "custom patterns must appear in judge prompt") + assert.match(prompt, /ZZ-RULE/, "custom antipatterns must appear in judge prompt") + }) + + it("buildJudgePrompt uses DEFAULT_RUBRIC when no rubric is provided", () => { + const ctx = { + taskSummary: "Fix the login bug", taskType: "coding", + requiresTests: false, requiresBuild: false, requiresPR: false, requiresCI: false, + requiresLocalTests: false, + toolsSummary: "(none)", + } as any + const prompt = buildJudgePrompt(ctx, "assessment text") + assert.match(prompt, /PERMISSION-SEEKING/, "default rubric antipatterns must be present") + assert.match(prompt, /STOPPED-WITH-TODOS/, "default rubric antipatterns must be present") + }) +}) From 79985e0c889a18675382e584e1084bf8983a6119 Mon Sep 17 00:00:00 2001 From: Dzianis Vashchuk <2119348+dzianisv@users.noreply.github.com> Date: Wed, 3 Jun 2026 03:28:45 +0000 Subject: [PATCH 05/14] feat(supervisor): configurable retry budget, default 16 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Renames MAX_ATTEMPTS→DEFAULT_MAX_ATTEMPTS (3→16), exports pure resolveMaxAttempts() with sessionOverride>config>default priority and [1,100] clamping, and adds loadConfiguredMaxAttempts() to read maxAttempts: from ~/.config/opencode/reflection.yaml. 
runReflection now computes effectiveMaxAttempts at runtime. Tests and test-helpers updated to match new defaults (3/16, 2/16 display strings). Refs #143 Co-Authored-By: Claude Opus 4.8 --- reflection-3.test-helpers.ts | 6 ++--- reflection-3.ts | 48 +++++++++++++++++++++++++++++----- test/reflection-3.unit.test.ts | 8 +++--- test/supervisor.unit.test.ts | 18 ++++++++++++- 4 files changed, 66 insertions(+), 14 deletions(-) diff --git a/reflection-3.test-helpers.ts b/reflection-3.test-helpers.ts index 6bee44c..8234ccb 100644 --- a/reflection-3.test-helpers.ts +++ b/reflection-3.test-helpers.ts @@ -434,7 +434,7 @@ export function getGitHubCopilotModelForRouting(modelSpec: string | null | undef } const FEEDBACK_MARKER = "## Reflection-3:" -const MAX_ATTEMPTS = 3 +const DEFAULT_MAX_ATTEMPTS = 16 const ACTION_LOOP_MIN_COMMANDS = 4 const ACTION_LOOP_REPETITION_THRESHOLD = 0.6 @@ -525,7 +525,7 @@ Start coding NOW. No more planning.` } if (isActionLoop) { - return `${FEEDBACK_MARKER} STOP: Action Loop Detected (attempt ${attemptCount}/${MAX_ATTEMPTS}) + return `${FEEDBACK_MARKER} STOP: Action Loop Detected (attempt ${attemptCount}/${DEFAULT_MAX_ATTEMPTS}) You are repeating the same commands without making progress. Running the same deploy/test/build cycle again will produce the same result. @@ -555,7 +555,7 @@ Please address these issues and continue.` const missingBrief = missingItems.length ? 
`Still missing: ${missingItems.slice(0, 3).join(", ")}.` : "" - return `${FEEDBACK_MARKER} Final Attempt (${attemptCount}/${MAX_ATTEMPTS}) + return `${FEEDBACK_MARKER} Final Attempt (${attemptCount}/${DEFAULT_MAX_ATTEMPTS}) ${missingBrief} diff --git a/reflection-3.ts b/reflection-3.ts index 87e5316..38c53ce 100644 --- a/reflection-3.ts +++ b/reflection-3.ts @@ -22,7 +22,42 @@ async function reportError(err: unknown, context?: Record): Prom const SELF_ASSESSMENT_MARKER = "## Reflection-3 Self-Assessment" const FEEDBACK_MARKER = "## Reflection-3:" -const MAX_ATTEMPTS = 3 +const DEFAULT_MAX_ATTEMPTS = 16 + +/** + * Pure function: resolve the effective max attempts from opts (no I/O). + * Priority: sessionOverride > config > DEFAULT_MAX_ATTEMPTS (16). + * Result is clamped to [1, 100]. Non-finite values (NaN, Infinity) are ignored. + */ +export function resolveMaxAttempts(opts: { sessionOverride?: number; config?: number }): number { + const candidates = [opts.sessionOverride, opts.config, DEFAULT_MAX_ATTEMPTS] + for (const candidate of candidates) { + if (candidate !== undefined && Number.isFinite(candidate)) { + return Math.min(100, Math.max(1, Math.floor(candidate))) + } + } + return DEFAULT_MAX_ATTEMPTS +} + +/** + * Read the top-level `maxAttempts:` integer from ~/.config/opencode/reflection.yaml. + * Returns undefined if the file is missing, the key is absent, or the value is invalid. 
+ */ +async function loadConfiguredMaxAttempts(): Promise { + try { + const content = await readFile(REFLECTION_CONFIG_PATH, "utf-8") + for (const rawLine of content.split(/\r?\n/)) { + const line = rawLine.trim() + if (!line || line.startsWith("#")) continue + const match = line.match(/^maxAttempts\s*:\s*(\S+)/) + if (match) { + const value = parseInt(match[1], 10) + if (Number.isFinite(value)) return value + } + } + } catch {} + return undefined +} // --------------------------------------------------------------------------- // Supervisor rubric (configurable patterns/antipatterns) @@ -473,7 +508,7 @@ Start coding NOW. No more planning.` } if (isActionLoop) { - return `${FEEDBACK_MARKER} STOP: Action Loop Detected (attempt ${attemptCount}/${MAX_ATTEMPTS}) + return `${FEEDBACK_MARKER} STOP: Action Loop Detected (attempt ${attemptCount}/${DEFAULT_MAX_ATTEMPTS}) You are repeating the same commands without making progress. Running the same deploy/test/build cycle again will produce the same result. @@ -503,7 +538,7 @@ Please address these issues and continue.` const missingBrief = missingItems.length ? `Still missing: ${missingItems.slice(0, 3).join(", ")}.` : "" - return `${FEEDBACK_MARKER} Final Attempt (${attemptCount}/${MAX_ATTEMPTS}) + return `${FEEDBACK_MARKER} Final Attempt (${attemptCount}/${DEFAULT_MAX_ATTEMPTS}) ${missingBrief} @@ -1150,7 +1185,7 @@ export function buildSelfAssessmentPrompt( const rb = rubric ?? parseRubric(DEFAULT_RUBRIC) const currentAttempt = attemptCount || 0 const attemptSection = currentAttempt > 0 - ? `\n## Reflection History\n- This is reflection attempt ${currentAttempt + 1}/${MAX_ATTEMPTS} for this task.\n- Previous reflections found the task incomplete.\n- If you are repeating the same actions without progress, set "stuck": true and explain what is blocking you.\n` + ? 
`\n## Reflection History\n- This is reflection attempt ${currentAttempt + 1}/${DEFAULT_MAX_ATTEMPTS} for this task.\n- Previous reflections found the task incomplete.\n- If you are repeating the same actions without progress, set "stuck": true and explain what is blocking you.\n` : "" return `SELF-ASSESS REFLECTION-3 @@ -1983,11 +2018,12 @@ export const Reflection3Plugin: Plugin = async ({ client, directory }) => { return } + const effectiveMaxAttempts = resolveMaxAttempts({ sessionOverride: undefined, config: await loadConfiguredMaxAttempts() }) const nextAttemptCount = (attempts.get(attemptKey) || 0) + 1 attempts.set(attemptKey, nextAttemptCount) - if (nextAttemptCount >= MAX_ATTEMPTS) { + if (nextAttemptCount >= effectiveMaxAttempts) { lastReflectedMsgId.set(sessionId, lastUserMsgId) - await showToast(client, directory, `Max attempts (${MAX_ATTEMPTS}) reached`, "warning") + await showToast(client, directory, `Max attempts (${effectiveMaxAttempts}) reached`, "warning") debug("Max attempts reached for", sessionId.slice(0, 8)) return } diff --git a/test/reflection-3.unit.test.ts b/test/reflection-3.unit.test.ts index b2c057b..5bf1163 100644 --- a/test/reflection-3.unit.test.ts +++ b/test/reflection-3.unit.test.ts @@ -718,7 +718,7 @@ describe("buildEscalatingFeedback", () => { const verdict = { missing: ["Run tests", "Create PR", "Check CI", "Update docs"] } const result = buildEscalatingFeedback(3, "high", verdict, false) assert.ok(result.includes("Final Attempt")) - assert.ok(result.includes("3/3")) + assert.ok(result.includes("3/16")) // Should truncate to first 3 missing items assert.ok(result.includes("Run tests")) assert.ok(result.includes("Create PR")) @@ -753,7 +753,7 @@ describe("buildEscalatingFeedback", () => { it("action loop includes attempt count", () => { const result = buildEscalatingFeedback(2, "high", null, false, true) - assert.ok(result.includes("2/3")) + assert.ok(result.includes("2/16")) }) it("action loop ignores verdict content", () => { @@ 
-1097,14 +1097,14 @@ describe("buildSelfAssessmentPrompt attempt awareness", () => { it("includes reflection history on second attempt", () => { const result = buildSelfAssessmentPrompt(baseContext, "", undefined, 1) assert.ok(result.includes("## Reflection History")) - assert.ok(result.includes("reflection attempt 2/3")) + assert.ok(result.includes("reflection attempt 2/16")) assert.ok(result.includes("repeating the same actions")) assert.ok(result.includes('"stuck": true')) }) it("includes reflection history on third attempt", () => { const result = buildSelfAssessmentPrompt(baseContext, "", undefined, 2) - assert.ok(result.includes("reflection attempt 3/3")) + assert.ok(result.includes("reflection attempt 3/16")) }) it("includes loop-awareness rules", () => { diff --git a/test/supervisor.unit.test.ts b/test/supervisor.unit.test.ts index de872c8..6db972a 100644 --- a/test/supervisor.unit.test.ts +++ b/test/supervisor.unit.test.ts @@ -2,7 +2,23 @@ import assert from "node:assert" import { mkdtempSync, writeFileSync, mkdirSync } from "node:fs" import { tmpdir } from "node:os" import { join } from "node:path" -import { DEFAULT_RUBRIC, parseRubric, loadRubric, buildSelfAssessmentPrompt, buildJudgePrompt } from "../reflection-3.ts" +import { DEFAULT_RUBRIC, parseRubric, loadRubric, buildSelfAssessmentPrompt, buildJudgePrompt, resolveMaxAttempts } from "../reflection-3.ts" + +describe("supervisor: resolveMaxAttempts", () => { + it("session override > config > default 16", () => { + assert.strictEqual(resolveMaxAttempts({ sessionOverride: 5, config: 30 }), 5) + assert.strictEqual(resolveMaxAttempts({ sessionOverride: undefined, config: 30 }), 30) + assert.strictEqual(resolveMaxAttempts({}), 16) + }) + it("clamps to 1..100", () => { + assert.strictEqual(resolveMaxAttempts({ sessionOverride: 0 }), 1) + assert.strictEqual(resolveMaxAttempts({ sessionOverride: 999 }), 100) + assert.strictEqual(resolveMaxAttempts({ config: -4 }), 1) + }) + it("ignores NaN/non-finite and 
falls through", () => { + assert.strictEqual(resolveMaxAttempts({ sessionOverride: NaN, config: 20 }), 20) + }) +}) describe("supervisor: rubric", () => { it("DEFAULT_RUBRIC has both sections and the mined antipatterns", () => { From 28c9bbb44f41aa4ef075b2b73c0486d618997dc1 Mon Sep 17 00:00:00 2001 From: Dzianis Vashchuk <2119348+dzianisv@users.noreply.github.com> Date: Wed, 3 Jun 2026 03:40:36 +0000 Subject: [PATCH 06/14] fix(supervisor): show effective retry cap in prompts; strict maxAttempts parse Refs #143 Co-Authored-By: Claude Opus 4.8 --- reflection-3.test-helpers.ts | 77 +--------------------------------- reflection-3.ts | 31 +++++++++----- test/reflection-3.unit.test.ts | 25 +++++++++-- test/supervisor.unit.test.ts | 37 +++++++++++++++- 4 files changed, 80 insertions(+), 90 deletions(-) diff --git a/reflection-3.test-helpers.ts b/reflection-3.test-helpers.ts index 8234ccb..a5810ed 100644 --- a/reflection-3.test-helpers.ts +++ b/reflection-3.test-helpers.ts @@ -433,11 +433,11 @@ export function getGitHubCopilotModelForRouting(modelSpec: string | null | undef return null } -const FEEDBACK_MARKER = "## Reflection-3:" -const DEFAULT_MAX_ATTEMPTS = 16 const ACTION_LOOP_MIN_COMMANDS = 4 const ACTION_LOOP_REPETITION_THRESHOLD = 0.6 +export { buildEscalatingFeedback } from "./reflection-3.ts" + /** * Detects when the agent is repeating the same commands/actions without progress. * Unlike detectPlanningLoop (read-heavy without writes), this catches action loops @@ -496,79 +496,6 @@ export function detectActionLoop(messages: any[]): { return { detected, repeatedCommands, totalCommands: commands.length } } -export function buildEscalatingFeedback( - attemptCount: number, - severity: string, - verdict: { feedback?: string; missing?: string[]; next_actions?: string[] } | undefined | null, - isPlanningLoop: boolean, - isActionLoop?: boolean -): string { - const safeVerdict = verdict ?? {} - const missingItems = Array.isArray(safeVerdict.missing) ? 
safeVerdict.missing : [] - const nextActionItems = Array.isArray(safeVerdict.next_actions) ? safeVerdict.next_actions : [] - const feedbackStr = safeVerdict.feedback || "" - if (isPlanningLoop) { - return `${FEEDBACK_MARKER} STOP: Planning Loop Detected - -You have been reading files, checking git status, and creating todo lists without writing any code. - -DO NOT: -- Run git status or git log again -- Create another todo list -- Read more files "for context" -- Say "let me get right to work" without actually working - -DO NOW: -Pick the FIRST item from your existing todo list and implement it. Open a file with Edit or Write and make changes. If you don't know where to start, create the simplest possible file first. - -Start coding NOW. No more planning.` - } - - if (isActionLoop) { - return `${FEEDBACK_MARKER} STOP: Action Loop Detected (attempt ${attemptCount}/${DEFAULT_MAX_ATTEMPTS}) - -You are repeating the same commands without making progress. Running the same deploy/test/build cycle again will produce the same result. - -STOP and do ONE of these: -1. If the same test/eval keeps failing, analyze the failure output and fix the root cause before re-running. -2. If you cannot fix the root cause, explain what is blocking you and ask the user for help. -3. Try a completely different approach (e.g., test locally instead of via deployment). - -Do NOT re-run the same command hoping for a different result.` - } - - if (attemptCount <= 2) { - const missing = missingItems.length - ? `\n### Missing\n${missingItems.map((m) => `- ${m}`).join("\n")}` - : "" - const nextActions = nextActionItems.length - ? `\n### Next Actions\n${nextActionItems.map((a) => `- ${a}`).join("\n")}` - : "" - return `${FEEDBACK_MARKER} Task Incomplete (${severity}) -${feedbackStr} -${missing} -${nextActions} - -Please address these issues and continue.` - } - - const missingBrief = missingItems.length - ? 
`Still missing: ${missingItems.slice(0, 3).join(", ")}.` - : "" - return `${FEEDBACK_MARKER} Final Attempt (${attemptCount}/${DEFAULT_MAX_ATTEMPTS}) - -${missingBrief} - -You have been asked ${attemptCount} times to complete this task. This is your LAST chance before reflection stops. - -If you cannot complete the remaining items: -- Explain clearly what is blocking you -- Set needs_user_action if you need user help -- Try a different approach instead of repeating the same steps - -Do NOT re-read files or re-plan. Either implement the fix now or explain why you cannot.` -} - export function shouldApplyPlanningLoop(taskType: TaskType, loopDetected: boolean): boolean { if (!loopDetected) return false return taskType === "coding" diff --git a/reflection-3.ts b/reflection-3.ts index 38c53ce..2fd7f20 100644 --- a/reflection-3.ts +++ b/reflection-3.ts @@ -51,8 +51,11 @@ async function loadConfiguredMaxAttempts(): Promise { if (!line || line.startsWith("#")) continue const match = line.match(/^maxAttempts\s*:\s*(\S+)/) if (match) { - const value = parseInt(match[1], 10) - if (Number.isFinite(value)) return value + const raw = match[1].trim() + if (/^\d+$/.test(raw)) { + const value = parseInt(raw, 10) + if (Number.isFinite(value)) return value + } } } } catch {} @@ -484,12 +487,14 @@ export function buildEscalatingFeedback( severity: string, verdict: { feedback?: string; missing?: string[]; next_actions?: string[] } | undefined | null, isPlanningLoop: boolean, - isActionLoop?: boolean + isActionLoop?: boolean, + maxAttempts?: number ): string { const safeVerdict = verdict ?? {} const missingItems = Array.isArray(safeVerdict.missing) ? safeVerdict.missing : [] const nextActionItems = Array.isArray(safeVerdict.next_actions) ? safeVerdict.next_actions : [] const feedbackStr = safeVerdict.feedback || "" + const effectiveCap = maxAttempts ?? DEFAULT_MAX_ATTEMPTS if (isPlanningLoop) { return `${FEEDBACK_MARKER} STOP: Planning Loop Detected @@ -508,7 +513,7 @@ Start coding NOW. 
No more planning.` } if (isActionLoop) { - return `${FEEDBACK_MARKER} STOP: Action Loop Detected (attempt ${attemptCount}/${DEFAULT_MAX_ATTEMPTS}) + return `${FEEDBACK_MARKER} STOP: Action Loop Detected (attempt ${attemptCount}/${effectiveCap}) You are repeating the same commands without making progress. Running the same deploy/test/build cycle again will produce the same result. @@ -520,7 +525,7 @@ STOP and do ONE of these: Do NOT re-run the same command hoping for a different result.` } - if (attemptCount <= 2) { + if (attemptCount < effectiveCap - 1) { const missing = missingItems.length ? `\n### Missing\n${missingItems.map((m) => `- ${m}`).join("\n")}` : "" @@ -538,7 +543,7 @@ Please address these issues and continue.` const missingBrief = missingItems.length ? `Still missing: ${missingItems.slice(0, 3).join(", ")}.` : "" - return `${FEEDBACK_MARKER} Final Attempt (${attemptCount}/${DEFAULT_MAX_ATTEMPTS}) + return `${FEEDBACK_MARKER} Final Attempt (${attemptCount}/${effectiveCap}) ${missingBrief} @@ -1161,7 +1166,8 @@ export function buildSelfAssessmentPrompt( agents: string, lastAssistantText?: string, attemptCount?: number, - rubric?: Rubric + rubric?: Rubric, + maxAttempts?: number ): string { const safeContext = { ...context, @@ -1183,9 +1189,10 @@ export function buildSelfAssessmentPrompt( : "" const rb = rubric ?? parseRubric(DEFAULT_RUBRIC) + const effectiveCap = maxAttempts ?? DEFAULT_MAX_ATTEMPTS const currentAttempt = attemptCount || 0 const attemptSection = currentAttempt > 0 - ? `\n## Reflection History\n- This is reflection attempt ${currentAttempt + 1}/${DEFAULT_MAX_ATTEMPTS} for this task.\n- Previous reflections found the task incomplete.\n- If you are repeating the same actions without progress, set "stuck": true and explain what is blocking you.\n` + ? 
`\n## Reflection History\n- This is reflection attempt ${currentAttempt + 1}/${effectiveCap} for this task.\n- Previous reflections found the task incomplete.\n- If you are repeating the same actions without progress, set "stuck": true and explain what is blocking you.\n` : "" return `SELF-ASSESS REFLECTION-3 @@ -1808,12 +1815,14 @@ export const Reflection3Plugin: Plugin = async ({ client, directory }) => { const agents = await getAgentsFile(directory) const rubric = await loadRubric(directory) const currentAttemptCount = attempts.get(attemptKey) || 0 + const effectiveMaxAttempts = resolveMaxAttempts({ sessionOverride: undefined, config: await loadConfiguredMaxAttempts() }) const defaultReflectionPrompt = buildSelfAssessmentPrompt( context, agents, lastAssistantText, currentAttemptCount, - rubric + rubric, + effectiveMaxAttempts ) const resolvedPrompt = resolveReflectionPromptPrecedence(customPrompt, toolReflectionPrompt, defaultReflectionPrompt) const reflectionPrompt = resolvedPrompt.prompt @@ -2018,7 +2027,6 @@ export const Reflection3Plugin: Plugin = async ({ client, directory }) => { return } - const effectiveMaxAttempts = resolveMaxAttempts({ sessionOverride: undefined, config: await loadConfiguredMaxAttempts() }) const nextAttemptCount = (attempts.get(attemptKey) || 0) + 1 attempts.set(attemptKey, nextAttemptCount) if (nextAttemptCount >= effectiveMaxAttempts) { @@ -2040,7 +2048,8 @@ export const Reflection3Plugin: Plugin = async ({ client, directory }) => { next_actions: analysis.nextActions }, usePlanningLoopMessage, - actionLoopCheck.detected + actionLoopCheck.detected, + effectiveMaxAttempts ) // Apply task-based model routing to feedback injection diff --git a/test/reflection-3.unit.test.ts b/test/reflection-3.unit.test.ts index 5bf1163..efe94d7 100644 --- a/test/reflection-3.unit.test.ts +++ b/test/reflection-3.unit.test.ts @@ -714,11 +714,12 @@ describe("buildEscalatingFeedback", () => { assert.ok(!result.includes("Some feedback")) }) - it("escalates 
to final attempt message after attempt 2", () => { + it("escalates to final attempt message at the penultimate attempt relative to cap", () => { const verdict = { missing: ["Run tests", "Create PR", "Check CI", "Update docs"] } - const result = buildEscalatingFeedback(3, "high", verdict, false) + // With default cap (16), attempt 15 is the final attempt (15 >= 16-1) + const result = buildEscalatingFeedback(15, "high", verdict, false) assert.ok(result.includes("Final Attempt")) - assert.ok(result.includes("3/16")) + assert.ok(result.includes("15/16")) // Should truncate to first 3 missing items assert.ok(result.includes("Run tests")) assert.ok(result.includes("Create PR")) @@ -729,6 +730,24 @@ describe("buildEscalatingFeedback", () => { assert.ok(result.includes("needs_user_action")) }) + it("escalates to final attempt when custom cap makes attemptCount the penultimate", () => { + const verdict = { missing: ["Run tests", "Create PR", "Check CI", "Update docs"] } + // With cap=4, attempt 3 is final (3 >= 4-1) + const result = buildEscalatingFeedback(3, "high", verdict, false, false, 4) + assert.ok(result.includes("Final Attempt")) + assert.ok(result.includes("3/4")) + assert.ok(result.includes("Run tests")) + assert.ok(result.includes("LAST chance")) + }) + + it("stays in Task Incomplete tier when below penultimate attempt", () => { + const verdict = { missing: ["Run tests"] } + // With default cap (16), attempt 3 is still in early tier (3 < 15) + const result = buildEscalatingFeedback(3, "high", verdict, false) + assert.ok(result.includes("Task Incomplete")) + assert.ok(!result.includes("Final Attempt")) + }) + it("handles verdict with empty arrays", () => { const verdict = { feedback: "", missing: [], next_actions: [] } const result = buildEscalatingFeedback(1, "low", verdict, false) diff --git a/test/supervisor.unit.test.ts b/test/supervisor.unit.test.ts index 6db972a..b385f42 100644 --- a/test/supervisor.unit.test.ts +++ b/test/supervisor.unit.test.ts @@ -2,7 
+2,7 @@ import assert from "node:assert" import { mkdtempSync, writeFileSync, mkdirSync } from "node:fs" import { tmpdir } from "node:os" import { join } from "node:path" -import { DEFAULT_RUBRIC, parseRubric, loadRubric, buildSelfAssessmentPrompt, buildJudgePrompt, resolveMaxAttempts } from "../reflection-3.ts" +import { DEFAULT_RUBRIC, parseRubric, loadRubric, buildSelfAssessmentPrompt, buildJudgePrompt, resolveMaxAttempts, buildEscalatingFeedback } from "../reflection-3.ts" describe("supervisor: resolveMaxAttempts", () => { it("session override > config > default 16", () => { @@ -18,6 +18,12 @@ describe("supervisor: resolveMaxAttempts", () => { it("ignores NaN/non-finite and falls through", () => { assert.strictEqual(resolveMaxAttempts({ sessionOverride: NaN, config: 20 }), 20) }) + it("Infinity sessionOverride is non-finite so falls through to config", () => { + assert.strictEqual(resolveMaxAttempts({ sessionOverride: Infinity, config: 20 }), 20) + }) + it("float sessionOverride is floored and clamped", () => { + assert.strictEqual(resolveMaxAttempts({ sessionOverride: 1.9 }), 1) + }) }) describe("supervisor: rubric", () => { @@ -74,6 +80,35 @@ describe("supervisor: buildSelfAssessmentPrompt rubric interpolation", () => { assert.match(prompt, /ZZ-RULE/) assert.match(prompt, /PP-RULE/) }) + + it("buildSelfAssessmentPrompt uses maxAttempts as the denominator in attempt history", () => { + const ctx = { + taskSummary: "x", taskType: "coding", agentMode: "build", + requiresTests: false, requiresBuild: false, requiresPR: false, requiresCI: false, + requiresLocalTests: false, requiresLocalTestsEvidence: false, + humanMessages: [], toolsSummary: "none", detectedSignals: [], recentCommands: [], + pushedToDefaultBranch: false, + } as any + const prompt = buildSelfAssessmentPrompt(ctx, "", undefined, 1, undefined, 50) + assert.ok(prompt.includes("/50"), "should show /50 not /16") + assert.ok(!prompt.includes("/16"), "should NOT show the old default /16") + }) +}) + 
+describe("supervisor: buildEscalatingFeedback maxAttempts denominator", () => { + it("renders the passed maxAttempts as denominator in action loop message", () => { + const result = buildEscalatingFeedback(5, "high", null, false, true, 50) + assert.ok(result.includes("/50"), "should show /50") + assert.ok(!result.includes("/16"), "should NOT show old default /16") + }) + + it("renders the passed maxAttempts as denominator in final attempt message", () => { + // With maxAttempts=50, attempt 49 is final (49 >= 50-1) + const result = buildEscalatingFeedback(49, "high", { missing: ["Do X"] }, false, false, 50) + assert.ok(result.includes("/50"), "should show /50") + assert.ok(!result.includes("/16"), "should NOT show old default /16") + assert.ok(result.includes("Final Attempt")) + }) }) describe("supervisor: buildJudgePrompt rubric interpolation", () => { From adcb18a76fab456d9bde1d00c090f80edc95536b Mon Sep 17 00:00:00 2001 From: Dzianis Vashchuk <2119348+dzianisv@users.noreply.github.com> Date: Wed, 3 Jun 2026 05:00:13 +0000 Subject: [PATCH 07/14] feat(supervisor): per-session goal+retry store Refs #143 Co-Authored-By: Claude Opus 4.8 --- reflection-3.ts | 100 ++++++++++++++++++++++++++++++++++- test/supervisor.unit.test.ts | 41 +++++++++++++- 2 files changed, 139 insertions(+), 2 deletions(-) diff --git a/reflection-3.ts b/reflection-3.ts index 2fd7f20..d77bde5 100644 --- a/reflection-3.ts +++ b/reflection-3.ts @@ -7,7 +7,7 @@ */ import type { Plugin } from "@opencode-ai/plugin" -import { readFile, writeFile, mkdir, stat, appendFile } from "fs/promises" +import { readFile, writeFile, mkdir, stat, appendFile, readdir } from "fs/promises" import { join } from "path" import { homedir } from "os" @@ -62,6 +62,104 @@ async function loadConfiguredMaxAttempts(): Promise { return undefined } +// --------------------------------------------------------------------------- +// Supervisor per-session persistence (supervisorStore) +// +// Stores goal + retry state at 
/.reflection/supervisor/.json +// so supervisor decisions survive across tool invocations within a session. +// --------------------------------------------------------------------------- + +export interface SupervisorGoal { + condition: string + status: "active" | "paused" | "achieved" | "cleared" | "exhausted" + attempts: number + tokenBaseline: number + startedAt: number + deadline: number + lastReason: string +} + +export interface SupervisorState { + maxAttempts?: number // /supervisor:retry override (optional) + goal?: SupervisorGoal +} + +function supervisorDir(directory: string): string { + return join(directory, ".reflection", "supervisor") +} + +function supervisorPath(directory: string, sessionId: string): string { + return join(supervisorDir(directory), `${sessionId}.json`) +} + +export const supervisorStore = { + async load(directory: string, sessionId: string): Promise { + try { + const content = await readFile(supervisorPath(directory, sessionId), "utf-8") + return JSON.parse(content) as SupervisorState + } catch { + return {} + } + }, + + async save(directory: string, sessionId: string, state: SupervisorState): Promise { + const dir = supervisorDir(directory) + await mkdir(dir, { recursive: true }) + await writeFile(supervisorPath(directory, sessionId), JSON.stringify(state, null, 2), { mode: 0o600 }) + }, + + async setGoal( + directory: string, + sessionId: string, + condition: string, + opts?: { tokenBaseline?: number; now?: number; maxDurationMs?: number } + ): Promise { + const state = await supervisorStore.load(directory, sessionId) + const now = opts?.now ?? Date.now() + const maxDurationMs = opts?.maxDurationMs ?? 1800000 + state.goal = { + condition, + status: "active", + attempts: 0, + tokenBaseline: opts?.tokenBaseline ?? 
0, + startedAt: now, + deadline: now + maxDurationMs, + lastReason: "", + } + await supervisorStore.save(directory, sessionId, state) + return state + }, + + async clearGoal(directory: string, sessionId: string): Promise { + const state = await supervisorStore.load(directory, sessionId) + delete state.goal + await supervisorStore.save(directory, sessionId, state) + return state + }, + + async setRetry(directory: string, sessionId: string, n: number): Promise { + const state = await supervisorStore.load(directory, sessionId) + state.maxAttempts = n + await supervisorStore.save(directory, sessionId, state) + return state + }, + + async list(directory: string): Promise { + try { + const dir = supervisorDir(directory) + const entries = await readdir(dir) + // Build result in a local array to ensure consistent realm in ESM environments + const ids: string[] = [] + for (const name of entries) { + if (name.endsWith(".json")) ids.push(name.slice(0, -".json".length)) + } + return ids + } catch { + return [] + } + }, +} + // --------------------------------------------------------------------------- // Supervisor rubric (configurable patterns/antipatterns) // diff --git a/test/supervisor.unit.test.ts b/test/supervisor.unit.test.ts index b385f42..60d487a 100644 --- a/test/supervisor.unit.test.ts +++ b/test/supervisor.unit.test.ts @@ -2,7 +2,46 @@ import assert from "node:assert" import { mkdtempSync, writeFileSync, mkdirSync } from "node:fs" import { tmpdir } from "node:os" import { join } from "node:path" -import { DEFAULT_RUBRIC, parseRubric, loadRubric, buildSelfAssessmentPrompt, buildJudgePrompt, resolveMaxAttempts, buildEscalatingFeedback } from "../reflection-3.ts" +import { DEFAULT_RUBRIC, parseRubric, loadRubric, buildSelfAssessmentPrompt, buildJudgePrompt, resolveMaxAttempts, buildEscalatingFeedback, supervisorStore } from "../reflection-3.ts" + +describe("supervisorStore", () => { + it("saves and loads goal + retry, clears goal but keeps retry", async () => { + 
const dir = mkdtempSync(join(tmpdir(), "sup-")) + await supervisorStore.setRetry(dir, "s1", 12) + await supervisorStore.setGoal(dir, "s1", "tests pass", { now: 1000, maxDurationMs: 5000 }) + let st = await supervisorStore.load(dir, "s1") + assert.strictEqual(st.maxAttempts, 12) + assert.strictEqual(st.goal?.status, "active") + assert.strictEqual(st.goal?.condition, "tests pass") + assert.strictEqual(st.goal?.deadline, 6000) + await supervisorStore.clearGoal(dir, "s1") + st = await supervisorStore.load(dir, "s1") + assert.strictEqual(st.goal, undefined) + assert.strictEqual(st.maxAttempts, 12) // retry survives goal clear + }) + it("load returns {} for missing/corrupt files", async () => { + const dir = mkdtempSync(join(tmpdir(), "sup-")) + assert.deepStrictEqual(await supervisorStore.load(dir, "nope"), {}) + mkdirSync(join(dir, ".reflection", "supervisor"), { recursive: true }) + writeFileSync(join(dir, ".reflection", "supervisor", "bad.json"), "{not json") + assert.deepStrictEqual(await supervisorStore.load(dir, "bad"), {}) + }) + it("list returns session ids with state", async () => { + const dir = mkdtempSync(join(tmpdir(), "sup-")) + assert.deepStrictEqual(await supervisorStore.list(dir), []) + await supervisorStore.setRetry(dir, "alpha", 4) + await supervisorStore.setRetry(dir, "beta", 4) + const ids = (await supervisorStore.list(dir)).sort() + assert.deepStrictEqual(ids, ["alpha", "beta"]) + }) + it("writes files with 0600 perms", async () => { + const dir = mkdtempSync(join(tmpdir(), "sup-")) + await supervisorStore.setRetry(dir, "s1", 4) + const { statSync } = await import("node:fs") + const mode = statSync(join(dir, ".reflection", "supervisor", "s1.json")).mode & 0o777 + assert.strictEqual(mode, 0o600) + }) +}) describe("supervisor: resolveMaxAttempts", () => { it("session override > config > default 16", () => { From 1f6be47937fb7174405fdf1d31a6be6ad235742c Mon Sep 17 00:00:00 2001 From: Dzianis Vashchuk <2119348+dzianisv@users.noreply.github.com> Date: 
Wed, 3 Jun 2026 05:11:14 +0000 Subject: [PATCH 08/14] fix(supervisor): enforce 0600 on store update; guard sessionId path traversal Refs #143 Co-Authored-By: Claude Opus 4.8 --- reflection-3.ts | 23 +++++++++++----- test/supervisor.unit.test.ts | 51 +++++++++++++++++++++++++++++++++++- 2 files changed, 67 insertions(+), 7 deletions(-) diff --git a/reflection-3.ts b/reflection-3.ts index d77bde5..559b332 100644 --- a/reflection-3.ts +++ b/reflection-3.ts @@ -7,7 +7,7 @@ */ import type { Plugin } from "@opencode-ai/plugin" -import { readFile, writeFile, mkdir, stat, appendFile, readdir } from "fs/promises" +import { readFile, writeFile, mkdir, stat, appendFile, readdir, chmod } from "fs/promises" import { join } from "path" import { homedir } from "os" @@ -23,6 +23,7 @@ async function reportError(err: unknown, context?: Record): Prom const SELF_ASSESSMENT_MARKER = "## Reflection-3 Self-Assessment" const FEEDBACK_MARKER = "## Reflection-3:" const DEFAULT_MAX_ATTEMPTS = 16 +const DEFAULT_MAX_GOAL_DURATION_MS = 30 * 60 * 1000 /** * Pure function: resolve the effective max attempts from opts (no I/O). @@ -88,6 +89,13 @@ function supervisorDir(directory: string): string { return join(directory, ".reflection", "supervisor") } +/** Guard against path-traversal attacks via a malformed sessionId. 
*/ +function assertSafeSessionId(sessionId: string): void { + if (!sessionId || sessionId.includes("/") || sessionId.includes("\\") || sessionId.includes("..")) { + throw new Error(`Invalid sessionId: ${JSON.stringify(sessionId)}`) + } +} + function supervisorPath(directory: string, sessionId: string): string { return join(supervisorDir(directory), `${sessionId}.json`) } @@ -103,9 +111,12 @@ export const supervisorStore = { }, async save(directory: string, sessionId: string, state: SupervisorState): Promise { + assertSafeSessionId(sessionId) const dir = supervisorDir(directory) await mkdir(dir, { recursive: true }) - await writeFile(supervisorPath(directory, sessionId), JSON.stringify(state, null, 2), { mode: 0o600 }) + const filePath = supervisorPath(directory, sessionId) + await writeFile(filePath, JSON.stringify(state, null, 2), { mode: 0o600 }) + await chmod(filePath, 0o600) }, async setGoal( @@ -116,7 +127,7 @@ export const supervisorStore = { ): Promise { const state = await supervisorStore.load(directory, sessionId) const now = opts?.now ?? Date.now() - const maxDurationMs = opts?.maxDurationMs ?? 1800000 + const maxDurationMs = opts?.maxDurationMs ?? 
DEFAULT_MAX_GOAL_DURATION_MS state.goal = { condition, status: "active", @@ -147,11 +158,11 @@ export const supervisorStore = { async list(directory: string): Promise { try { const dir = supervisorDir(directory) - const entries = await readdir(dir) + const entries = await readdir(dir, { withFileTypes: true }) // Build result in a local array to ensure consistent realm in ESM environments const ids: string[] = [] - for (const name of entries) { - if (name.endsWith(".json")) ids.push(name.slice(0, -".json".length)) + for (const e of entries) { + if (e.isFile() && e.name.endsWith(".json")) ids.push(e.name.slice(0, -".json".length)) } return ids } catch { diff --git a/test/supervisor.unit.test.ts b/test/supervisor.unit.test.ts index 60d487a..59aacd2 100644 --- a/test/supervisor.unit.test.ts +++ b/test/supervisor.unit.test.ts @@ -1,5 +1,5 @@ import assert from "node:assert" -import { mkdtempSync, writeFileSync, mkdirSync } from "node:fs" +import { mkdtempSync, writeFileSync, mkdirSync, chmodSync, statSync } from "node:fs" import { tmpdir } from "node:os" import { join } from "node:path" import { DEFAULT_RUBRIC, parseRubric, loadRubric, buildSelfAssessmentPrompt, buildJudgePrompt, resolveMaxAttempts, buildEscalatingFeedback, supervisorStore } from "../reflection-3.ts" @@ -41,6 +41,55 @@ describe("supervisorStore", () => { const mode = statSync(join(dir, ".reflection", "supervisor", "s1.json")).mode & 0o777 assert.strictEqual(mode, 0o600) }) + + it("I-1: enforces 0600 on update (not just create)", async () => { + const dir = mkdtempSync(join(tmpdir(), "sup-")) + // Create the file initially + await supervisorStore.setRetry(dir, "s1", 4) + const filePath = join(dir, ".reflection", "supervisor", "s1.json") + // Widen perms to 0o644 to simulate an externally widened file + chmodSync(filePath, 0o644) + assert.strictEqual(statSync(filePath).mode & 0o777, 0o644, "test setup: perms widened") + // A subsequent save must restore 0600 + await supervisorStore.setRetry(dir, "s1", 7) 
+ const mode = statSync(filePath).mode & 0o777 + assert.strictEqual(mode, 0o600, "save must restore 0600 on existing file") + }) + + it("I-2: rejects sessionId with path-traversal characters on write paths", async () => { + const dir = mkdtempSync(join(tmpdir(), "sup-")) + await assert.rejects(() => supervisorStore.save(dir, "../evil", {}), /Invalid sessionId/) + await assert.rejects(() => supervisorStore.setRetry(dir, "a/b", 3), /Invalid sessionId/) + await assert.rejects(() => supervisorStore.setGoal(dir, "", "cond"), /Invalid sessionId/) + await assert.rejects(() => supervisorStore.clearGoal(dir, ".."), /Invalid sessionId/) + }) + + it("I-2: load still returns {} for a legitimate missing file (guard does not interfere)", async () => { + const dir = mkdtempSync(join(tmpdir(), "sup-")) + const result = await supervisorStore.load(dir, "missing-but-safe") + assert.deepStrictEqual(result, {}) + }) + + it("M-2: list() skips subdirectories, only counts .json files", async () => { + const dir = mkdtempSync(join(tmpdir(), "sup-")) + await supervisorStore.setRetry(dir, "alpha", 4) + // Create a subdirectory inside the supervisor dir — list() must skip it + const supDir = join(dir, ".reflection", "supervisor") + mkdirSync(join(supDir, "subdir.json"), { recursive: true }) + const ids = await supervisorStore.list(dir) + assert.deepStrictEqual(ids, ["alpha"], "subdirectory named *.json must not appear in list()") + }) + + it("M-4: setGoal over an existing goal replaces condition, preserves retry, resets attempts", async () => { + const dir = mkdtempSync(join(tmpdir(), "sup-")) + await supervisorStore.setRetry(dir, "s", 7) + await supervisorStore.setGoal(dir, "s", "first", { now: 1000 }) + await supervisorStore.setGoal(dir, "s", "second", { now: 2000 }) + const st = await supervisorStore.load(dir, "s") + assert.strictEqual(st.goal?.condition, "second", "goal should be replaced with second") + assert.strictEqual(st.maxAttempts, 7, "retry must survive goal replacement") + 
assert.strictEqual(st.goal?.attempts, 0, "attempts must reset on new goal") + }) }) describe("supervisor: resolveMaxAttempts", () => { From 447dd89fe2e26b0e00c27e560ddba36e4a9e6279 Mon Sep 17 00:00:00 2001 From: Dzianis Vashchuk <2119348+dzianisv@users.noreply.github.com> Date: Wed, 3 Jun 2026 05:17:05 +0000 Subject: [PATCH 09/14] feat(supervisor): parseSupervisorCommand parser Refs #143 Co-Authored-By: Claude Opus 4.8 --- reflection-3.ts | 41 ++++++++++++++++++++++++++++++++++++ test/supervisor.unit.test.ts | 30 +++++++++++++++++++++++++- 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/reflection-3.ts b/reflection-3.ts index 559b332..b0649f8 100644 --- a/reflection-3.ts +++ b/reflection-3.ts @@ -171,6 +171,47 @@ export const supervisorStore = { }, } +// --------------------------------------------------------------------------- +// parseSupervisorCommand — pure parser for /supervisor:* command names +// +// Maps a command name + its argument string to a discriminated action. +// No I/O. Safe to call from any context. +// --------------------------------------------------------------------------- + +export type SupervisorCommand = + | { kind: "goal-set"; condition: string } + | { kind: "goal-status" } + | { kind: "goal-clear" } + | { kind: "retry-set"; n: number } + | { kind: "retry-status" } + | { kind: "unknown"; name: string } + +const GOAL_CLEAR_ALIASES = new Set(["clear", "stop", "off", "reset", "none", "cancel"]) + +/** + * Parse a supervisor slash-command into a typed action. + * + * @param name - command name, e.g. 
"goal" or "retry" + * @param args - raw argument string (may have leading/trailing whitespace) + */ +export function parseSupervisorCommand(name: string, args: string): SupervisorCommand { + const trimmed = args.trim() + + if (name === "goal") { + if (trimmed === "") return { kind: "goal-status" } + if (GOAL_CLEAR_ALIASES.has(trimmed.toLowerCase())) return { kind: "goal-clear" } + return { kind: "goal-set", condition: trimmed.slice(0, 4000) } + } + + if (name === "retry") { + if (trimmed === "") return { kind: "retry-status" } + if (/^\d+$/.test(trimmed)) return { kind: "retry-set", n: parseInt(trimmed, 10) } + return { kind: "retry-status" } + } + + return { kind: "unknown", name } +} + // --------------------------------------------------------------------------- // Supervisor rubric (configurable patterns/antipatterns) // diff --git a/test/supervisor.unit.test.ts b/test/supervisor.unit.test.ts index 59aacd2..fd3f436 100644 --- a/test/supervisor.unit.test.ts +++ b/test/supervisor.unit.test.ts @@ -2,7 +2,7 @@ import assert from "node:assert" import { mkdtempSync, writeFileSync, mkdirSync, chmodSync, statSync } from "node:fs" import { tmpdir } from "node:os" import { join } from "node:path" -import { DEFAULT_RUBRIC, parseRubric, loadRubric, buildSelfAssessmentPrompt, buildJudgePrompt, resolveMaxAttempts, buildEscalatingFeedback, supervisorStore } from "../reflection-3.ts" +import { DEFAULT_RUBRIC, parseRubric, loadRubric, buildSelfAssessmentPrompt, buildJudgePrompt, resolveMaxAttempts, buildEscalatingFeedback, supervisorStore, parseSupervisorCommand } from "../reflection-3.ts" describe("supervisorStore", () => { it("saves and loads goal + retry, clears goal but keeps retry", async () => { @@ -224,3 +224,31 @@ describe("supervisor: buildJudgePrompt rubric interpolation", () => { assert.match(prompt, /STOPPED-WITH-TODOS/, "default rubric antipatterns must be present") }) }) + +describe("supervisor: parseSupervisorCommand", () => { + it("goal set/status/clear + 
aliases", () => { + assert.deepStrictEqual(parseSupervisorCommand("goal", "tests pass"), { kind: "goal-set", condition: "tests pass" }) + assert.deepStrictEqual(parseSupervisorCommand("goal", " "), { kind: "goal-status" }) + assert.deepStrictEqual(parseSupervisorCommand("goal", ""), { kind: "goal-status" }) + for (const a of ["clear","stop","off","reset","none","cancel","CLEAR"," Stop "]) { + assert.deepStrictEqual(parseSupervisorCommand("goal", a), { kind: "goal-clear" }) + } + assert.deepStrictEqual(parseSupervisorCommand("goal", " do the thing "), { kind: "goal-set", condition: "do the thing" }) + }) + it("caps condition at 4000 chars", () => { + const long = "x".repeat(5000) + const r = parseSupervisorCommand("goal", long) + assert.strictEqual(r.kind, "goal-set") + assert.strictEqual((r as any).condition.length, 4000) + }) + it("retry set/status + junk", () => { + assert.deepStrictEqual(parseSupervisorCommand("retry", "12"), { kind: "retry-set", n: 12 }) + assert.deepStrictEqual(parseSupervisorCommand("retry", " 7 "), { kind: "retry-set", n: 7 }) + assert.deepStrictEqual(parseSupervisorCommand("retry", ""), { kind: "retry-status" }) + assert.deepStrictEqual(parseSupervisorCommand("retry", "abc"), { kind: "retry-status" }) + assert.deepStrictEqual(parseSupervisorCommand("retry", "1.5"), { kind: "retry-status" }) + }) + it("unknown command name", () => { + assert.deepStrictEqual(parseSupervisorCommand("frobnicate", "x"), { kind: "unknown", name: "frobnicate" }) + }) +}) From 9afb0d7da62d40e876ec3dc226e27d0aaefe45ea Mon Sep 17 00:00:00 2001 From: Dzianis Vashchuk <2119348+dzianisv@users.noreply.github.com> Date: Wed, 3 Jun 2026 05:23:48 +0000 Subject: [PATCH 10/14] feat(supervisor): buildGoalRequirementSection Refs #143 Co-Authored-By: Claude Opus 4.8 --- reflection-3.ts | 31 +++++++++++++++++++++++++++++++ test/supervisor.unit.test.ts | 20 +++++++++++++++++++- 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/reflection-3.ts b/reflection-3.ts index 
b0649f8..c68ee0f 100644 --- a/reflection-3.ts +++ b/reflection-3.ts @@ -212,6 +212,37 @@ export function parseSupervisorCommand(name: string, args: string): SupervisorCo return { kind: "unknown", name } } +// --------------------------------------------------------------------------- +// buildGoalRequirementSection — prompt fragment for active session goals +// +// Appended to the judge/self-assessment rubric when a goal is active. Pure, +// no I/O. Returns an empty string for blank conditions (caller should skip +// appending in that case). +// --------------------------------------------------------------------------- + +/** + * Build a prompt fragment that tells the evaluator a user-set goal condition + * is a MANDATORY completion requirement, additional to the existing workflow + * gates. A claim that the condition is met must be backed by transcript + * evidence (commands + output / tests actually run). + * + * @param condition - the goal condition string (will be trimmed) + * @returns a labeled prompt block, or "" if condition is blank + */ +export function buildGoalRequirementSection(condition: string): string { + const trimmed = condition.trim() + if (!trimmed) return "" + return `## GOAL (mandatory completion requirement) + +MANDATORY: The following user-set goal condition MUST be demonstrably met before the task can be considered complete: + + "${trimmed}" + +This is an ADDITIONAL requirement on top of the existing workflow gates (tests, build, PR, CI checks) — it does not replace them. The task is NOT complete until this condition is satisfied AND all normal completion criteria are met. + +Evidence rule: a claim that this goal condition is met must be backed by evidence already surfaced in the transcript — commands run and their output, tests actually executed, artefacts produced. 
A bare assertion that the condition is satisfied does NOT count as evidence.` +} + // --------------------------------------------------------------------------- // Supervisor rubric (configurable patterns/antipatterns) // diff --git a/test/supervisor.unit.test.ts b/test/supervisor.unit.test.ts index fd3f436..7d8a195 100644 --- a/test/supervisor.unit.test.ts +++ b/test/supervisor.unit.test.ts @@ -2,7 +2,7 @@ import assert from "node:assert" import { mkdtempSync, writeFileSync, mkdirSync, chmodSync, statSync } from "node:fs" import { tmpdir } from "node:os" import { join } from "node:path" -import { DEFAULT_RUBRIC, parseRubric, loadRubric, buildSelfAssessmentPrompt, buildJudgePrompt, resolveMaxAttempts, buildEscalatingFeedback, supervisorStore, parseSupervisorCommand } from "../reflection-3.ts" +import { DEFAULT_RUBRIC, parseRubric, loadRubric, buildSelfAssessmentPrompt, buildJudgePrompt, resolveMaxAttempts, buildEscalatingFeedback, supervisorStore, parseSupervisorCommand, buildGoalRequirementSection } from "../reflection-3.ts" describe("supervisorStore", () => { it("saves and loads goal + retry, clears goal but keeps retry", async () => { @@ -252,3 +252,21 @@ describe("supervisor: parseSupervisorCommand", () => { assert.deepStrictEqual(parseSupervisorCommand("frobnicate", "x"), { kind: "unknown", name: "frobnicate" }) }) }) + +describe("supervisor: buildGoalRequirementSection", () => { + it("embeds the condition and a mandatory marker + evidence rule", () => { + const s = buildGoalRequirementSection("all tests in test/auth pass") + assert.match(s, /MANDATORY/) + assert.match(s, /all tests in test\/auth pass/) + assert.match(s, /evidence/i) + }) + it("trims the condition", () => { + const s = buildGoalRequirementSection(" do X ") + assert.match(s, /do X/) + assert.ok(!s.includes(" do X ")) + }) + it("returns empty string for blank condition", () => { + assert.strictEqual(buildGoalRequirementSection(" "), "") + assert.strictEqual(buildGoalRequirementSection(""), "") 
+ }) +}) From 6f8c5f3537af1bd2d8d0782c4c32c2a3f527ab17 Mon Sep 17 00:00:00 2001 From: Dzianis Vashchuk <2119348+dzianisv@users.noreply.github.com> Date: Wed, 3 Jun 2026 14:08:49 +0000 Subject: [PATCH 11/14] feat(supervisor): integrate goal loop into runReflection Goal-active sessions augment the judge rubric with a mandatory completion requirement (bypassing file/tool prompt precedence so it composes with the gates), and a pure decideGoalTransition drives budget-exhaustion / achieved / continue. Completion = applicable gates AND condition met; achieved goals auto-clear, exhausted goals pause without continuation. Refs #143 Co-Authored-By: Claude Opus 4.8 --- reflection-3.ts | 102 ++++++++++++++-- test/supervisor.integration.test.ts | 177 ++++++++++++++++++++++++++++ test/supervisor.unit.test.ts | 87 +++++++++++++- 3 files changed, 358 insertions(+), 8 deletions(-) create mode 100644 test/supervisor.integration.test.ts diff --git a/reflection-3.ts b/reflection-3.ts index c68ee0f..070ad15 100644 --- a/reflection-3.ts +++ b/reflection-3.ts @@ -243,6 +243,38 @@ This is an ADDITIONAL requirement on top of the existing workflow gates (tests, Evidence rule: a claim that this goal condition is met must be backed by evidence already surfaced in the transcript — commands run and their output, tests actually executed, artefacts produced. A bare assertion that the condition is satisfied does NOT count as evidence.` } +// --------------------------------------------------------------------------- +// decideGoalTransition — pure decision for an active session goal +// +// Given the current goal, the judge's verdict, and the budget, decide the next +// goal state. No I/O; returns a NEW goal object (never mutates the input). +// Applied in strict order: budget exhaustion FIRST (attempts cap or deadline), +// then achievement, then continuation (which increments attempts). 
+// --------------------------------------------------------------------------- + +export type GoalTransition = + | { action: "exhausted"; goal: SupervisorGoal } + | { action: "achieved"; goal: SupervisorGoal } + | { action: "continue"; goal: SupervisorGoal } + +export function decideGoalTransition(params: { + goal: SupervisorGoal + complete: boolean + now: number + maxAttempts: number + reason?: string +}): GoalTransition { + const { goal, complete, now, maxAttempts, reason } = params + const lastReason = reason ?? goal.lastReason + if (goal.attempts >= maxAttempts || now >= goal.deadline) { + return { action: "exhausted", goal: { ...goal, status: "exhausted", lastReason } } + } + if (complete) { + return { action: "achieved", goal: { ...goal, status: "achieved", lastReason } } + } + return { action: "continue", goal: { ...goal, status: "active", attempts: goal.attempts + 1, lastReason } } +} + // --------------------------------------------------------------------------- // Supervisor rubric (configurable patterns/antipatterns) // @@ -1996,21 +2028,43 @@ export const Reflection3Plugin: Plugin = async ({ client, directory }) => { const agents = await getAgentsFile(directory) const rubric = await loadRubric(directory) const currentAttemptCount = attempts.get(attemptKey) || 0 - const effectiveMaxAttempts = resolveMaxAttempts({ sessionOverride: undefined, config: await loadConfiguredMaxAttempts() }) + // Supervisor goal/retry state. A /supervisor:retry override applies even + // without a goal (sessionOverride). An active goal augments the judge + // rubric with a mandatory completion requirement and drives the goal loop. + const sup = await supervisorStore.load(directory, sessionId) + const goal = sup.goal && sup.goal.status === "active" ? 
sup.goal : null + const effectiveMaxAttempts = resolveMaxAttempts({ sessionOverride: sup.maxAttempts, config: await loadConfiguredMaxAttempts() }) + // When a goal is active, its condition is the strongest stated intent and + // MUST compose with the existing gates: build the DEFAULT rubric prompt + // with the goal requirement appended, bypassing file/tool precedence. + const effRubric: Rubric = goal + ? { ...rubric, antipatterns: `${rubric.antipatterns}\n\n${buildGoalRequirementSection(goal.condition)}` } + : rubric const defaultReflectionPrompt = buildSelfAssessmentPrompt( context, agents, lastAssistantText, currentAttemptCount, - rubric, + effRubric, effectiveMaxAttempts ) - const resolvedPrompt = resolveReflectionPromptPrecedence(customPrompt, toolReflectionPrompt, defaultReflectionPrompt) - const reflectionPrompt = resolvedPrompt.prompt - const effectiveToolReflectionPrompt = resolvedPrompt.effectiveToolReflectionPrompt + let reflectionPrompt: string + let effectiveToolReflectionPrompt: string | null + let promptSource: "file" | "tool" | "default" | "goal" + if (goal) { + // Goal active: use the default (goal-augmented) prompt directly. + reflectionPrompt = defaultReflectionPrompt + effectiveToolReflectionPrompt = null + promptSource = "goal" + } else { + const resolvedPrompt = resolveReflectionPromptPrecedence(customPrompt, toolReflectionPrompt, defaultReflectionPrompt) + reflectionPrompt = resolvedPrompt.prompt + effectiveToolReflectionPrompt = resolvedPrompt.effectiveToolReflectionPrompt + promptSource = resolvedPrompt.source + } await showToast(client, directory, "Requesting reflection self-assessment...", "info") - debug("Requesting reflection self-assessment (source:", resolvedPrompt.source, ")") + debug("Requesting reflection self-assessment (source:", promptSource, ")") // Issue #98: Run self-assessment in a separate ephemeral session instead // of prompting the active agent session. 
Asking the active session to @@ -2115,7 +2169,7 @@ export const Reflection3Plugin: Plugin = async ({ client, directory }) => { selfAssessment, judgeSessionIds, effectiveToolReflectionPrompt, - rubric + effRubric ) } @@ -2128,6 +2182,39 @@ export const Reflection3Plugin: Plugin = async ({ client, directory }) => { debug("Reflection analysis completed") + // Supervisor goal loop: when an active goal exists, decide its next state + // (budget exhaustion FIRST, then achievement, then continuation). An + // exhausted goal pauses the loop WITHOUT injecting a continuation; an + // achieved goal is persisted and falls through to the normal complete + // bookkeeping; a continuing goal persists attempts+1 and lets the existing + // feedback injection proceed unchanged. + if (goal) { + const t = decideGoalTransition({ + goal, + complete: analysis.complete, + now: Date.now(), + maxAttempts: effectiveMaxAttempts, + reason: analysis.reason, + }) + if (t.action === "exhausted") { + await supervisorStore.save(directory, sessionId, { ...sup, goal: t.goal }) + attempts.delete(attemptKey) + lastReflectedMsgId.set(sessionId, lastUserMsgId) + await showToast( + client, + directory, + `Goal budget exhausted (${t.goal.attempts} attempts) — paused`, + "warning" + ) + debug("Goal budget exhausted, pausing without continuation") + return + } + // achieved | continue: persist the new goal state. For "achieved", the + // existing analysis.complete branch below handles the bookkeeping + + // return; for "continue", the existing feedback injection proceeds. 
+ await supervisorStore.save(directory, sessionId, { ...sup, goal: t.goal }) + } + let crossReview: { modelSpec: string; response: string } | null = null if (analysis.complete) { debug("Task complete, attempting cross-model review (assessmentModel:", assessmentModelSpec || "unknown", ")") @@ -2168,6 +2255,7 @@ export const Reflection3Plugin: Plugin = async ({ client, directory }) => { if (analysis.complete) { attempts.delete(attemptKey) lastReflectedMsgId.set(sessionId, lastUserMsgId) + if (goal) await showToast(client, directory, `Goal achieved ✓`, "success") await showToast(client, directory, `Task complete ✓ (${analysis.severity})`, "success") debug("Reflection complete") return diff --git a/test/supervisor.integration.test.ts b/test/supervisor.integration.test.ts new file mode 100644 index 0000000..afb1376 --- /dev/null +++ b/test/supervisor.integration.test.ts @@ -0,0 +1,177 @@ +/** + * Integration tests for the supervisor goal loop wired into runReflection. + * + * These drive the plugin's session.idle handler through a mocked OpenCode + * client and assert the end-to-end goal behavior: + * (a) goal active + judge NOT complete → continuation injected, attempts++ + * (b) goal active + judge complete → goal status "achieved", no continuation + * (c) goal at attempts cap → goal status "exhausted", no continuation + * + * The mock client returns a deterministic self-assessment JSON for every + * ephemeral judge/classifier session, so the parse-then-evaluate path is taken + * (no real LLM). We use a non-coding task summary so no workflow gates apply and + * the goal condition is the sole completion driver. 
+ */ + +import assert from "node:assert" +import { mkdtempSync } from "node:fs" +import { tmpdir } from "node:os" +import { join } from "node:path" +import { Reflection3Plugin, supervisorStore, resolveMaxAttempts } from "../reflection-3.ts" + +const SID = "ses_integration" + +// A non-coding task so requiresTests/PR/CI are all false and `complete` is +// driven solely by the self-assessment status/confidence. +const USER_TEXT = "Tell me a short story about a friendly cloud." + +function mainMessages() { + return [ + { + id: "msg_user_1", + info: { role: "user", time: { start: 1000 } }, + parts: [{ type: "text", text: USER_TEXT }], + }, + { + id: "msg_asst_1", + info: { role: "assistant", time: { start: 2000, completed: 3000 } }, + parts: [{ type: "text", text: "Here is a little story." }], + }, + ] +} + +/** + * Build a mock client. `assessment` is the JSON string the ephemeral judge / + * classifier sessions "respond" with. Records all promptAsync calls so we can + * detect a continuation injected into the MAIN session. 
+ */ +function makeClient(assessment: string) { + const prompts: Array<{ id: string; text: string }> = [] + let counter = 0 + const ephemeralIds = new Set() + const client: any = { + session: { + async create() { + const id = `judge_${++counter}` + ephemeralIds.add(id) + return { data: { id } } + }, + async messages({ path }: any) { + if (ephemeralIds.has(path.id)) { + return { + data: [ + { + id: `${path.id}_resp`, + info: { role: "assistant", time: { start: 1, completed: 2 } }, + parts: [{ type: "text", text: assessment }], + }, + ], + } + } + return { data: mainMessages() } + }, + async promptAsync({ path, body }: any) { + const text = (body?.parts || []).map((p: any) => p.text).join("") + prompts.push({ id: path.id, text }) + return {} + }, + async delete() { + return {} + }, + }, + } + return { + client, + prompts, + continuationToMain: () => prompts.filter(p => p.id === SID), + } +} + +async function fireIdle(plugin: any) { + await plugin.event({ event: { type: "session.idle", properties: { sessionID: SID } } }) +} + +describe("supervisor integration: goal loop in runReflection", () => { + it("(a) goal active + judge NOT complete → continuation injected and attempts incremented", async () => { + const dir = mkdtempSync(join(tmpdir(), "sup-int-a-")) + await supervisorStore.setGoal(dir, SID, "do X") + + const notComplete = JSON.stringify({ + task_summary: "story", + task_type: "other", + status: "in_progress", + confidence: 0.9, + remaining_work: ["Make the story longer"], + next_steps: ["Expand the story"], + }) + const { client, continuationToMain } = makeClient(notComplete) + const plugin = await Reflection3Plugin({ client, directory: dir } as any) + + await fireIdle(plugin) + + const cont = continuationToMain() + assert.ok(cont.length >= 1, "a continuation should be injected into the main session") + assert.match(cont[0].text, /Reflection-3:/, "injected text should be reflection feedback") + + const st = await supervisorStore.load(dir, SID) + 
assert.strictEqual(st.goal?.status, "active") + assert.strictEqual(st.goal?.attempts, 1, "attempts should be incremented to 1") + assert.match(st.goal?.lastReason || "", /.+/, "lastReason should be recorded") + }, 60000) + + it("(b) goal active + judge complete → status achieved and NO continuation", async () => { + const dir = mkdtempSync(join(tmpdir(), "sup-int-b-")) + await supervisorStore.setGoal(dir, SID, "do X") + + const complete = JSON.stringify({ + task_summary: "story", + task_type: "other", + status: "complete", + confidence: 0.95, + remaining_work: [], + next_steps: [], + }) + const { client, continuationToMain } = makeClient(complete) + const plugin = await Reflection3Plugin({ client, directory: dir } as any) + + await fireIdle(plugin) + + assert.strictEqual(continuationToMain().length, 0, "no continuation when goal achieved") + const st = await supervisorStore.load(dir, SID) + assert.strictEqual(st.goal?.status, "achieved") + assert.strictEqual(st.goal?.attempts, 0, "attempts unchanged on achieved") + }, 60000) + + it("(c) goal at attempts cap → status exhausted and NO continuation", async () => { + const dir = mkdtempSync(join(tmpdir(), "sup-int-c-")) + // No config file / package.json in temp dir → config resolves to default 16. + const cap = resolveMaxAttempts({}) + await supervisorStore.setGoal(dir, SID, "do X") + const pre = await supervisorStore.load(dir, SID) + // Mutate attempts to the cap so the FIRST decision exhausts the budget. + await supervisorStore.save(dir, SID, { + ...pre, + goal: { ...pre.goal!, attempts: cap }, + }) + + // Judge says NOT complete — but exhaustion is checked first, so it must + // not matter; assert no continuation regardless. 
+ const notComplete = JSON.stringify({ + task_summary: "story", + task_type: "other", + status: "in_progress", + confidence: 0.9, + remaining_work: ["more"], + next_steps: ["more"], + }) + const { client, continuationToMain } = makeClient(notComplete) + const plugin = await Reflection3Plugin({ client, directory: dir } as any) + + await fireIdle(plugin) + + assert.strictEqual(continuationToMain().length, 0, "no continuation when budget exhausted") + const st = await supervisorStore.load(dir, SID) + assert.strictEqual(st.goal?.status, "exhausted") + assert.strictEqual(st.goal?.attempts, cap, "attempts not incremented on exhaustion") + }, 60000) +}) diff --git a/test/supervisor.unit.test.ts b/test/supervisor.unit.test.ts index 7d8a195..03ec383 100644 --- a/test/supervisor.unit.test.ts +++ b/test/supervisor.unit.test.ts @@ -2,7 +2,20 @@ import assert from "node:assert" import { mkdtempSync, writeFileSync, mkdirSync, chmodSync, statSync } from "node:fs" import { tmpdir } from "node:os" import { join } from "node:path" -import { DEFAULT_RUBRIC, parseRubric, loadRubric, buildSelfAssessmentPrompt, buildJudgePrompt, resolveMaxAttempts, buildEscalatingFeedback, supervisorStore, parseSupervisorCommand, buildGoalRequirementSection } from "../reflection-3.ts" +import { DEFAULT_RUBRIC, parseRubric, loadRubric, buildSelfAssessmentPrompt, buildJudgePrompt, resolveMaxAttempts, buildEscalatingFeedback, supervisorStore, parseSupervisorCommand, buildGoalRequirementSection, decideGoalTransition } from "../reflection-3.ts" + +function mkGoal(over: Partial = {}): any { + return { + condition: "do X", + status: "active", + attempts: 0, + tokenBaseline: 0, + startedAt: 1000, + deadline: 10000, + lastReason: "", + ...over, + } +} describe("supervisorStore", () => { it("saves and loads goal + retry, clears goal but keeps retry", async () => { @@ -269,4 +282,76 @@ describe("supervisor: buildGoalRequirementSection", () => { assert.strictEqual(buildGoalRequirementSection(" "), "") 
assert.strictEqual(buildGoalRequirementSection(""), "") }) + it("states the goal is ADDITIONAL to the gates and that they still apply", () => { + const s = buildGoalRequirementSection("do X") + assert.match(s, /ADDITIONAL/) + assert.match(s, /does not replace/) + assert.match(s, /workflow gates/) + }) +}) + +describe("supervisor: decideGoalTransition", () => { + it("attempts at cap (complete=false) → exhausted, not continue", () => { + const goal = mkGoal({ attempts: 16 }) + const t = decideGoalTransition({ goal, complete: false, now: 5000, maxAttempts: 16, reason: "still failing" }) + assert.strictEqual(t.action, "exhausted") + assert.strictEqual(t.goal.status, "exhausted") + assert.strictEqual(t.goal.lastReason, "still failing") + // attempts not incremented on exhaustion + assert.strictEqual(t.goal.attempts, 16) + }) + + it("attempts over cap → exhausted even if complete", () => { + const goal = mkGoal({ attempts: 20 }) + const t = decideGoalTransition({ goal, complete: true, now: 5000, maxAttempts: 16 }) + assert.strictEqual(t.action, "exhausted") + assert.strictEqual(t.goal.status, "exhausted") + }) + + it("deadline exceeded with attempts under cap → exhausted", () => { + const goal = mkGoal({ attempts: 2, deadline: 4000 }) + const t = decideGoalTransition({ goal, complete: false, now: 5000, maxAttempts: 16, reason: "timed out" }) + assert.strictEqual(t.action, "exhausted") + assert.strictEqual(t.goal.status, "exhausted") + assert.strictEqual(t.goal.lastReason, "timed out") + }) + + it("now exactly at deadline → exhausted (>=)", () => { + const goal = mkGoal({ attempts: 1, deadline: 5000 }) + const t = decideGoalTransition({ goal, complete: false, now: 5000, maxAttempts: 16 }) + assert.strictEqual(t.action, "exhausted") + }) + + it("complete and within budget → achieved", () => { + const goal = mkGoal({ attempts: 3 }) + const t = decideGoalTransition({ goal, complete: true, now: 5000, maxAttempts: 16, reason: "tests pass" }) + assert.strictEqual(t.action, 
"achieved") + assert.strictEqual(t.goal.status, "achieved") + assert.strictEqual(t.goal.lastReason, "tests pass") + assert.strictEqual(t.goal.attempts, 3, "attempts unchanged on achieved") + }) + + it("not complete and within budget → continue, attempts incremented", () => { + const goal = mkGoal({ attempts: 3 }) + const t = decideGoalTransition({ goal, complete: false, now: 5000, maxAttempts: 16, reason: "missing tests" }) + assert.strictEqual(t.action, "continue") + assert.strictEqual(t.goal.status, "active") + assert.strictEqual(t.goal.attempts, 4) + assert.strictEqual(t.goal.lastReason, "missing tests") + }) + + it("falls back to existing lastReason when reason omitted", () => { + const goal = mkGoal({ attempts: 1, lastReason: "previous" }) + const t = decideGoalTransition({ goal, complete: false, now: 5000, maxAttempts: 16 }) + assert.strictEqual(t.action, "continue") + assert.strictEqual(t.goal.lastReason, "previous") + }) + + it("does not mutate the input goal", () => { + const goal = mkGoal({ attempts: 1 }) + decideGoalTransition({ goal, complete: false, now: 5000, maxAttempts: 16, reason: "x" }) + assert.strictEqual(goal.attempts, 1) + assert.strictEqual(goal.status, "active") + assert.strictEqual(goal.lastReason, "") + }) }) From 9398602609022b32a121d029403014b17c3d4b9c Mon Sep 17 00:00:00 2001 From: Dzianis Vashchuk <2119348+dzianisv@users.noreply.github.com> Date: Wed, 3 Jun 2026 14:24:04 +0000 Subject: [PATCH 12/14] polish(supervisor): guard blank goal section, single achieve toast, clarify dual-counter Addresses code-review minors on goal-loop integration. 
Refs #143 Co-Authored-By: Claude Opus 4.8 --- reflection-3.ts | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/reflection-3.ts b/reflection-3.ts index 070ad15..6f9531c 100644 --- a/reflection-3.ts +++ b/reflection-3.ts @@ -2037,8 +2037,9 @@ export const Reflection3Plugin: Plugin = async ({ client, directory }) => { // When a goal is active, its condition is the strongest stated intent and // MUST compose with the existing gates: build the DEFAULT rubric prompt // with the goal requirement appended, bypassing file/tool precedence. - const effRubric: Rubric = goal - ? { ...rubric, antipatterns: `${rubric.antipatterns}\n\n${buildGoalRequirementSection(goal.condition)}` } + const goalSection = goal ? buildGoalRequirementSection(goal.condition) : "" + const effRubric: Rubric = goalSection + ? { ...rubric, antipatterns: `${rubric.antipatterns}\n\n${goalSection}` } : rubric const defaultReflectionPrompt = buildSelfAssessmentPrompt( context, @@ -2255,8 +2256,7 @@ export const Reflection3Plugin: Plugin = async ({ client, directory }) => { if (analysis.complete) { attempts.delete(attemptKey) lastReflectedMsgId.set(sessionId, lastUserMsgId) - if (goal) await showToast(client, directory, `Goal achieved ✓`, "success") - await showToast(client, directory, `Task complete ✓ (${analysis.severity})`, "success") + await showToast(client, directory, goal ? `Goal achieved ✓ (${analysis.severity})` : `Task complete ✓ (${analysis.severity})`, "success") debug("Reflection complete") return } @@ -2296,6 +2296,9 @@ export const Reflection3Plugin: Plugin = async ({ client, directory }) => { return } + // When a goal is active, goal.attempts (persisted) guards the budget via + // decideGoalTransition above; this per-message counter still fires as the + // fallback cap for ordinary sessions without a goal. They coexist on purpose. 
const nextAttemptCount = (attempts.get(attemptKey) || 0) + 1 attempts.set(attemptKey, nextAttemptCount) if (nextAttemptCount >= effectiveMaxAttempts) { From f3dd9706555c1f704e68f230068637741d2b1e41 Mon Sep 17 00:00:00 2001 From: Dzianis Vashchuk <2119348+dzianisv@users.noreply.github.com> Date: Wed, 3 Jun 2026 14:26:36 +0000 Subject: [PATCH 13/14] docs(supervisor): document goal, retry, and configurable rubric Refs #143 Co-Authored-By: Claude Opus 4.8 --- README.md | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/README.md b/README.md index 0e29da5..d7843ca 100644 --- a/README.md +++ b/README.md @@ -231,6 +231,89 @@ cd ~/.config/opencode && \ --- +## Supervisor Mode + +`reflection-3` v3+ ships a **supervisor control surface** layered on top of the always-on reflection engine: a configurable rubric, a configurable retry budget, and a session-scoped goal command. All three features share the same independent-judge loop that already drives reflection — no new evaluator model, no separate feedback channel. + +> **Status:** The engine and state layer (`supervisorStore`, rubric loader, goal-loop integration) are implemented in `reflection-3.ts`. The slash-command surface (`/supervisor:goal`, `/supervisor:retry`) is being finalized — the `.opencode/command/supervisor/` command files are not yet shipped. The sections below describe the intended UX. + +### Configurable rubric (`rubric.md`) + +The judge's completion rules are no longer hardcoded. They live in a single Markdown file with two sections: + +- `## Patterns` — what "done" looks like (positive completion criteria) +- `## Antipatterns` — the mined premature-stop rules: PERMISSION-SEEKING, STOPPED-WITH-TODOS, FALSE-COMPLETE, and others + +The plugin ships an embedded default (seeded from the 227-session dataset), so the single-file install path keeps working with no extra files to manage. 
Create a `rubric.md` only when you want to tune the judge for your workflow. + +**Resolution order (first found wins):** + +| Priority | Path | +|----------|------| +| 1 | `.reflection/rubric.md` (project-level) | +| 2 | `~/.config/opencode/supervisor/rubric.md` (global) | +| 3 | Embedded default | + +A file missing either `## Patterns` or `## Antipatterns` is treated as malformed and the embedded default is used in full — the judge never runs with an empty rubric. + +**Minimal override example** (project-level): + +```markdown +## Patterns +- All acceptance criteria in the task description are addressed with evidence +- Any modified code has a corresponding test that was run after the change + +## Antipatterns +- PERMISSION-SEEKING: agent asks "should I run X?" when it has the tool to do so +- STOPPED-WITH-TODOS: response ends with a "next steps" list and no further action +- FALSE-COMPLETE: claims done but no test run appears in the tool calls +``` + +### Configurable retry budget + +The default retry cap is **16** (raised from 3). This governs all reflection re-prompts — both the always-on judge and the goal loop when active. + +**Global default** — add `maxAttempts` to `~/.config/opencode/reflection.yaml`: + +```yaml +maxAttempts: 32 +``` + +**Per-session override** (once command files are shipped): + +``` +/supervisor:retry 24 # set the cap for this session +/supervisor:retry # show current effective value (config → default: 16) +``` + +The value is clamped to 1–100. Secondary safety caps (`goalMaxTokens`, `goalMaxDurationMs`) are also configurable in `reflection.yaml` and terminate the loop on spend or wall-clock time regardless of attempt count. + +### Session goals (`/supervisor:goal`) + +Set a session-scoped completion condition. The independent judge then keeps the agent working until **both** the condition is met **and** all applicable workflow gates pass. A docs-only task won't require a PR or green CI; a coding task will. 
+ +| Command | Effect | +|---------|--------| +| `/supervisor:goal <condition>` | Set (or replace) the session goal and start working toward it (≤ 4000 chars) | +| `/supervisor:goal` | Show status: condition, attempts used / budget, last judge reason | +| `/supervisor:goal clear` | Clear the active goal (aliases: `stop`, `off`, `reset`, `none`, `cancel`) | + +The goal is layered **on top of** the rubric as a mandatory completion requirement — it does not replace the workflow gates. Completion auto-clears the goal. Exhausting the retry budget pauses it; no further auto-continuation fires until the user re-sets the goal or raises the budget. + +**Example:** + +``` +/supervisor:goal all tests in test/auth/ pass and the PR is open with green CI +``` + +The judge evaluates: "do the applicable gates pass **and** is this condition met with evidence?" A bare "condition met" claim with no evidence (tests not run, no PR URL) yields `complete=false` and the loop continues. + +### Provider note for long unattended runs + +For extended autonomous or overnight runs, prefer the **`anthropic`** provider for the main agent session. The `github-copilot` provider rejects assistant-message prefill with a 400 error, which can silently break auto-continuation in some OpenCode internals. The supervisor's continuation mechanism injects a **user turn** via `promptAsync` — this is provider-safe by design — but the note stands as a general best practice for unattended work. + +--- + ## Reflection Plugin Evaluates task completion after each agent response and provides feedback if work is incomplete. 
From 2c7b165abc1215e4f6e4dd927724c097e550cc2c Mon Sep 17 00:00:00 2001 From: Dzianis Vashchuk <2119348+dzianisv@users.noreply.github.com> Date: Wed, 3 Jun 2026 16:50:28 +0000 Subject: [PATCH 14/14] fix(supervisor): only burn goal budget when a continuation is actually injected Refs #143 Co-Authored-By: Claude Opus 4.8 --- README.md | 4 +- reflection-3.ts | 28 ++++++-- test/supervisor.integration.test.ts | 100 ++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index d7843ca..9615bdd 100644 --- a/README.md +++ b/README.md @@ -286,7 +286,9 @@ maxAttempts: 32 /supervisor:retry # show current effective value (config → default: 16) ``` -The value is clamped to 1–100. Secondary safety caps (`goalMaxTokens`, `goalMaxDurationMs`) are also configurable in `reflection.yaml` and terminate the loop on spend or wall-clock time regardless of attempt count. +The value is clamped to 1–100. In addition to the attempt cap, an active goal also terminates on a **fixed 30-minute wall-clock timeout** (the current goal timeout; not yet configurable). + +> **Note:** per-goal `goalMaxTokens` (token-spend cap) and a configurable `goalMaxDurationMs` are **planned but not yet wired** — they are not read from `reflection.yaml` today. The goal timeout is currently the hardcoded 30 minutes above, and there is no token-based cap yet. ### Session goals (`/supervisor:goal`) diff --git a/reflection-3.ts b/reflection-3.ts index 6f9531c..9b6db31 100644 --- a/reflection-3.ts +++ b/reflection-3.ts @@ -2187,8 +2187,12 @@ export const Reflection3Plugin: Plugin = async ({ client, directory }) => { // (budget exhaustion FIRST, then achievement, then continuation). An // exhausted goal pauses the loop WITHOUT injecting a continuation; an // achieved goal is persisted and falls through to the normal complete - // bookkeeping; a continuing goal persists attempts+1 and lets the existing - // feedback injection proceed unchanged. 
+ // bookkeeping; a continuing goal must ONLY burn budget (attempts+1) when a + // continuation is actually injected — so we defer that persist until after + // the feedback prompt succeeds (see pendingGoalContinue below). This avoids + // silently losing budget on interrupted passes (human-action, new-message, + // abort, or per-message max-attempts rechecks that early-return). + let pendingGoalContinue: SupervisorState | null = null if (goal) { const t = decideGoalTransition({ goal, @@ -2210,10 +2214,16 @@ export const Reflection3Plugin: Plugin = async ({ client, directory }) => { debug("Goal budget exhausted, pausing without continuation") return } - // achieved | continue: persist the new goal state. For "achieved", the - // existing analysis.complete branch below handles the bookkeeping + - // return; for "continue", the existing feedback injection proceeds. - await supervisorStore.save(directory, sessionId, { ...sup, goal: t.goal }) + if (t.action === "achieved") { + // Persist the achieved status now; the existing analysis.complete + // branch below handles the bookkeeping + return. No attempts increment. + await supervisorStore.save(directory, sessionId, { ...sup, goal: t.goal }) + } else { + // "continue": DO NOT persist the attempts++ here. Stash it and only + // write it after a continuation is actually injected (post-promptAsync + // success). Any early-return below leaves goal.attempts unchanged. + pendingGoalContinue = { ...sup, goal: t.goal } + } } let crossReview: { modelSpec: string; response: string } | null = null @@ -2348,6 +2358,12 @@ export const Reflection3Plugin: Plugin = async ({ client, directory }) => { // start another reflection cycle for the same user message. lastReflectedMsgId.set(sessionId, lastUserMsgId) + // Goal budget is burned ONLY here, after a continuation was actually + // injected. 
If any earlier recheck early-returned (or promptAsync above + // threw and returned), pendingGoalContinue was never set / never reached, + // so goal.attempts stays unchanged — no silent budget loss. + if (pendingGoalContinue) await supervisorStore.save(directory, sessionId, pendingGoalContinue) + debug("Reflection pushed continuation") const routingInfo = routingModel ? ` [${routingCategory} → ${routingModel.modelID}]` : "" diff --git a/test/supervisor.integration.test.ts b/test/supervisor.integration.test.ts index afb1376..110e789 100644 --- a/test/supervisor.integration.test.ts +++ b/test/supervisor.integration.test.ts @@ -6,6 +6,9 @@ * (a) goal active + judge NOT complete → continuation injected, attempts++ * (b) goal active + judge complete → goal status "achieved", no continuation * (c) goal at attempts cap → goal status "exhausted", no continuation + * (d) goal active + judge NOT complete BUT a new user message arrives during + * analysis (pre-feedback recheck) → NO continuation injected AND + * goal.attempts stays 0 (budget not burned on an interrupted pass). * * The mock client returns a deterministic self-assessment JSON for every * ephemeral judge/classifier session, so the parse-then-evaluate path is taken @@ -87,6 +90,68 @@ function makeClient(assessment: string) { } } +/** + * Variant of makeClient where the MAIN session's messages() returns the original + * two messages for the first `interruptOnMainCall - 1` main-session reads, then + * (on the Nth main-session read) appends a NEW, non-reflection user message with + * a different id. runReflection reads the main session three times: initial, + * the post-assessment recheck (currentMessages), and the pre-feedback recheck + * (preFeedbackMessages). Interrupting on the THIRD read simulates "a new user + * message arrived during analysis" exactly at the pre-feedback recheck, which + * must early-return WITHOUT injecting a continuation. 
+ */ +function makeInterruptingClient(assessment: string, interruptOnMainCall: number) { + const prompts: Array<{ id: string; text: string }> = [] + let counter = 0 + let mainCalls = 0 + const ephemeralIds = new Set() + const interruptedMessages = () => [ + ...mainMessages(), + { + id: "msg_user_2", + info: { role: "user", time: { start: 4000 } }, + parts: [{ type: "text", text: "Actually, never mind — do something else." }], + }, + ] + const client: any = { + session: { + async create() { + const id = `judge_${++counter}` + ephemeralIds.add(id) + return { data: { id } } + }, + async messages({ path }: any) { + if (ephemeralIds.has(path.id)) { + return { + data: [ + { + id: `${path.id}_resp`, + info: { role: "assistant", time: { start: 1, completed: 2 } }, + parts: [{ type: "text", text: assessment }], + }, + ], + } + } + mainCalls++ + return { data: mainCalls >= interruptOnMainCall ? interruptedMessages() : mainMessages() } + }, + async promptAsync({ path, body }: any) { + const text = (body?.parts || []).map((p: any) => p.text).join("") + prompts.push({ id: path.id, text }) + return {} + }, + async delete() { + return {} + }, + }, + } + return { + client, + prompts, + continuationToMain: () => prompts.filter(p => p.id === SID), + } +} + async function fireIdle(plugin: any) { await plugin.event({ event: { type: "session.idle", properties: { sessionID: SID } } }) } @@ -174,4 +239,39 @@ describe("supervisor integration: goal loop in runReflection", () => { assert.strictEqual(st.goal?.status, "exhausted") assert.strictEqual(st.goal?.attempts, cap, "attempts not incremented on exhaustion") }, 60000) + + it("(d) goal active + judge NOT complete BUT new user message during analysis → NO continuation and attempts NOT burned", async () => { + const dir = mkdtempSync(join(tmpdir(), "sup-int-d-")) + await supervisorStore.setGoal(dir, SID, "do X") + + const before = await supervisorStore.load(dir, SID) + assert.strictEqual(before.goal?.attempts, 0, "precondition: attempts 
start at 0") + + const notComplete = JSON.stringify({ + task_summary: "story", + task_type: "other", + status: "in_progress", + confidence: 0.9, + remaining_work: ["Make the story longer"], + next_steps: ["Expand the story"], + }) + // Interrupt on the THIRD main-session read (the pre-feedback recheck): the + // initial read and the post-assessment recheck see the original last-user + // message, then a new user message appears right before feedback injection. + const { client, continuationToMain } = makeInterruptingClient(notComplete, 3) + const plugin = await Reflection3Plugin({ client, directory: dir } as any) + + await fireIdle(plugin) + + assert.strictEqual( + continuationToMain().length, + 0, + "no continuation should be injected when a new user message arrived during analysis" + ) + const st = await supervisorStore.load(dir, SID) + // The goal must remain active and, crucially, attempts must NOT have been + // burned — the increment only persists when a continuation is injected. + assert.strictEqual(st.goal?.status, "active", "goal stays active on an interrupted pass") + assert.strictEqual(st.goal?.attempts, 0, "attempts must NOT be incremented on an interrupted pass") + }, 60000) })