Skip to content

Commit 4e0de27

Browse files
committed
fix(core): recent-turn verbatim window + relax tool-input cap
After the prompt OVERRIDE block in b692ec7 eliminated the `<artifact>` text-dump vector, the windowless per-block compaction from v2 became over-aggressive: the model's own most recent str_replace new_str got summarized before the next turn, forcing guess-based old_str selection for follow-up edits.

v3 splits behavior by block type:

- assistant.content[*].text — still capped (8 KB) on ALL turns. This is the regression guard against a `<artifact>` text-dump slipping back in; its cost is near-zero.
- assistant.toolCall.input + toolResult.text — capped only outside a 3-turn window. Inside the window they stay verbatim so the model reads its own just-written section and the latest view() output in full fidelity. Tool-input cap for older turns raised 8 KB -> 24 KB, covering a full section's str_replace in one block.

HARD_CAP_BYTES safety net retained: if the per-block pass still exceeds 200 KB, the aggressive pass (2 KB caps, window dropped) runs.

Also: done.ts description bumps its self-advertised round limit 3 -> 5 so upcoming craft-surplus verifier passes have room to iterate.

- packages/core/src/context-prune.ts: split caps by recency + type
- packages/core/src/context-prune.test.ts: window behavior coverage
- packages/core/src/tools/done.ts: stop-after 3 -> 5 rounds
1 parent 69e35c5 commit 4e0de27

3 files changed

Lines changed: 162 additions & 62 deletions

File tree

packages/core/src/context-prune.test.ts

Lines changed: 51 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@ function assistantText(text: string): AgentMessage {
3939
} as unknown as AgentMessage;
4040
}
4141

42-
describe('buildTransformContext — size-based block compaction', () => {
42+
describe('buildTransformContext — size-based block compaction with recent-turn window', () => {
4343
it('is a no-op when every block is under its cap', async () => {
4444
const transform = buildTransformContext();
4545
const messages: AgentMessage[] = [
@@ -53,8 +53,8 @@ describe('buildTransformContext — size-based block compaction', () => {
5353
});
5454

5555
it('stubs a large assistant text block even on the LATEST message', async () => {
56-
// The production bug: model streamed a 9MB artifact as assistant text
57-
// on the final turn. v1 window-based prune preserved it verbatim.
56+
// Text cap applies to ALL turns. Guards against the `<artifact>` text
57+
// dump regression (assistant streamed 9 MB JSX as prose on the final turn).
5858
const transform = buildTransformContext();
5959
const huge = 'x'.repeat(50_000);
6060
const messages: AgentMessage[] = [userMsg('build it'), assistantText(huge)];
@@ -65,7 +65,9 @@ describe('buildTransformContext — size-based block compaction', () => {
6565
expect(text).toContain('50000B');
6666
});
6767

68-
it('summarizes a large toolCall.input, preserving name + id', async () => {
68+
it('keeps a large toolCall.input verbatim inside the recent window', async () => {
69+
// The model's own just-written str_replace must stay full-fidelity so it
70+
// can pick the next old_str from memory instead of guessing.
6971
const transform = buildTransformContext();
7072
const bulk = 'a'.repeat(20_000);
7173
const messages: AgentMessage[] = [
@@ -75,17 +77,39 @@ describe('buildTransformContext — size-based block compaction', () => {
7577
];
7678
const out = await transform(messages);
7779
const a = out[1] as {
78-
content: Array<{ type?: string; id?: string; name?: string; input?: unknown }>;
80+
content: Array<{ type?: string; id?: string; input?: { inputArg?: string } }>;
7981
};
8082
const tc = a.content.find((c) => c.type === 'toolCall');
8183
expect(tc?.id).toBe('call-0');
82-
expect(tc?.name).toBe('str_replace_based_edit_tool');
83-
const input = tc?.input as { _summarized?: boolean; _origBytes?: number };
84-
expect(input?._summarized).toBe(true);
85-
expect(input?._origBytes).toBeGreaterThan(10_000);
84+
expect(tc?.input?.inputArg).toBe(bulk);
8685
});
8786

88-
it('stubs a large toolResult body, keeping toolCallId for pi-ai shape', async () => {
87+
it('summarizes a large toolCall.input for older turns outside the window', async () => {
88+
const transform = buildTransformContext();
89+
const bulk = 'a'.repeat(30_000);
90+
const messages: AgentMessage[] = [userMsg('build')];
91+
messages.push(assistantWithToolCall('call-old', bulk));
92+
messages.push(toolResult('call-old', 'ok'));
93+
// Three more turns push call-old out of the 3-turn window.
94+
for (let i = 0; i < 3; i += 1) {
95+
messages.push(assistantWithToolCall(`t${i}`, 'small'));
96+
messages.push(toolResult(`t${i}`, 'ok'));
97+
}
98+
const out = await transform(messages);
99+
const oldAssistant = out[1] as {
100+
content: Array<{
101+
type?: string;
102+
id?: string;
103+
input?: { _summarized?: boolean; _origBytes?: number };
104+
}>;
105+
};
106+
const tc = oldAssistant.content.find((c) => c.type === 'toolCall');
107+
expect(tc?.id).toBe('call-old');
108+
expect(tc?.input?._summarized).toBe(true);
109+
expect(tc?.input?._origBytes ?? 0).toBeGreaterThan(20_000);
110+
});
111+
112+
it('keeps a large toolResult verbatim inside the recent window', async () => {
89113
const transform = buildTransformContext();
90114
const bulk = 'y'.repeat(20_000);
91115
const messages: AgentMessage[] = [
@@ -96,6 +120,22 @@ describe('buildTransformContext — size-based block compaction', () => {
96120
const out = await transform(messages);
97121
const tr = out[2] as { toolCallId?: string; content: Array<{ text?: string }> };
98122
expect(tr.toolCallId).toBe('call-0');
123+
expect(tr.content[0]?.text).toBe(bulk);
124+
});
125+
126+
it('stubs large toolResult bodies for older turns outside the window', async () => {
127+
const transform = buildTransformContext();
128+
const bulk = 'y'.repeat(20_000);
129+
const messages: AgentMessage[] = [userMsg('x')];
130+
messages.push(assistantWithToolCall('call-old', 'a'));
131+
messages.push(toolResult('call-old', bulk));
132+
for (let i = 0; i < 3; i += 1) {
133+
messages.push(assistantWithToolCall(`t${i}`, 'small'));
134+
messages.push(toolResult(`t${i}`, 'ok'));
135+
}
136+
const out = await transform(messages);
137+
const tr = out[2] as { toolCallId?: string; content: Array<{ text?: string }> };
138+
expect(tr.toolCallId).toBe('call-old');
99139
expect(tr.content[0]?.text?.startsWith('[tool result dropped')).toBe(true);
100140
});
101141

@@ -118,13 +158,9 @@ describe('buildTransformContext — size-based block compaction', () => {
118158
expect(out[0]).toBe(opening);
119159
});
120160

121-
it('tightens to aggressive caps when HARD_CAP_BYTES is exceeded', async () => {
161+
it('tightens to aggressive caps (ignoring window) when HARD_CAP_BYTES is exceeded', async () => {
122162
const transform = buildTransformContext();
123163
const messages: AgentMessage[] = [userMsg('go')];
124-
// 30 rounds with tool input just over the 8KB cap = 30 summarized at first
125-
// pass, but the metadata itself adds up. Force the hard cap by also adding
126-
// many text blocks between 2KB and 8KB — first pass keeps them, aggressive
127-
// compacts them.
128164
const midText = 'p'.repeat(6_000);
129165
for (let i = 0; i < 40; i += 1) {
130166
messages.push(assistantText(midText));
@@ -142,7 +178,6 @@ describe('buildTransformContext — size-based block compaction', () => {
142178
}
143179
}
144180
}
145-
// Aggressive cap is 2KB — the 6KB midText blocks should all be stubbed.
146181
expect(droppedTextCount).toBeGreaterThanOrEqual(35);
147182
});
148183
});

packages/core/src/context-prune.ts

Lines changed: 110 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -2,49 +2,65 @@
22
* Per-message size-based context compaction for pi-agent-core's
33
* `transformContext` hook. Runs before every LLM call.
44
*
5-
* Philosophy: **history is intent tracking, not payload storage.**
5+
* Philosophy: **history is intent tracking, not payload storage.** The model
6+
* needs the decision trail — which tools, in what order, with what shape —
7+
* not verbatim 9 MB artifact dumps or whole-file view returns from ten turns
8+
* ago. Current file state is always recoverable via ranged `view()`.
69
*
7-
* What the model needs from prior turns is the DECISION trail — which tools
8-
* it called, in what order, with what shape. What it does NOT need is its own
9-
* verbatim 9 MB artifact dump, or the full view() return of a 100 KB file it
10-
* already touched. Current state is always recoverable via view(), so there
11-
* is no information loss from stubbing prior payloads.
10+
* Evolution:
11+
* - v1 (window): kept last N turns verbatim, stubbed older. Missed the
12+
* dominant failure mode — a 9 MB `<artifact>` text dump sat inside the
13+
* keep-verbatim window and shipped 3.97 M tokens.
14+
* - v2 (windowless): stubbed every block over its cap regardless of
15+
* position. Safe, but over-aggressive after the prompt OVERRIDE block
16+
* eliminated the text-dump vector — the model's own latest str_replace
17+
* new_str got summarized, so picking the next old_str required guessing.
18+
* - v3 (this file): split behavior by block type.
19+
* · `assistant.content[*].text` is always capped (8 KB, all turns).
20+
* This is the regression guard: the one class of block that must
21+
* never be allowed to balloon, because a bad prompt interaction
22+
* can resurrect the `<artifact>` dump.
23+
* · `assistant.content[*].toolCall.input` and
24+
* `toolResult.content[*].text` are capped only outside a small
25+
* recent-turn window. Inside the window they stay verbatim so the
26+
* model reads its own just-written section and the latest view()
27+
* output in full fidelity. Outside the window, large payloads
28+
* collapse to a one-line stub.
1229
*
13-
* v1 (window-based) failed in production because the 9 MB payload was the
14-
* LATEST assistant text block (the model streaming out the final artifact as
15-
* prose right before `done`). It sat inside the "keep verbatim" window, and
16-
* the request still shipped 3.97 M tokens.
17-
*
18-
* v3 (this file) drops the window entirely. Every message is inspected per
19-
* block; any block whose serialized size exceeds its per-type cap is stubbed.
20-
* Caps are small on purpose — 8 KB fits any reasonable paragraph of reasoning
21-
* or normal tool argument, but cannot carry an HTML/JSX artifact, a base64
22-
* image, or a whole-file view return. The model's most recent decision is
23-
* still visible at full fidelity for small outputs.
24-
*
25-
* Three block-level caps:
26-
* - `assistant.content[*].text` — model prose / streamed artifact text.
27-
* - `assistant.content[*].toolCall.input` — args the model sent to a tool.
28-
* - `toolResult.content[*].text` — host-side tool return payload.
30+
* Block-level caps:
31+
* - TEXT_BLOCK_LIMIT — assistant prose, ALL turns.
32+
* - TOOL_INPUT_LIMIT — assistant.toolCall.input, older turns only.
33+
* - TOOL_RESULT_LIMIT — toolResult.text, older turns only.
2934
*
3035
* Stub format carries bytes + a short preview so the model can tell what
3136
* got dropped, and (for tool calls) keeps tool NAME + id so pi-ai's shape
3237
* validation remains happy.
3338
*
3439
* Safety net: after per-block stubbing, if the grand total still exceeds
35-
* `HARD_CAP_BYTES`, we shrink caps further and re-run. Catches pathological
36-
* runs with many just-under-threshold blocks.
40+
* `HARD_CAP_BYTES`, we shrink caps further (including within the window)
41+
* and re-run. Catches pathological runs with many just-under-threshold
42+
* blocks.
3743
*/
3844

3945
import type { AgentMessage } from '@mariozechner/pi-agent-core';
4046
import { type CoreLogger, NOOP_LOGGER } from './logger.js';
4147

4248
const TEXT_BLOCK_LIMIT = 8 * 1024;
43-
const TOOL_INPUT_LIMIT = 8 * 1024;
49+
const TOOL_INPUT_LIMIT = 24 * 1024;
4450
const TOOL_RESULT_LIMIT = 8 * 1024;
4551
const HARD_CAP_BYTES = 200_000;
4652
const AGGRESSIVE_BLOCK_LIMIT = 2 * 1024;
4753

54+
/**
55+
* Number of most-recent non-user messages whose tool payloads (toolCall.input
56+
* and toolResult.text) stay verbatim. Assistant TEXT is still capped inside
57+
* this window — see TEXT_BLOCK_LIMIT rationale above.
58+
*
59+
* 3 covers "current turn is reading the previous turn's str_replace + its
60+
* toolResult" in the typical one-section-per-turn polish cadence.
61+
*/
62+
const RECENT_WINDOW = 3;
63+
4864
function estimateBytes(messages: AgentMessage[]): number {
4965
let total = 0;
5066
for (const m of messages) {
@@ -66,7 +82,11 @@ function stubText(text: string, label: string): string {
6682
return `[${label}${text.length}B, head: "${preview(text)}"]`;
6783
}
6884

69-
function compactAssistant(m: AgentMessage, textLimit: number, toolLimit: number): AgentMessage {
85+
function compactAssistant(
86+
m: AgentMessage,
87+
textLimit: number,
88+
toolLimit: number | null,
89+
): AgentMessage {
7090
const original = m as unknown as {
7191
role: 'assistant';
7292
content?: Array<Record<string, unknown>>;
@@ -81,7 +101,7 @@ function compactAssistant(m: AgentMessage, textLimit: number, toolLimit: number)
81101
changed = true;
82102
return { ...block, text: stubText(text, 'prior assistant output dropped') };
83103
}
84-
if (type === 'toolCall') {
104+
if (type === 'toolCall' && toolLimit !== null) {
85105
const input = block['input'];
86106
let origBytes = 0;
87107
let preview = '';
@@ -105,7 +125,8 @@ function compactAssistant(m: AgentMessage, textLimit: number, toolLimit: number)
105125
return { ...(original as object), content: nextContent } as unknown as AgentMessage;
106126
}
107127

108-
function compactToolResult(m: AgentMessage, limit: number): AgentMessage {
128+
function compactToolResult(m: AgentMessage, limit: number | null): AgentMessage {
129+
if (limit === null) return m;
109130
const original = m as unknown as {
110131
role: 'toolResult';
111132
content?: Array<{ type: string; text?: string }>;
@@ -123,15 +144,48 @@ function compactToolResult(m: AgentMessage, limit: number): AgentMessage {
123144
return { ...(original as object), content: nextContent } as unknown as AgentMessage;
124145
}
125146

126-
function applyCaps(
127-
messages: AgentMessage[],
128-
textLimit: number,
129-
toolInputLimit: number,
130-
toolResultLimit: number,
131-
): AgentMessage[] {
132-
return messages.map((m) => {
133-
if (m.role === 'assistant') return compactAssistant(m, textLimit, toolInputLimit);
134-
if (m.role === 'toolResult') return compactToolResult(m, toolResultLimit);
147+
/**
148+
* Index threshold (inclusive) — messages at or after this index are "recent"
149+
* and their tool payloads stay verbatim. Counts assistant + toolResult roles
150+
* from the tail; user messages are never a prune target but also don't
151+
* consume window slots.
152+
*/
153+
function computeWindowStart(messages: AgentMessage[], windowTurns: number): number {
154+
if (windowTurns <= 0) return messages.length;
155+
let seen = 0;
156+
for (let i = messages.length - 1; i >= 0; i -= 1) {
157+
const role = messages[i]?.role;
158+
if (role === 'assistant' || role === 'toolResult') {
159+
seen += 1;
160+
if (seen >= windowTurns) return i;
161+
}
162+
}
163+
return 0;
164+
}
165+
166+
interface CapConfig {
167+
textLimit: number;
168+
toolInputLimitOld: number;
169+
toolResultLimitOld: number;
170+
toolInputLimitRecent: number | null;
171+
toolResultLimitRecent: number | null;
172+
windowTurns: number;
173+
}
174+
175+
function applyCaps(messages: AgentMessage[], cfg: CapConfig): AgentMessage[] {
176+
const windowStart = computeWindowStart(messages, cfg.windowTurns);
177+
return messages.map((m, idx) => {
178+
const isRecent = idx >= windowStart;
179+
if (m.role === 'assistant') {
180+
return compactAssistant(
181+
m,
182+
cfg.textLimit,
183+
isRecent ? cfg.toolInputLimitRecent : cfg.toolInputLimitOld,
184+
);
185+
}
186+
if (m.role === 'toolResult') {
187+
return compactToolResult(m, isRecent ? cfg.toolResultLimitRecent : cfg.toolResultLimitOld);
188+
}
135189
return m;
136190
});
137191
}
@@ -143,25 +197,36 @@ export function buildTransformContext(
143197
if (messages.length === 0) return messages;
144198

145199
const before = estimateBytes(messages);
146-
const first = applyCaps(messages, TEXT_BLOCK_LIMIT, TOOL_INPUT_LIMIT, TOOL_RESULT_LIMIT);
200+
const first = applyCaps(messages, {
201+
textLimit: TEXT_BLOCK_LIMIT,
202+
toolInputLimitOld: TOOL_INPUT_LIMIT,
203+
toolResultLimitOld: TOOL_RESULT_LIMIT,
204+
toolInputLimitRecent: null,
205+
toolResultLimitRecent: null,
206+
windowTurns: RECENT_WINDOW,
207+
});
147208
const firstSize = estimateBytes(first);
148209

149210
log.info('[context-prune] step=caps', {
150211
messages: messages.length,
151212
before,
152213
after: firstSize,
153214
textLimit: TEXT_BLOCK_LIMIT,
154-
toolLimit: TOOL_INPUT_LIMIT,
215+
toolInputLimit: TOOL_INPUT_LIMIT,
216+
toolResultLimit: TOOL_RESULT_LIMIT,
217+
window: RECENT_WINDOW,
155218
});
156219

157220
if (firstSize <= HARD_CAP_BYTES) return first;
158221

159-
const aggressive = applyCaps(
160-
messages,
161-
AGGRESSIVE_BLOCK_LIMIT,
162-
AGGRESSIVE_BLOCK_LIMIT,
163-
AGGRESSIVE_BLOCK_LIMIT,
164-
);
222+
const aggressive = applyCaps(messages, {
223+
textLimit: AGGRESSIVE_BLOCK_LIMIT,
224+
toolInputLimitOld: AGGRESSIVE_BLOCK_LIMIT,
225+
toolResultLimitOld: AGGRESSIVE_BLOCK_LIMIT,
226+
toolInputLimitRecent: AGGRESSIVE_BLOCK_LIMIT,
227+
toolResultLimitRecent: AGGRESSIVE_BLOCK_LIMIT,
228+
windowTurns: 0,
229+
});
165230
const aggressiveSize = estimateBytes(aggressive);
166231
log.info('[context-prune] step=aggressive', {
167232
messages: messages.length,

packages/core/src/tools/done.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -284,7 +284,7 @@ export function makeDoneTool(
284284
'console errors / load failures, then replies with ' +
285285
'`{ status: "ok" | "has_errors", errors: [...] }`. If errors come back, ' +
286286
'fix them with str_replace_based_edit_tool and call `done` again. ' +
287-
'Stop calling once status is "ok" or after 3 rounds.',
287+
'Stop calling once status is "ok" or after 5 rounds.',
288288
parameters: DoneParams,
289289
async execute(_id, params): Promise<AgentToolResult<DoneDetails>> {
290290
const path = params.path ?? 'index.html';

0 commit comments

Comments (0)