Skip to content

Commit 4e0de27

Browse files
committed
fix(core): recent-turn verbatim window + relax tool-input cap
After the prompt OVERRIDE block in b692ec7 eliminated the `<artifact>` text-dump vector, the windowless per-block compaction from v2 became over-aggressive: the model's own most recent str_replace new_str got summarized before the next turn, forcing guess-based old_str selection for follow-up edits.

v3 splits behavior by block type:

- assistant.content[*].text — still capped (8 KB) on ALL turns. This is the regression guard against a `<artifact>` text-dump slipping back in; its cost is near-zero.
- assistant.toolCall.input + toolResult.text — capped only outside a 3-turn window. Inside the window they stay verbatim so the model reads its own just-written section and the latest view() output in full fidelity. Tool-input cap for older turns raised 8 KB -> 24 KB, covering a full section's str_replace in one block.

HARD_CAP_BYTES safety net retained: if the per-block pass still exceeds 200 KB, the aggressive pass (2 KB caps, window dropped) runs.

Also: done.ts description bumps its self-advertised round limit 3 -> 5 so upcoming craft-surplus verifier passes have room to iterate.

- packages/core/src/context-prune.ts: split caps by recency + type
- packages/core/src/context-prune.test.ts: window behavior coverage
- packages/core/src/tools/done.ts: stop-after 3 -> 5 rounds
1 parent 69e35c5 commit 4e0de27

3 files changed

Lines changed: 162 additions & 62 deletions

File tree

packages/core/src/context-prune.test.ts

Lines changed: 51 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@ function assistantText(text: string): AgentMessage {
3939
} as unknown as AgentMessage;
4040
}
4141

42-
describe('buildTransformContext — size-based block compaction', () => {
42+
describe('buildTransformContext — size-based block compaction with recent-turn window', () => {
4343
it('is a no-op when every block is under its cap', async () => {
4444
const transform = buildTransformContext();
4545
const messages: AgentMessage[] = [
@@ -53,8 +53,8 @@ describe('buildTransformContext — size-based block compaction', () => {
5353
});
5454

5555
it('stubs a large assistant text block even on the LATEST message', async () => {
56-
// The production bug: model streamed a 9MB artifact as assistant text
57-
// on the final turn. v1 window-based prune preserved it verbatim.
56+
// Text cap applies to ALL turns. Guards against the `<artifact>` text
57+
// dump regression (assistant streamed 9 MB JSX as prose on the final turn).
5858
const transform = buildTransformContext();
5959
const huge = 'x'.repeat(50_000);
6060
const messages: AgentMessage[] = [userMsg('build it'), assistantText(huge)];
@@ -65,7 +65,9 @@ describe('buildTransformContext — size-based block compaction', () => {
6565
expect(text).toContain('50000B');
6666
});
6767

68-
it('summarizes a large toolCall.input, preserving name + id', async () => {
68+
it('keeps a large toolCall.input verbatim inside the recent window', async () => {
69+
// The model's own just-written str_replace must stay full-fidelity so it
70+
// can pick the next old_str from memory instead of guessing.
6971
const transform = buildTransformContext();
7072
const bulk = 'a'.repeat(20_000);
7173
const messages: AgentMessage[] = [
@@ -75,17 +77,39 @@ describe('buildTransformContext — size-based block compaction', () => {
7577
];
7678
const out = await transform(messages);
7779
const a = out[1] as {
78-
content: Array<{ type?: string; id?: string; name?: string; input?: unknown }>;
80+
content: Array<{ type?: string; id?: string; input?: { inputArg?: string } }>;
7981
};
8082
const tc = a.content.find((c) => c.type === 'toolCall');
8183
expect(tc?.id).toBe('call-0');
82-
expect(tc?.name).toBe('str_replace_based_edit_tool');
83-
const input = tc?.input as { _summarized?: boolean; _origBytes?: number };
84-
expect(input?._summarized).toBe(true);
85-
expect(input?._origBytes).toBeGreaterThan(10_000);
84+
expect(tc?.input?.inputArg).toBe(bulk);
8685
});
8786

88-
it('stubs a large toolResult body, keeping toolCallId for pi-ai shape', async () => {
87+
it('summarizes a large toolCall.input for older turns outside the window', async () => {
88+
const transform = buildTransformContext();
89+
const bulk = 'a'.repeat(30_000);
90+
const messages: AgentMessage[] = [userMsg('build')];
91+
messages.push(assistantWithToolCall('call-old', bulk));
92+
messages.push(toolResult('call-old', 'ok'));
93+
// Three more turns push call-old out of the 3-turn window.
94+
for (let i = 0; i < 3; i += 1) {
95+
messages.push(assistantWithToolCall(`t${i}`, 'small'));
96+
messages.push(toolResult(`t${i}`, 'ok'));
97+
}
98+
const out = await transform(messages);
99+
const oldAssistant = out[1] as {
100+
content: Array<{
101+
type?: string;
102+
id?: string;
103+
input?: { _summarized?: boolean; _origBytes?: number };
104+
}>;
105+
};
106+
const tc = oldAssistant.content.find((c) => c.type === 'toolCall');
107+
expect(tc?.id).toBe('call-old');
108+
expect(tc?.input?._summarized).toBe(true);
109+
expect(tc?.input?._origBytes ?? 0).toBeGreaterThan(20_000);
110+
});
111+
112+
it('keeps a large toolResult verbatim inside the recent window', async () => {
89113
const transform = buildTransformContext();
90114
const bulk = 'y'.repeat(20_000);
91115
const messages: AgentMessage[] = [
@@ -96,6 +120,22 @@ describe('buildTransformContext — size-based block compaction', () => {
96120
const out = await transform(messages);
97121
const tr = out[2] as { toolCallId?: string; content: Array<{ text?: string }> };
98122
expect(tr.toolCallId).toBe('call-0');
123+
expect(tr.content[0]?.text).toBe(bulk);
124+
});
125+
126+
it('stubs large toolResult bodies for older turns outside the window', async () => {
127+
const transform = buildTransformContext();
128+
const bulk = 'y'.repeat(20_000);
129+
const messages: AgentMessage[] = [userMsg('x')];
130+
messages.push(assistantWithToolCall('call-old', 'a'));
131+
messages.push(toolResult('call-old', bulk));
132+
for (let i = 0; i < 3; i += 1) {
133+
messages.push(assistantWithToolCall(`t${i}`, 'small'));
134+
messages.push(toolResult(`t${i}`, 'ok'));
135+
}
136+
const out = await transform(messages);
137+
const tr = out[2] as { toolCallId?: string; content: Array<{ text?: string }> };
138+
expect(tr.toolCallId).toBe('call-old');
99139
expect(tr.content[0]?.text?.startsWith('[tool result dropped')).toBe(true);
100140
});
101141

@@ -118,13 +158,9 @@ describe('buildTransformContext — size-based block compaction', () => {
118158
expect(out[0]).toBe(opening);
119159
});
120160

121-
it('tightens to aggressive caps when HARD_CAP_BYTES is exceeded', async () => {
161+
it('tightens to aggressive caps (ignoring window) when HARD_CAP_BYTES is exceeded', async () => {
122162
const transform = buildTransformContext();
123163
const messages: AgentMessage[] = [userMsg('go')];
124-
// 30 rounds with tool input just over the 8KB cap = 30 summarized at first
125-
// pass, but the metadata itself adds up. Force the hard cap by also adding
126-
// many text blocks between 2KB and 8KB — first pass keeps them, aggressive
127-
// compacts them.
128164
const midText = 'p'.repeat(6_000);
129165
for (let i = 0; i < 40; i += 1) {
130166
messages.push(assistantText(midText));
@@ -142,7 +178,6 @@ describe('buildTransformContext — size-based block compaction', () => {
142178
}
143179
}
144180
}
145-
// Aggressive cap is 2KB — the 6KB midText blocks should all be stubbed.
146181
expect(droppedTextCount).toBeGreaterThanOrEqual(35);
147182
});
148183
});

packages/core/src/context-prune.ts

Lines changed: 110 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -2,49 +2,65 @@
22
* Per-message size-based context compaction for pi-agent-core's
33
* `transformContext` hook. Runs before every LLM call.
44
*
5-
* Philosophy: **history is intent tracking, not payload storage.**
5+
* Philosophy: **history is intent tracking, not payload storage.** The model
6+
* needs the decision trail — which tools, in what order, with what shape —
7+
* not verbatim 9 MB artifact dumps or whole-file view returns from ten turns
8+
* ago. Current file state is always recoverable via ranged `view()`.
69
*
7-
* What the model needs from prior turns is the DECISION trail — which tools
8-
* it called, in what order, with what shape. What it does NOT need is its own
9-
* verbatim 9 MB artifact dump, or the full view() return of a 100 KB file it
10-
* already touched. Current state is always recoverable via view(), so there
11-
* is no information loss from stubbing prior payloads.
10+
* Evolution:
11+
* - v1 (window): kept last N turns verbatim, stubbed older. Missed the
12+
* dominant failure mode — a 9 MB `<artifact>` text dump sat inside the
13+
* keep-verbatim window and shipped 3.97 M tokens.
14+
* - v2 (windowless): stubbed every block over its cap regardless of
15+
* position. Safe, but over-aggressive after the prompt OVERRIDE block
16+
* eliminated the text-dump vector — the model's own latest str_replace
17+
* new_str got summarized, so picking the next old_str required guessing.
18+
* - v3 (this file): split behavior by block type.
19+
* · `assistant.content[*].text` is always capped (8 KB, all turns).
20+
* This is the regression guard: the one class of block that must
21+
* never be allowed to balloon, because a bad prompt interaction
22+
* can resurrect the `<artifact>` dump.
23+
* · `assistant.content[*].toolCall.input` and
24+
* `toolResult.content[*].text` are capped only outside a small
25+
* recent-turn window. Inside the window they stay verbatim so the
26+
* model reads its own just-written section and the latest view()
27+
* output in full fidelity. Outside the window, large payloads
28+
* collapse to a one-line stub.
1229
*
13-
* v1 (window-based) failed in production because the 9 MB payload was the
14-
* LATEST assistant text block (the model streaming out the final artifact as
15-
* prose right before `done`). It sat inside the "keep verbatim" window, and
16-
* the request still shipped 3.97 M tokens.
17-
*
18-
* v3 (this file) drops the window entirely. Every message is inspected per
19-
* block; any block whose serialized size exceeds its per-type cap is stubbed.
20-
* Caps are small on purpose — 8 KB fits any reasonable paragraph of reasoning
21-
* or normal tool argument, but cannot carry an HTML/JSX artifact, a base64
22-
* image, or a whole-file view return. The model's most recent decision is
23-
* still visible at full fidelity for small outputs.
24-
*
25-
* Three block-level caps:
26-
* - `assistant.content[*].text` — model prose / streamed artifact text.
27-
* - `assistant.content[*].toolCall.input` — args the model sent to a tool.
28-
* - `toolResult.content[*].text` — host-side tool return payload.
30+
* Block-level caps:
31+
* - TEXT_BLOCK_LIMIT — assistant prose, ALL turns.
32+
* - TOOL_INPUT_LIMIT — assistant.toolCall.input, older turns only.
33+
* - TOOL_RESULT_LIMIT — toolResult.text, older turns only.
2934
*
3035
* Stub format carries bytes + a short preview so the model can tell what
3136
* got dropped, and (for tool calls) keeps tool NAME + id so pi-ai's shape
3237
* validation remains happy.
3338
*
3439
* Safety net: after per-block stubbing, if the grand total still exceeds
35-
* `HARD_CAP_BYTES`, we shrink caps further and re-run. Catches pathological
36-
* runs with many just-under-threshold blocks.
40+
* `HARD_CAP_BYTES`, we shrink caps further (including within the window)
41+
* and re-run. Catches pathological runs with many just-under-threshold
42+
* blocks.
3743
*/
3844

3945
import type { AgentMessage } from '@mariozechner/pi-agent-core';
4046
import { type CoreLogger, NOOP_LOGGER } from './logger.js';
4147

4248
const TEXT_BLOCK_LIMIT = 8 * 1024;
43-
const TOOL_INPUT_LIMIT = 8 * 1024;
49+
const TOOL_INPUT_LIMIT = 24 * 1024;
4450
const TOOL_RESULT_LIMIT = 8 * 1024;
4551
const HARD_CAP_BYTES = 200_000;
4652
const AGGRESSIVE_BLOCK_LIMIT = 2 * 1024;
4753

54+
/**
55+
* Number of most-recent non-user messages whose tool payloads (toolCall.input
56+
* and toolResult.text) stay verbatim. Assistant TEXT is still capped inside
57+
* this window — see TEXT_BLOCK_LIMIT rationale above.
58+
*
59+
* 3 covers "current turn is reading the previous turn's str_replace + its
60+
* toolResult" in the typical one-section-per-turn polish cadence.
61+
*/
62+
const RECENT_WINDOW = 3;
63+
4864
function estimateBytes(messages: AgentMessage[]): number {
4965
let total = 0;
5066
for (const m of messages) {
@@ -66,7 +82,11 @@ function stubText(text: string, label: string): string {
6682
return `[${label}${text.length}B, head: "${preview(text)}"]`;
6783
}
6884

69-
function compactAssistant(m: AgentMessage, textLimit: number, toolLimit: number): AgentMessage {
85+
function compactAssistant(
86+
m: AgentMessage,
87+
textLimit: number,
88+
toolLimit: number | null,
89+
): AgentMessage {
7090
const original = m as unknown as {
7191
role: 'assistant';
7292
content?: Array<Record<string, unknown>>;
@@ -81,7 +101,7 @@ function compactAssistant(m: AgentMessage, textLimit: number, toolLimit: number)
81101
changed = true;
82102
return { ...block, text: stubText(text, 'prior assistant output dropped') };
83103
}
84-
if (type === 'toolCall') {
104+
if (type === 'toolCall' && toolLimit !== null) {
85105
const input = block['input'];
86106
let origBytes = 0;
87107
let preview = '';
@@ -105,7 +125,8 @@ function compactAssistant(m: AgentMessage, textLimit: number, toolLimit: number)
105125
return { ...(original as object), content: nextContent } as unknown as AgentMessage;
106126
}
107127

108-
function compactToolResult(m: AgentMessage, limit: number): AgentMessage {
128+
function compactToolResult(m: AgentMessage, limit: number | null): AgentMessage {
129+
if (limit === null) return m;
109130
const original = m as unknown as {
110131
role: 'toolResult';
111132
content?: Array<{ type: string; text?: string }>;
@@ -123,15 +144,48 @@ function compactToolResult(m: AgentMessage, limit: number): AgentMessage {
123144
return { ...(original as object), content: nextContent } as unknown as AgentMessage;
124145
}
125146

126-
function applyCaps(
127-
messages: AgentMessage[],
128-
textLimit: number,
129-
toolInputLimit: number,
130-
toolResultLimit: number,
131-
): AgentMessage[] {
132-
return messages.map((m) => {
133-
if (m.role === 'assistant') return compactAssistant(m, textLimit, toolInputLimit);
134-
if (m.role === 'toolResult') return compactToolResult(m, toolResultLimit);
147+
/**
148+
* Index threshold (inclusive) — messages at or after this index are "recent"
149+
* and their tool payloads stay verbatim. Counts assistant + toolResult roles
150+
* from the tail; user messages are never a prune target but also don't
151+
* consume window slots.
152+
*/
153+
function computeWindowStart(messages: AgentMessage[], windowTurns: number): number {
154+
if (windowTurns <= 0) return messages.length;
155+
let seen = 0;
156+
for (let i = messages.length - 1; i >= 0; i -= 1) {
157+
const role = messages[i]?.role;
158+
if (role === 'assistant' || role === 'toolResult') {
159+
seen += 1;
160+
if (seen >= windowTurns) return i;
161+
}
162+
}
163+
return 0;
164+
}
165+
166+
interface CapConfig {
167+
textLimit: number;
168+
toolInputLimitOld: number;
169+
toolResultLimitOld: number;
170+
toolInputLimitRecent: number | null;
171+
toolResultLimitRecent: number | null;
172+
windowTurns: number;
173+
}
174+
175+
function applyCaps(messages: AgentMessage[], cfg: CapConfig): AgentMessage[] {
176+
const windowStart = computeWindowStart(messages, cfg.windowTurns);
177+
return messages.map((m, idx) => {
178+
const isRecent = idx >= windowStart;
179+
if (m.role === 'assistant') {
180+
return compactAssistant(
181+
m,
182+
cfg.textLimit,
183+
isRecent ? cfg.toolInputLimitRecent : cfg.toolInputLimitOld,
184+
);
185+
}
186+
if (m.role === 'toolResult') {
187+
return compactToolResult(m, isRecent ? cfg.toolResultLimitRecent : cfg.toolResultLimitOld);
188+
}
135189
return m;
136190
});
137191
}
@@ -143,25 +197,36 @@ export function buildTransformContext(
143197
if (messages.length === 0) return messages;
144198

145199
const before = estimateBytes(messages);
146-
const first = applyCaps(messages, TEXT_BLOCK_LIMIT, TOOL_INPUT_LIMIT, TOOL_RESULT_LIMIT);
200+
const first = applyCaps(messages, {
201+
textLimit: TEXT_BLOCK_LIMIT,
202+
toolInputLimitOld: TOOL_INPUT_LIMIT,
203+
toolResultLimitOld: TOOL_RESULT_LIMIT,
204+
toolInputLimitRecent: null,
205+
toolResultLimitRecent: null,
206+
windowTurns: RECENT_WINDOW,
207+
});
147208
const firstSize = estimateBytes(first);
148209

149210
log.info('[context-prune] step=caps', {
150211
messages: messages.length,
151212
before,
152213
after: firstSize,
153214
textLimit: TEXT_BLOCK_LIMIT,
154-
toolLimit: TOOL_INPUT_LIMIT,
215+
toolInputLimit: TOOL_INPUT_LIMIT,
216+
toolResultLimit: TOOL_RESULT_LIMIT,
217+
window: RECENT_WINDOW,
155218
});
156219

157220
if (firstSize <= HARD_CAP_BYTES) return first;
158221

159-
const aggressive = applyCaps(
160-
messages,
161-
AGGRESSIVE_BLOCK_LIMIT,
162-
AGGRESSIVE_BLOCK_LIMIT,
163-
AGGRESSIVE_BLOCK_LIMIT,
164-
);
222+
const aggressive = applyCaps(messages, {
223+
textLimit: AGGRESSIVE_BLOCK_LIMIT,
224+
toolInputLimitOld: AGGRESSIVE_BLOCK_LIMIT,
225+
toolResultLimitOld: AGGRESSIVE_BLOCK_LIMIT,
226+
toolInputLimitRecent: AGGRESSIVE_BLOCK_LIMIT,
227+
toolResultLimitRecent: AGGRESSIVE_BLOCK_LIMIT,
228+
windowTurns: 0,
229+
});
165230
const aggressiveSize = estimateBytes(aggressive);
166231
log.info('[context-prune] step=aggressive', {
167232
messages: messages.length,

packages/core/src/tools/done.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -284,7 +284,7 @@ export function makeDoneTool(
284284
'console errors / load failures, then replies with ' +
285285
'`{ status: "ok" | "has_errors", errors: [...] }`. If errors come back, ' +
286286
'fix them with str_replace_based_edit_tool and call `done` again. ' +
287-
'Stop calling once status is "ok" or after 3 rounds.',
287+
'Stop calling once status is "ok" or after 5 rounds.',
288288
parameters: DoneParams,
289289
async execute(_id, params): Promise<AgentToolResult<DoneDetails>> {
290290
const path = params.path ?? 'index.html';

0 commit comments

Comments (0)