fix(core): retry first-turn agent generation on transient errors (#125) (#163)

hqhq1025 · web-flow · commit 63fa316fc418 · 2026-04-23T10:53:12.000+08:00
## Summary

`USE_AGENT_RUNTIME` 默认 ON，99% 用户走 `generateViaAgent → agent.prompt()` 单次调用，**没有任何重试**。旧的 `completeWithRetry`（3 次指数退避）只在 USE_AGENT_RUNTIME=0 时才走。pi-ai 自身也无内建重试。结果：429/5xx/transient network 直接抛错给用户。

## What changed

- **`packages/providers/src/retry.ts`**：抽出通用 `withBackoff<T>(fn, opts)`，承担 sleep / jitter / classify / Retry-After / abort 全部逻辑。`completeWithRetry` 改写为薄 wrapper（注入 provider-error normalization + `provider.error` 日志），行为完全不变，原 15 个测试不动
- **`packages/providers/src/index.ts`**：export `withBackoff` / `BackoffOptions` / `RetryDecision`
- **`packages/core/src/agent.ts`**：`input.history.length === 0`（first turn，**幂等**）时用 `withBackoff({ maxRetries: 3 })` 包 `agent.prompt() + agent.waitForIdle()`。即使是 first turn，如果失败的那次尝试已经向 `agent.state.messages` 追加了消息（模型已产出 token / tool call），该错误会被标记为不可重试，避免重放 side effects。non-first-turn 仍走 `sendOnce()`，避免多 turn 对话被重发污染状态
- 重试通过 `log.warn('[generate] step=send_request.retry', ...)` 暴露，UI feedback 留作 follow-up
- `USE_AGENT_RUNTIME` 默认值未动；Codex 401 路径未动（follow-up）

## Test plan

- [x] 9 new `withBackoff` tests：first-try success / 503→ok / 429 Retry-After / 4xx no retry / exhaustion / pre-aborted signal / mid-backoff abort / custom classify
- [x] 5 new agent tests：first-turn 500→success / first-turn 3×500 exhaustion / first-turn 401 no retry / first-turn 500 after partial assistant message no retry（side-effect guard） / **non-first-turn 500 no retry**（关键：保护 multi-turn 状态）
- [x] `pnpm test` providers (139) + core (220) all green
- [x] `pnpm typecheck` + `pnpm lint` clean
- [x] changeset added (patch bump)

Closes #125

---------

Signed-off-by: hqhq1025 <1506751656@qq.com>
diff --git a/.changeset/agent-first-turn-retry.md b/.changeset/agent-first-turn-retry.md
@@ -0,0 +1,6 @@
+---
+"@open-codesign/core": patch
+"@open-codesign/providers": patch
+---
+
+Fix: retry first-turn agent generation on transient provider errors (5xx, 429, network). The agent runtime now wraps `agent.prompt()` + `waitForIdle()` in a backoff loop for the first turn only — multi-turn requests still fail fast to avoid corrupting mid-session tool state. Extracted a generic `withBackoff` helper in `@open-codesign/providers` that shares the existing classify/jitter/Retry-After/abort logic with `completeWithRetry`. (#125)
diff --git a/packages/core/src/agent.test.ts b/packages/core/src/agent.test.ts
@@ -37,6 +37,20 @@ interface AgentScript {
   stopReason?: 'stop' | 'error' | 'aborted';
   errorMessage?: string;
   promptThrows?: Error;
+  /**
+   * When > 0, `promptThrows` is thrown only on the first N prompt() calls;
+   * subsequent calls resolve normally. Lets tests script "transient failure
+   * then success" sequences for first-turn retry coverage.
+   */
+  promptThrowsTimes?: number;
+  /**
+   * When true together with `promptThrows`, the mock pushes a partial
+   * assistant message onto `agent.state.messages` BEFORE throwing on
+   * each failing attempt. Simulates "model streamed tokens / tool call
+   * then the connection dropped" — the real pi-agent-core path where a
+   * retry at the outer send boundary would replay tool side effects.
+   */
+  promptPushesAssistantBeforeThrow?: boolean;
   /**
    * When set, the mock invokes `options.getApiKey` before emitting the
    * assistant response and — if it throws — converts the throw into an
@@ -64,7 +78,34 @@ vi.mock('@mariozechner/pi-agent-core', () => {
     }
     async prompt(message: unknown): Promise<void> {
       this.call.prompts.push({ message });
-      if (scriptedAgent.promptThrows) throw scriptedAgent.promptThrows;
+      if (scriptedAgent.promptThrows) {
+        const limit = scriptedAgent.promptThrowsTimes ?? Number.POSITIVE_INFINITY;
+        if (this.call.prompts.length <= limit) {
+          if (scriptedAgent.promptPushesAssistantBeforeThrow) {
+            const partial: AgentMessage = {
+              role: 'assistant',
+              // biome-ignore lint/suspicious/noExplicitAny: same.
+              api: 'anthropic-messages' as any,
+              // biome-ignore lint/suspicious/noExplicitAny: same.
+              provider: 'anthropic' as any,
+              model: 'mock-model',
+              content: [{ type: 'text', text: 'partial tokens before drop' }],
+              usage: {
+                input: 0,
+                output: 0,
+                cacheRead: 0,
+                cacheWrite: 0,
+                totalTokens: 0,
+                cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
+              },
+              stopReason: 'error',
+              timestamp: Date.now(),
+            };
+            this.state.messages.push(partial);
+          }
+          throw scriptedAgent.promptThrows;
+        }
+      }
 
       // Simulate pi-agent-core's per-turn getApiKey invocation. Real
       // runAgentLoop calls `await config.getApiKey(provider)` (line 156 of
@@ -406,9 +447,14 @@ describe('generateViaAgent() — Phase 1 pass-through', () => {
       signal: controller.signal,
     });
     controller.abort();
-    // Mock's prompt() is synchronous enough to complete; just verify the wire-up
-    // registered by confirming the Agent.abort() call counter after settlement.
-    await promise;
+    // With first-turn withBackoff the pre-call signal check may short-circuit
+    // the prompt entirely (throwing PROVIDER_ABORTED), or the prompt may have
+    // already completed; either way the `signal → agent.abort()` listener
+    // registered before sending should have fired.
+    await promise.catch(() => {
+      // Expected when abort arrives before the withBackoff loop enters its
+      // first iteration.
+    });
     expect(agentCalls[0]?.aborted).toBe(true);
   });
 
@@ -474,6 +520,131 @@ describe('generateViaAgent() — Phase 1 pass-through', () => {
   });
 });
 
+describe('generateViaAgent() — first-turn retry', () => {
+  class HttpError extends Error {
+    constructor(
+      message: string,
+      public readonly status: number,
+    ) {
+      super(message);
+      this.name = 'HttpError';
+    }
+  }
+
+  it('retries a transient 500 on the first turn and resolves on the second attempt', async () => {
+    vi.useFakeTimers();
+    try {
+      scriptedAgent = {
+        assistantText: RESPONSE_WITH_ARTIFACT,
+        promptThrows: new HttpError('upstream 500', 500),
+        promptThrowsTimes: 1,
+      };
+      const onRetry = vi.fn();
+      const promise = generateViaAgent(
+        {
+          prompt: 'design a meditation app',
+          history: [],
+          model: MODEL,
+          apiKey: 'sk-test',
+        },
+        { onRetry },
+      );
+      await vi.runAllTimersAsync();
+      const result = await promise;
+      expect(result.artifacts).toHaveLength(1);
+      expect(agentCalls[0]?.prompts.length).toBe(2);
+      expect(onRetry).toHaveBeenCalledTimes(1);
+      expect(onRetry.mock.calls[0]?.[0].reason).toMatch(/server error/);
+    } finally {
+      vi.useRealTimers();
+    }
+  });
+
+  it('throws after three consecutive 500s on the first turn (retries exhausted)', async () => {
+    vi.useFakeTimers();
+    try {
+      scriptedAgent = {
+        assistantText: '',
+        promptThrows: new HttpError('still down', 500),
+      };
+      const promise = generateViaAgent({
+        prompt: 'design a dashboard',
+        history: [],
+        model: MODEL,
+        apiKey: 'sk-test',
+      });
+      // Swallow the expected rejection while we drain timers so the test
+      // does not surface it as an unhandled promise.
+      const settled = promise.catch((err: unknown) => ({ rejected: err }));
+      await vi.runAllTimersAsync();
+      const outcome = (await settled) as { rejected?: unknown };
+      expect(outcome.rejected).toBeDefined();
+      expect(agentCalls[0]?.prompts.length).toBe(3);
+    } finally {
+      vi.useRealTimers();
+    }
+  });
+
+  it('does not retry non-transient 4xx client errors (no 401 replay)', async () => {
+    // 401 is a credential failure, not a transient condition, so a single attempt is expected.
+    // (429 is also 4xx but is retried per the changeset, hence "non-transient" in the title.)
+    // No fake timers are installed here: no backoff sleep should be scheduled.
+    scriptedAgent = { assistantText: '', promptThrows: new HttpError('unauthorized', 401) };
+    await expect(
+      generateViaAgent({
+        prompt: 'design a dashboard',
+        history: [],
+        model: MODEL,
+        apiKey: 'sk-test',
+      }),
+    ).rejects.toBeTruthy();
+    expect(agentCalls[0]?.prompts.length).toBe(1);
+  });
+
+  it('does not retry once the agent has produced an assistant message (side-effect guard)', async () => {
+    // First-turn + transient 500, BUT the mock pushes a partial assistant
+    // message before throwing, simulating "model already emitted tokens /
+    // tool calls before the connection dropped". Replaying would re-run
+    // any text_editor / set_todos side effects, so retry must be blocked
+    // regardless of the HTTP status. A single attempt is the only safe move.
+    scriptedAgent = {
+      assistantText: '',
+      promptThrows: new HttpError('upstream 500', 500),
+      promptPushesAssistantBeforeThrow: true,
+    };
+    await expect(
+      generateViaAgent({
+        prompt: 'design a dashboard',
+        history: [],
+        model: MODEL,
+        apiKey: 'sk-test',
+      }),
+    ).rejects.toBeTruthy();
+    expect(agentCalls[0]?.prompts.length).toBe(1);
+  });
+
+  it('does not retry when history is non-empty (protects multi-turn agent state)', async () => {
+    scriptedAgent = {
+      assistantText: '',
+      promptThrows: new HttpError('upstream 500', 500),
+    };
+    await expect(
+      generateViaAgent({
+        prompt: 'refine this',
+        history: [
+          { role: 'user', content: 'first request' },
+          { role: 'assistant', content: 'first reply' },
+        ],
+        model: MODEL,
+        apiKey: 'sk-test',
+      }),
+    ).rejects.toBeTruthy();
+    // Single attempt: replaying a partial multi-turn session would corrupt
+    // tool state, so the second+ turn must surface transient errors directly.
+    expect(agentCalls[0]?.prompts.length).toBe(1);
+  });
+});
+
 describe('FRAME_TEMPLATES — device frame starter assets', () => {
   it('exposes iphone, ipad, watch, android, and macos-safari frames as JSX modules with EDITMODE markers', async () => {
     const { FRAME_TEMPLATES } = await import('./frames/index.js');
diff --git a/packages/core/src/agent.ts b/packages/core/src/agent.ts
@@ -30,11 +30,13 @@ import {
 } from '@mariozechner/pi-agent-core';
 import type { Message as PiAiMessage, Model as PiAiModel } from '@mariozechner/pi-ai';
 import { type ArtifactEvent, createArtifactParser } from '@open-codesign/artifacts';
-import type { RetryReason } from '@open-codesign/providers';
+import type { RetryDecision, RetryReason } from '@open-codesign/providers';
 import {
+  classifyError,
   claudeCodeIdentityHeaders,
   looksLikeClaudeOAuthToken,
   shouldForceClaudeCodeIdentity,
+  withBackoff,
 } from '@open-codesign/providers';
 import {
   type Artifact,
@@ -822,9 +824,59 @@ export async function generateViaAgent(
 
   log.info('[generate] step=send_request', ctx);
   const sendStart = Date.now();
+  // First-turn-only retry, further guarded by a side-effect check. Multi-turn
+  // requests carry half-complete agent state (tool calls mid-flight, transcript
+  // accumulated in pi-agent-core's internal loop) — retrying would replay
+  // partial progress and corrupt the session. Even on the first turn, retrying
+  // is safe only before any assistant message has landed in `agent.state`:
+  // once the model has emitted tokens or tool calls, side effects (text_editor
+  // writes, set_todos state) have already fired and a retry would re-run them.
+  // The pre-attempt snapshot of `agent.state.messages.length` lets us detect
+  // whether the failed attempt produced any such artefact and, if so, mark the
+  // error as non-retryable.
+  const isFirstTurn = input.history.length === 0;
+  const RETRY_BLOCKED = Symbol.for('open-codesign.retry.blocked');
+  type RetryBlockedError = Error & { [RETRY_BLOCKED]?: true };
+  const sendOnce = async (): Promise<void> => {
+    const preLen = agent.state.messages.length;
+    try {
+      await agent.prompt(userContent);
+      await agent.waitForIdle();
+    } catch (err) {
+      if (agent.state.messages.length > preLen) {
+        const tagged = (err instanceof Error ? err : new Error(String(err))) as RetryBlockedError;
+        tagged[RETRY_BLOCKED] = true;
+        throw tagged;
+      }
+      throw err;
+    }
+  };
   try {
-    await agent.prompt(userContent);
-    await agent.waitForIdle();
+    if (isFirstTurn) {
+      const retryOpts: Parameters<typeof withBackoff>[1] = {
+        maxRetries: 3,
+        classify: (err): RetryDecision => {
+          // Side-effect guard first; `?.` keeps a thrown null/undefined from raising a TypeError.
+          const blocked = (err as RetryBlockedError | null | undefined)?.[RETRY_BLOCKED];
+          if (blocked) return { retry: false, reason: 'agent already produced side effects' };
+          return classifyError(err);
+        },
+        onRetry: (info: RetryReason) => {
+          log.warn('[generate] step=send_request.retry', {
+            ...ctx,
+            attempt: info.attempt,
+            totalAttempts: info.totalAttempts,
+            delayMs: info.delayMs,
+            reason: info.reason,
+          });
+          deps.onRetry?.(info);
+        },
+      };
+      if (input.signal) retryOpts.signal = input.signal;
+      await withBackoff(sendOnce, retryOpts);
+    } else {
+      await sendOnce();
+    }
   } catch (err) {
     log.error('[generate] step=send_request.fail', {
       ...ctx,
diff --git a/packages/providers/src/index.ts b/packages/providers/src/index.ts
@@ -411,8 +411,13 @@ export {
   withClaudeCodeIdentity,
 } from './claude-code-compat';
 
-export { completeWithRetry, classifyError, sleepWithAbort } from './retry';
-export type { CompleteWithRetryOptions, RetryReason } from './retry';
+export { completeWithRetry, classifyError, sleepWithAbort, withBackoff } from './retry';
+export type {
+  BackoffOptions,
+  CompleteWithRetryOptions,
+  RetryDecision,
+  RetryReason,
+} from './retry';
 
 export { injectSkillsIntoMessages, formatSkillsForPrompt, filterActive } from './skill-injector';
 
diff --git a/packages/providers/src/retry.test.ts b/packages/providers/src/retry.test.ts
diff --git a/packages/providers/src/retry.ts b/packages/providers/src/retry.ts