fix: send screenshot attachments to ChatGPT Codex (#157)

Sun-sunshine06 · web-flow · commit a965e581ff45 · 2026-04-22T23:17:36.000+08:00
## Summary

Fixes a `chatgpt-codex` bug where uploaded screenshots were only reduced
to filename hints, so models like `gpt-5.4` could ignore the attached
visual reference entirely. This patch keeps text attachments unchanged,
encodes supported image attachments as data URLs in prompt preparation,
and forwards them as Responses `input_image` parts on the ChatGPT Codex
generate path.

## Type of change

- [x] Bug fix
- [ ] New feature
- [ ] Refactor (no behavior change)
- [ ] Documentation
- [ ] Build / CI / tooling
- [ ] Breaking change

## Linked issue

- Refs local repro: attached screenshot uploads were not being used by
the model response.

## Checklist

- [ ] I read [`docs/VISION.md`](../docs/VISION.md),
[`docs/PRINCIPLES.md`](../docs/PRINCIPLES.md), and
[`CLAUDE.md`](../CLAUDE.md) before starting
- [x] Commits are signed with DCO (`git commit -s`)
- [ ] `pnpm lint && pnpm typecheck && pnpm test` passes locally
- [x] Added/updated tests for the change
- [x] Added a changeset (`pnpm changeset`) if user-visible
- [ ] Updated docs if behavior changed

Validation run for this PR:
- `corepack pnpm --filter @open-codesign/desktop test -- --run
src/main/prompt-context.test.ts src/main/codex-generate.test.ts`
- `corepack pnpm --filter @open-codesign/desktop typecheck`
- pre-push checks passed: workspace typecheck + `biome check .`

Note on full test suite:
- `pnpm test` still has existing Windows baseline failures unrelated to
this patch (provider token-store permission assertion, core
builtin-skill loader expectation, exporter PDF timeout, and opencode
path separator assertions).

## Screenshots / recordings (UI changes)

- N/A

---------

Signed-off-by: Sun-sunshine06 <Sun-sunshine06@users.noreply.github.com>
Co-authored-by: Sun-sunshine06 <Sun-sunshine06@users.noreply.github.com>
Co-authored-by: Qihan <>
diff --git a/.changeset/codex-image-attachments.md b/.changeset/codex-image-attachments.md
@@ -0,0 +1,10 @@
+---
+'@open-codesign/desktop': patch
+'@open-codesign/core': patch
+---
+
+fix: send attached screenshots to ChatGPT Codex as image inputs
+
+Image attachments in the desktop app were previously reduced to filename-only hints on the `chatgpt-codex` route, so models like `gpt-5.4` could ignore uploaded screenshots entirely.
+
+This change keeps the existing text-attachment behavior, but reads supported image files into data URLs and forwards them as Responses `input_image` parts for ChatGPT Codex generations.
diff --git a/apps/desktop/src/main/prompt-context.test.ts b/apps/desktop/src/main/prompt-context.test.ts
@@ -32,15 +32,34 @@ describe('preparePromptContext', () => {
   });
 
   it('allows binary attachments (png) up to 10MB - 500KB png passes', async () => {
-    // Binary attachments only get filename, no content read - allowed larger
     await expect(
       preparePromptContext({
         attachments: [{ path: 'C:/repo/image.png', name: 'image.png', size: 543_034 }],
       }),
     ).rejects.toMatchObject({
       code: 'ATTACHMENT_READ_FAILED',
     });
-    // It fails because the file doesn't exist, but importantly - NOT ATTACHMENT_TOO_LARGE
+  });
+
+  it('encodes supported image attachments as data URLs', async () => {
+    const dir = await fs.mkdtemp(path.join(os.tmpdir(), 'codesign-image-attachment-'));
+    const filePath = path.join(dir, 'shot.png');
+    const pngBytes = Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0x00]);
+    await fs.writeFile(filePath, pngBytes);
+
+    const result = await preparePromptContext({
+      attachments: [{ path: filePath, name: 'shot.png', size: pngBytes.length }],
+    });
+
+    expect(result.attachments).toHaveLength(1);
+    expect(result.attachments[0]).toMatchObject({
+      name: 'shot.png',
+      mediaType: 'image/png',
+    });
+    expect(result.attachments[0]?.imageDataUrl).toBe(
+      `data:image/png;base64,${pngBytes.toString('base64')}`,
+    );
+    expect(result.attachments[0]?.excerpt).toBeUndefined();
   });
 
   it('throws ATTACHMENT_TOO_LARGE for unknown extension text > 256KB', async () => {
diff --git a/apps/desktop/src/main/prompt-context.ts b/apps/desktop/src/main/prompt-context.ts
@@ -28,9 +28,19 @@ const TEXT_EXTS = new Set([
   '.yml',
 ]);
 
+const IMAGE_MIME_TYPES: Record<string, string> = {
+  '.avif': 'image/avif',
+  '.bmp': 'image/bmp',
+  '.gif': 'image/gif',
+  '.jpeg': 'image/jpeg',
+  '.jpg': 'image/jpeg',
+  '.png': 'image/png',
+  '.webp': 'image/webp',
+};
+
 const MAX_ATTACHMENT_CHARS = 6_000;
 const MAX_TEXT_ATTACHMENT_BYTES = 256_000;
-const MAX_BINARY_ATTACHMENT_BYTES = 10_000_000; // 10MB - binary attachments only need filename, not content
+const MAX_BINARY_ATTACHMENT_BYTES = 10_000_000;
 const MAX_URL_EXCERPT_CHARS = 1_200;
 const MAX_URL_RESPONSE_BYTES = 256_000;
 const REFERENCE_CONTENT_TYPES = ['text/html', 'application/xhtml+xml'];
@@ -60,9 +70,8 @@ function isProbablyText(buffer: Buffer, extension: string): boolean {
 
 async function readAttachment(file: LocalInputFile): Promise<AttachmentContext> {
   const extension = extname(file.name).toLowerCase();
+  const imageMimeType = IMAGE_MIME_TYPES[extension];
 
-  // Binary attachments (images, etc) only need filename - we don't send content to LLM
-  // So allow much larger size limit
   const isKnownTextExtension = TEXT_EXTS.has(extension);
   const maxFileBytes = isKnownTextExtension
     ? MAX_TEXT_ATTACHMENT_BYTES
@@ -97,8 +106,18 @@ async function readAttachment(file: LocalInputFile): Promise<AttachmentContext>
     }
 
     if (!looksText) {
-      // Definitely binary - we don't need any more content
-      buffer = probe;
+      if (imageMimeType) {
+        const length = Math.max(
+          1,
+          Math.min(file.size || MAX_BINARY_ATTACHMENT_BYTES, maxFileBytes),
+        );
+        const fullBuffer = Buffer.alloc(length);
+        const { bytesRead } = await handle.read(fullBuffer, 0, fullBuffer.length, 0);
+        buffer = fullBuffer.subarray(0, bytesRead);
+      } else {
+        // Non-image binary files stay filename-only for now.
+        buffer = probe;
+      }
     } else {
       // It looks like text and fits within limit - read the whole thing
       const length = Math.max(
@@ -127,6 +146,15 @@ async function readAttachment(file: LocalInputFile): Promise<AttachmentContext>
   }
 
   if (!isProbablyText(buffer, extension)) {
+    if (imageMimeType) {
+      return {
+        name: file.name,
+        path: file.path,
+        note: 'Attached as an image input. Use the visual content directly, not just the filename.',
+        mediaType: imageMimeType,
+        imageDataUrl: `data:${imageMimeType};base64,${buffer.toString('base64')}`,
+      };
+    }
     return {
       name: file.name,
       path: file.path,
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
@@ -61,6 +61,8 @@ export interface AttachmentContext {
   path: string;
   excerpt?: string | undefined;
   note?: string | undefined;
+  mediaType?: string | undefined;
+  imageDataUrl?: string | undefined;
 }
 
 export interface ReferenceUrlContext {
@@ -160,11 +162,34 @@ interface ModelRunInput {
   signal?: AbortSignal | undefined;
   onRetry?: ((info: RetryReason) => void) | undefined;
   messages: ChatMessage[];
+  userImages?: Array<{ data: string; mimeType: string }> | undefined;
   logger?: CoreLogger | undefined;
   /** Log step namespace, e.g. 'generate' or 'apply_comment'. Defaults to 'generate'. */
   logScope?: string | undefined;
 }
 
+function attachmentToImageInput(
+  attachment: AttachmentContext,
+): { data: string; mimeType: string } | null {
+  if (!attachment.imageDataUrl || !attachment.mediaType) return null;
+  const prefix = `data:${attachment.mediaType};base64,`;
+  if (!attachment.imageDataUrl.startsWith(prefix)) return null;
+  return {
+    data: attachment.imageDataUrl.slice(prefix.length),
+    mimeType: attachment.mediaType,
+  };
+}
+
+function imageInputsForWire(
+  attachments: AttachmentContext[] | undefined,
+  wire: WireApi | undefined,
+): Array<{ data: string; mimeType: string }> {
+  if (wire !== 'openai-codex-responses') return [];
+  return (attachments ?? [])
+    .map((attachment) => attachmentToImageInput(attachment))
+    .filter((image): image is { data: string; mimeType: string } => image !== null);
+}
+
 function createHtmlArtifact(content: string, index: number): Artifact {
   return {
     id: `design-${index + 1}`,
@@ -333,6 +358,7 @@ async function runModel(input: ModelRunInput): Promise<GenerateOutput> {
           ...(input.baseUrl !== undefined ? { baseUrl: input.baseUrl } : {}),
           ...(input.wire !== undefined ? { wire: input.wire } : {}),
           ...(input.httpHeaders !== undefined ? { httpHeaders: input.httpHeaders } : {}),
+          ...(input.userImages !== undefined ? { userImages: input.userImages } : {}),
           ...(input.allowKeyless === true ? { allowKeyless: true } : {}),
           ...(input.signal !== undefined ? { signal: input.signal } : {}),
           maxTokens: MAX_OUTPUT_TOKENS,
@@ -645,6 +671,7 @@ export async function generate(input: GenerateInput): Promise<GenerateOutput> {
     signal: input.signal,
     onRetry: input.onRetry,
     messages,
+    userImages: imageInputsForWire(input.attachments, input.wire),
     logger: input.logger,
   });
   return skillResult.warnings.length > 0
@@ -698,6 +725,7 @@ export async function applyComment(input: ApplyCommentInput): Promise<GenerateOu
     signal: input.signal,
     onRetry: input.onRetry,
     messages,
+    userImages: imageInputsForWire(input.attachments, input.wire),
     logger: input.logger,
     logScope: 'apply_comment',
   });
diff --git a/packages/providers/src/index.test.ts b/packages/providers/src/index.test.ts
@@ -197,4 +197,110 @@ describe('complete', () => {
 
     expect(result.content).toBe('ok');
   });
+
+  it('appends image inputs to the final user turn for openai-codex-responses', async () => {
+    getModelMock.mockReturnValue({
+      id: 'gpt-5.4',
+      api: 'openai-codex-responses',
+      provider: 'openai-codex',
+      input: ['text', 'image'],
+    });
+    completeSimpleMock.mockImplementationOnce(async (_model, context) => {
+      expect(context.messages).toEqual([
+        {
+          role: 'user',
+          content: 'earlier turn',
+          timestamp: 1,
+        },
+        {
+          role: 'assistant',
+          content: [{ type: 'text', text: 'tell me more' }],
+          api: 'openai-codex-responses',
+          provider: 'openai-codex',
+          model: 'gpt-5.4',
+          usage: {
+            input: 0,
+            output: 0,
+            cacheRead: 0,
+            cacheWrite: 0,
+            totalTokens: 0,
+            cost: {
+              input: 0,
+              output: 0,
+              cacheRead: 0,
+              cacheWrite: 0,
+              total: 0,
+            },
+          },
+          stopReason: 'stop',
+          timestamp: 2,
+        },
+        {
+          role: 'user',
+          content: [
+            { type: 'text', text: 'use this screenshot' },
+            { type: 'image', data: 'AAAA', mimeType: 'image/png' },
+          ],
+          timestamp: 3,
+        },
+      ]);
+      return {
+        role: 'assistant',
+        content: [{ type: 'text', text: 'ok' }],
+        api: 'openai-codex-responses',
+        provider: 'openai-codex',
+        model: 'gpt-5.4',
+        usage: {
+          input: 1,
+          output: 1,
+          cacheRead: 0,
+          cacheWrite: 0,
+          totalTokens: 2,
+          cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
+        },
+        stopReason: 'stop',
+        timestamp: Date.now(),
+      };
+    });
+
+    const result = await complete(
+      { provider: 'chatgpt-codex', modelId: 'gpt-5.4' },
+      [
+        { role: 'user', content: 'earlier turn' },
+        { role: 'assistant', content: 'tell me more' },
+        { role: 'user', content: 'use this screenshot' },
+      ],
+      {
+        apiKey: 'token',
+        wire: 'openai-codex-responses',
+        userImages: [{ data: 'AAAA', mimeType: 'image/png' }],
+      },
+    );
+
+    expect(result.content).toBe('ok');
+  });
+
+  it('rejects oversized combined image inputs for openai-codex-responses', async () => {
+    getModelMock.mockReturnValue({
+      id: 'gpt-5.4',
+      api: 'openai-codex-responses',
+      provider: 'openai-codex',
+      input: ['text', 'image'],
+    });
+
+    await expect(
+      complete(
+        { provider: 'chatgpt-codex', modelId: 'gpt-5.4' },
+        [{ role: 'user', content: 'use these screenshots' }],
+        {
+          apiKey: 'token',
+          wire: 'openai-codex-responses',
+          userImages: [
+            { data: 'A'.repeat(2_700_000), mimeType: 'image/png' },
+            { data: 'A'.repeat(2_700_000), mimeType: 'image/png' },
+          ],
+        },
+      ),
+    ).rejects.toMatchObject({ code: 'ATTACHMENT_TOO_LARGE' });
+  });
 });
diff --git a/packages/providers/src/index.ts b/packages/providers/src/index.ts