From 00b26d4eaf039d8ba136659217503ac904caf9c6 Mon Sep 17 00:00:00 2001
From: Dennis V <2119348+dzianisv@users.noreply.github.com>
Date: Tue, 9 Jun 2026 08:53:54 +0000
Subject: [PATCH 1/5] fix(ci): bump opencode Azure apiVersion to support
 /responses endpoint (#22)

The CUA smoke probe was returning MODEL_CAPABLE=false because opencode's
@ai-sdk/azure provider got 'API version not supported' from Azure on
/openai/v1/responses with apiVersion=2024-08-01-preview. Split the
setting into two env vars: keep the CUA driver on 2024-08-01-preview
(chat-completions only) and bump the opencode-side provider config to
2025-04-01-preview, which supports the new responses API.

Effect: send_message/multi_turn scenarios get included again in CUA smoke
when the probe succeeds.
---
 .github/workflows/cua-smoke.yml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/cua-smoke.yml b/.github/workflows/cua-smoke.yml
index 9b0aeb2..fa7b60a 100644
--- a/.github/workflows/cua-smoke.yml
+++ b/.github/workflows/cua-smoke.yml
@@ -18,7 +18,13 @@ jobs:
     env:
       AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
       AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
+      # CUA driver uses chat-completions; 2024-08-01-preview is sufficient.
       AZURE_OPENAI_API_VERSION: "2024-08-01-preview"
+      # opencode's @ai-sdk/azure provider hits the new /openai/v1/responses
+      # endpoint, which requires 2025-03-01-preview or newer. Without this,
+      # probe returns "API version not supported" -> MODEL_CAPABLE=false ->
+      # send_message/multi_turn scenarios get skipped (#22).
+      OPENCODE_AZURE_API_VERSION: "2025-04-01-preview"
       # Android emulator reaches the runner host loopback via 10.0.2.2.
       # A real `opencode serve` runs on the host (see steps below), making this a true E2E.
       OPENCODE_URL: "http://10.0.2.2:4096"
@@ -92,7 +98,7 @@ jobs:
                 "options": {
                   "resourceName": "${RESOURCE_NAME}",
                   "apiKey": "{env:AZURE_OPENAI_API_KEY}",
-                  "apiVersion": "${AZURE_OPENAI_API_VERSION}"
+                  "apiVersion": "${OPENCODE_AZURE_API_VERSION}"
                 },
                 "models": {
                   "${AZURE_OPENAI_MODEL}": { "name": "Azure ${AZURE_OPENAI_MODEL}" }

From 900dc2b6e3d1e23b1c795d2c4629aee68c4bbcfc Mon Sep 17 00:00:00 2001
From: dzianisv <dzianis.varabyou@gmail.com>
Date: Tue, 9 Jun 2026 10:27:58 +0000
Subject: [PATCH 2/5] ci(cua): bound curl timeouts + diag dump on server-start
 hang (#22)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Step 11 'Start opencode server' has hung past the 45-min job timeout in
two consecutive runs (27195348071, 27198238465). A local boot of
opencode-ai 1.16.2 with the same heredoc config is healthy in 3s, so
something is wrong specifically on the GH-hosted runner — most likely
the post-loop curl, which has no timeout, waiting indefinitely on an
unresponsive server.

Adds:
- set -x for command tracing
- --connect-timeout 2 -m 5 on every curl so no single request can hang for more than 5s
- HEALTHY flag + explicit exit 1 (drops the unbounded post-loop curl)
- Periodic dump every 10s: server log tail, ss listening sockets,
  process liveness — so we can see WHY the server isn't replying

Pure diagnostics; no behaviour change for the green path.
---
 .github/workflows/cua-smoke.yml | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/cua-smoke.yml b/.github/workflows/cua-smoke.yml
index fa7b60a..0caa2e4 100644
--- a/.github/workflows/cua-smoke.yml
+++ b/.github/workflows/cua-smoke.yml
@@ -113,18 +113,40 @@ jobs:
 
       - name: Start opencode server on runner host
         run: |
+          set -x
           # Bind all interfaces so the emulator can reach it via 10.0.2.2.
           nohup opencode serve --hostname 0.0.0.0 --port 4096 > /tmp/opencode-server.log 2>&1 &
+          SRV_PID=$!
+          echo "opencode pid=$SRV_PID"
           echo "Waiting for opencode server /global/health ..."
+          HEALTHY=0
           for i in $(seq 1 60); do
-            if curl -sf http://127.0.0.1:4096/global/health > /dev/null; then
-              echo "opencode server healthy after ${i}s"; break
+            if curl -sf --connect-timeout 2 -m 5 http://127.0.0.1:4096/global/health > /dev/null; then
+              echo "opencode server healthy after ${i}s"
+              HEALTHY=1
+              break
+            fi
+            # Periodic state dump every 10s while waiting
+            if [ $((i % 10)) -eq 0 ]; then
+              echo "--- @${i}s: server log so far ---"
+              tail -20 /tmp/opencode-server.log || true
+              echo "--- listening ports ---"
+              ss -tlnp 2>/dev/null | grep -E ':4096|opencode' || echo "no listener on 4096"
+              echo "--- pid alive? ---"
+              kill -0 $SRV_PID 2>/dev/null && echo "pid $SRV_PID alive" || echo "pid $SRV_PID DEAD"
             fi
             sleep 1
           done
-          curl -sf http://127.0.0.1:4096/global/health || {
-            echo "::error::opencode server failed to become healthy"; cat /tmp/opencode-server.log; exit 1;
-          }
+          if [ "$HEALTHY" != "1" ]; then
+            echo "::error::opencode server failed to become healthy in 60s"
+            echo "--- final server log ---"
+            cat /tmp/opencode-server.log || true
+            echo "--- ss listing ---"
+            ss -tlnp 2>/dev/null || true
+            echo "--- ps tree ---"
+            ps -ef | grep -E 'opencode|node|npm' | head -20 || true
+            exit 1
+          fi
 
       - name: Probe opencode model capability (can it reply?)
         id: probe

From 094c076fc27c8ed7991b86ee5b5a9ac2cf7ae608 Mon Sep 17 00:00:00 2001
From: Dennis V <2119348+dzianisv@users.noreply.github.com>
Date: Tue, 9 Jun 2026 11:00:13 +0000
Subject: [PATCH 3/5] fix(ci): use api-version=preview for /openai/v1/responses
 (#22)

Reproduced the probe failure locally against the same Azure resource:
all date-based api-versions (2024-08-01-preview, 2024-12-01-preview,
2025-01-01-preview, 2025-03-01-preview, 2025-04-01-preview) return:

    {"error":{"code":"BadRequest","message":"API version not supported"}}

Only api-version=preview and api-version=v1 succeed (200). This is the
new Azure OpenAI v1 responses-API style; date strings are reserved for
the legacy /openai/deployments/{model}/chat/completions endpoint.

@ai-sdk/azure 3.x already defaults apiVersion to "preview" (per the
type definition: "Custom api version to use. Defaults to `preview`."),
so this aligns the workflow with the SDK default. Probe should now
return MODEL_CAPABLE=true and the send_message scenario will run.
---
 .github/workflows/cua-smoke.yml | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/cua-smoke.yml b/.github/workflows/cua-smoke.yml
index 0caa2e4..f876369 100644
--- a/.github/workflows/cua-smoke.yml
+++ b/.github/workflows/cua-smoke.yml
@@ -21,10 +21,12 @@ jobs:
       # CUA driver uses chat-completions; 2024-08-01-preview is sufficient.
       AZURE_OPENAI_API_VERSION: "2024-08-01-preview"
       # opencode's @ai-sdk/azure provider hits the new /openai/v1/responses
-      # endpoint, which requires 2025-03-01-preview or newer. Without this,
-      # probe returns "API version not supported" -> MODEL_CAPABLE=false ->
-      # send_message/multi_turn scenarios get skipped (#22).
-      OPENCODE_AZURE_API_VERSION: "2025-04-01-preview"
+      # endpoint. That endpoint accepts ONLY api-version=preview or v1 — every
+      # date-based value tested (2024-08-01-preview through 2025-04-01-preview)
+      # returns 400 "API version not supported" and forces MODEL_CAPABLE=false,
+      # which makes send_message/multi_turn scenarios get skipped (#22).
+      # `preview` is also the @ai-sdk/azure 3.x default.
+      OPENCODE_AZURE_API_VERSION: "preview"
       # Android emulator reaches the runner host loopback via 10.0.2.2.
       # A real `opencode serve` runs on the host (see steps below), making this a true E2E.
       OPENCODE_URL: "http://10.0.2.2:4096"

From d73a6d4e88ac57856ed62033eeeddd2665fe81bb Mon Sep 17 00:00:00 2001
From: Dennis V <2119348+dzianisv@users.noreply.github.com>
Date: Tue, 9 Jun 2026 11:28:15 +0000
Subject: [PATCH 4/5] test(cua): extend send_message and multi_turn waits to
 30s

Assistant bubbles can take 15+ seconds to appear after send. The previous
5-second wait was too short and caused false failures even when the API
calls succeeded. Re-check screenshots periodically for up to 30s in total.
---
 scripts/android-cua-smoke.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/scripts/android-cua-smoke.py b/scripts/android-cua-smoke.py
index c0bc943..037a2d0 100755
--- a/scripts/android-cua-smoke.py
+++ b/scripts/android-cua-smoke.py
@@ -565,16 +565,21 @@ def run_cua(goal: str, max_steps: int = 30, model: str = "gpt-4o",
         "goal": (
             "You see the OpenCode mobile app. Tap the '+' button (top-right) to create a new session. "
             "Tap the text input at the bottom. Type 'ping'. Press back to dismiss keyboard. "
-            "Use the send action. Wait 5 seconds. "
-            "Report success if you see both a 'You' bubble and an 'Assistant' bubble."
+            "Use the send action. Wait 5 seconds, then take another screenshot. "
+            "If you don't yet see an assistant reply, wait another 10 seconds and re-check (assistant replies can take 15+ seconds). "
+            "If still no assistant bubble, wait another 15 seconds and re-check one more time. "
+            "Report success if you see both a 'You' bubble and an 'Assistant' bubble. "
+            "Report failure only after at least 30 seconds of total waiting with no assistant bubble."
         ),
     },
     {
         "name": "multi_turn",
         "goal": (
             "You see the OpenCode mobile app. Tap '+' (top-right) to create a new session. "
-            "Tap the text input. Type 'what is 2+2'. Press back. Use send action. Wait 5 seconds. "
-            "Then tap the text input again, type 'and 3+3?'. Press back. Use send action. Wait 5 seconds. "
+            "Tap the text input. Type 'what is 2+2'. Press back. Use send action. "
+            "Wait up to 30 seconds for an assistant reply (re-check every 10 seconds). "
+            "Then tap the text input again, type 'and 3+3?'. Press back. Use send action. "
+            "Wait up to 30 seconds for the second assistant reply (re-check every 10 seconds). "
             "Report success if you see two assistant reply bubbles."
         ),
     },

From 308f2f0535fa678d27990a6edff8dac39d40955e Mon Sep 17 00:00:00 2001
From: Dennis V <2119348+dzianisv@users.noreply.github.com>
Date: Tue, 9 Jun 2026 17:19:13 +0000
Subject: [PATCH 5/5] fix(cua): screen-relative send button threshold for #22
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The send action's auto-locate filtered for y1 > 2200 and fell back to
hardcoded (996, 2358) — both assume a 1080x2400 panel. The CI emulator
(API 30 google_apis pixel profile) is 1080x1920, so:
  - the bottom_buttons filter never matched any clickable element
  - the fallback tap landed off-screen
  → 'ping' message never sent, scenario timed out with no bubbles.

Switch to a screen-relative threshold (bottom 25%) and a fallback that
uses get_screen_size() to land in the bottom-right corner regardless of
device resolution. This was masked until now because send_message was
gated by MODEL_CAPABLE=false in earlier CI runs.

Refs: #22
---
 scripts/android-cua-smoke.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/scripts/android-cua-smoke.py b/scripts/android-cua-smoke.py
index 037a2d0..d44fa84 100755
--- a/scripts/android-cua-smoke.py
+++ b/scripts/android-cua-smoke.py
@@ -357,14 +357,16 @@ def execute_action(action: dict) -> str:
         return f"swiped ({x1},{y1})->({x2},{y2})"
 
     elif act == "send":
-        # Auto-locate send button: rightmost clickable ViewGroup in the bottom input bar
+        # Auto-locate send button: rightmost clickable ViewGroup in the bottom input bar.
+        # Threshold is screen-relative (bottom 25%) so it works on any emulator
+        # resolution — the API 30 default profile is 1080x1920, not the 1080x2400
+        # Pixel panel that the old hardcoded values assumed.
+        screen_w, screen_h = get_screen_size()
+        bottom_threshold = int(screen_h * 0.75)
         xml = ui_dump()
-        # Find the EditText (message input) and the clickable element immediately after it
-        # The send button is the last clickable ViewGroup in the input row
         matches = re.findall(r'clickable="true"[^>]*bounds="\[(\d+),(\d+)\]\[(\d+),(\d+)\]"', xml)
         if matches:
-            # Find the rightmost clickable element near the bottom (y > 2200)
-            bottom_buttons = [(int(x1), int(y1), int(x2), int(y2)) for x1, y1, x2, y2 in matches if int(y1) > 2200]
+            bottom_buttons = [(int(x1), int(y1), int(x2), int(y2)) for x1, y1, x2, y2 in matches if int(y1) > bottom_threshold]
             if bottom_buttons:
                 # Rightmost = highest x1
                 send_btn = max(bottom_buttons, key=lambda b: b[0])
@@ -372,9 +374,11 @@ def execute_action(action: dict) -> str:
                 cy = (send_btn[1] + send_btn[3]) // 2
                 adb("shell", "input", "tap", str(cx), str(cy))
                 return f"send button tapped ({cx}, {cy})"
-        # Fallback: tap known location
-        adb("shell", "input", "tap", "996", "2358")
-        return "send button tapped (fallback 996, 2358)"
+        # Fallback: tap bottom-right corner of the screen, offset slightly inward
+        fx = screen_w - 80
+        fy = screen_h - 120
+        adb("shell", "input", "tap", str(fx), str(fy))
+        return f"send button tapped (fallback {fx}, {fy})"
 
     elif act == "wait":
         secs = float(action.get("seconds", 2))