From 00b26d4eaf039d8ba136659217503ac904caf9c6 Mon Sep 17 00:00:00 2001 From: Dennis V <2119348+dzianisv@users.noreply.github.com> Date: Tue, 9 Jun 2026 08:53:54 +0000 Subject: [PATCH 1/5] fix(ci): bump opencode Azure apiVersion to support /responses endpoint (#22) The CUA smoke probe was returning MODEL_CAPABLE=false because opencode's @ai-sdk/azure provider got 'API version not supported' from Azure on /openai/v1/responses with apiVersion=2024-08-01-preview. Split into two envs: keep the CUA driver on 2024-08-01-preview (chat-completions only) and bump the opencode-side provider config to 2025-04-01-preview, which supports the new responses API. Effect: send_message/multi_turn scenarios get included again in CUA smoke when the probe succeeds. --- .github/workflows/cua-smoke.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cua-smoke.yml b/.github/workflows/cua-smoke.yml index 9b0aeb2..fa7b60a 100644 --- a/.github/workflows/cua-smoke.yml +++ b/.github/workflows/cua-smoke.yml @@ -18,7 +18,13 @@ jobs: env: AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} + # CUA driver uses chat-completions; 2024-08-01-preview is sufficient. AZURE_OPENAI_API_VERSION: "2024-08-01-preview" + # opencode's @ai-sdk/azure provider hits the new /openai/v1/responses + # endpoint, which requires 2025-03-01-preview or newer. Without this, + # probe returns "API version not supported" -> MODEL_CAPABLE=false -> + # send_message/multi_turn scenarios get skipped (#22). + OPENCODE_AZURE_API_VERSION: "2025-04-01-preview" # Android emulator reaches the runner host loopback via 10.0.2.2. # A real `opencode serve` runs on the host (see steps below), making this a true E2E. OPENCODE_URL: "http://10.0.2.2:4096" @@ -92,7 +98,7 @@ jobs: "options": { "resourceName": "${RESOURCE_NAME}", "apiKey": "{env:AZURE_OPENAI_API_KEY}", - "apiVersion": "${AZURE_OPENAI_API_VERSION}" + "apiVersion": "${OPENCODE_AZURE_API_VERSION}" }, "models": { "${AZURE_OPENAI_MODEL}": { "name": "Azure ${AZURE_OPENAI_MODEL}" } From 900dc2b6e3d1e23b1c795d2c4629aee68c4bbcfc Mon Sep 17 00:00:00 2001 From: dzianisv Date: Tue, 9 Jun 2026 10:27:58 +0000 Subject: [PATCH 2/5] ci(cua): bound curl timeouts + diag dump on server-start hang (#22) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Step 11 'Start opencode server' has hung past the 45-min job timeout in two consecutive runs (27195348071, 27198238465). Local boot of opencode-ai 1.16.2 with the same heredoc config is healthy in 3s, so something is wrong specifically on the GH-hosted runner — likely curl post-loop waiting indefinitely on an unresponsive server. Adds: - set -x for command tracing - --connect-timeout 2 -m 5 on every curl so hangs cannot exceed 5s - HEALTHY flag + explicit exit 1 (drops the unbounded post-loop curl) - Periodic dump every 10s: server log tail, ss listening sockets, process liveness — so we can see WHY the server isn't replying Pure diagnostics; no behaviour change for the green path. --- .github/workflows/cua-smoke.yml | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cua-smoke.yml b/.github/workflows/cua-smoke.yml index fa7b60a..0caa2e4 100644 --- a/.github/workflows/cua-smoke.yml +++ b/.github/workflows/cua-smoke.yml @@ -113,18 +113,40 @@ jobs: - name: Start opencode server on runner host run: | + set -x # Bind all interfaces so the emulator can reach it via 10.0.2.2. nohup opencode serve --hostname 0.0.0.0 --port 4096 > /tmp/opencode-server.log 2>&1 & + SRV_PID=$! + echo "opencode pid=$SRV_PID" echo "Waiting for opencode server /global/health ..." + HEALTHY=0 for i in $(seq 1 60); do - if curl -sf http://127.0.0.1:4096/global/health > /dev/null; then - echo "opencode server healthy after ${i}s"; break + if curl -sf --connect-timeout 2 -m 5 http://127.0.0.1:4096/global/health > /dev/null; then + echo "opencode server healthy after ${i}s" + HEALTHY=1 + break + fi + # Periodic state dump every 10s while waiting + if [ $((i % 10)) -eq 0 ]; then + echo "--- @${i}s: server log so far ---" + tail -20 /tmp/opencode-server.log || true + echo "--- listening ports ---" + ss -tlnp 2>/dev/null | grep -E ':4096|opencode' || echo "no listener on 4096" + echo "--- pid alive? ---" + kill -0 $SRV_PID 2>/dev/null && echo "pid $SRV_PID alive" || echo "pid $SRV_PID DEAD" fi sleep 1 done - curl -sf http://127.0.0.1:4096/global/health || { - echo "::error::opencode server failed to become healthy"; cat /tmp/opencode-server.log; exit 1; - } + if [ "$HEALTHY" != "1" ]; then + echo "::error::opencode server failed to become healthy in 60s" + echo "--- final server log ---" + cat /tmp/opencode-server.log || true + echo "--- ss listing ---" + ss -tlnp 2>/dev/null || true + echo "--- ps tree ---" + ps -ef | grep -E 'opencode|node|npm' | head -20 || true + exit 1 + fi - name: Probe opencode model capability (can it reply?) id: probe From 094c076fc27c8ed7991b86ee5b5a9ac2cf7ae608 Mon Sep 17 00:00:00 2001 From: Dennis V <2119348+dzianisv@users.noreply.github.com> Date: Tue, 9 Jun 2026 11:00:13 +0000 Subject: [PATCH 3/5] fix(ci): use api-version=preview for /openai/v1/responses (#22) Reproduced the probe failure locally against the same Azure resource: all date-based api-versions (2024-08-01-preview, 2024-12-01-preview, 2025-01-01-preview, 2025-03-01-preview, 2025-04-01-preview) return: {"error":{"code":"BadRequest","message":"API version not supported"}} Only api-version=preview and api-version=v1 succeed (200). This is the new Azure OpenAI v1 responses-API style; date strings are reserved for the legacy /openai/deployments/{model}/chat/completions endpoint. @ai-sdk/azure 3.x already defaults apiVersion to "preview" (per the type definition: "Custom api version to use. Defaults to `preview`."), so this aligns the workflow with the SDK default. Probe should now return MODEL_CAPABLE=true and the send_message scenario will run. --- .github/workflows/cua-smoke.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cua-smoke.yml b/.github/workflows/cua-smoke.yml index 0caa2e4..f876369 100644 --- a/.github/workflows/cua-smoke.yml +++ b/.github/workflows/cua-smoke.yml @@ -21,10 +21,12 @@ jobs: # CUA driver uses chat-completions; 2024-08-01-preview is sufficient. AZURE_OPENAI_API_VERSION: "2024-08-01-preview" # opencode's @ai-sdk/azure provider hits the new /openai/v1/responses - # endpoint, which requires 2025-03-01-preview or newer. Without this, - # probe returns "API version not supported" -> MODEL_CAPABLE=false -> - # send_message/multi_turn scenarios get skipped (#22). - OPENCODE_AZURE_API_VERSION: "2025-04-01-preview" + # endpoint. That endpoint accepts ONLY api-version=preview or v1 — every + # date-based value (2024-*-preview, 2025-*-preview, 2025-04-01-preview) + # returns 400 "API version not supported" and forces MODEL_CAPABLE=false, + # which makes send_message/multi_turn scenarios get skipped (#22). + # `preview` is also the @ai-sdk/azure 3.x default. + OPENCODE_AZURE_API_VERSION: "preview" # Android emulator reaches the runner host loopback via 10.0.2.2. # A real `opencode serve` runs on the host (see steps below), making this a true E2E. OPENCODE_URL: "http://10.0.2.2:4096" From d73a6d4e88ac57856ed62033eeeddd2665fe81bb Mon Sep 17 00:00:00 2001 From: Dennis V <2119348+dzianisv@users.noreply.github.com> Date: Tue, 9 Jun 2026 11:28:15 +0000 Subject: [PATCH 4/5] test(cua): extend send_message and multi_turn waits to 30s Assistant bubbles can take 15+ seconds to appear after send. Previous 5-second wait was too short and caused false failures even when API calls succeeded. Re-check screenshots periodically up to 30s total. --- scripts/android-cua-smoke.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/scripts/android-cua-smoke.py b/scripts/android-cua-smoke.py index c0bc943..037a2d0 100755 --- a/scripts/android-cua-smoke.py +++ b/scripts/android-cua-smoke.py @@ -565,16 +565,21 @@ def run_cua(goal: str, max_steps: int = 30, model: str = "gpt-4o", "goal": ( "You see the OpenCode mobile app. Tap the '+' button (top-right) to create a new session. " "Tap the text input at the bottom. Type 'ping'. Press back to dismiss keyboard. " - "Use the send action. Wait 5 seconds. " - "Report success if you see both a 'You' bubble and an 'Assistant' bubble." + "Use the send action. Wait 5 seconds, then take another screenshot. " + "If you don't yet see an assistant reply, wait another 10 seconds and re-check (assistant replies can take 15+ seconds). " + "If still no assistant bubble, wait another 15 seconds and re-check one more time. " + "Report success if you see both a 'You' bubble and an 'Assistant' bubble. " + "Report failure only after at least 30 seconds of total waiting with no assistant bubble." ), }, { "name": "multi_turn", "goal": ( "You see the OpenCode mobile app. Tap '+' (top-right) to create a new session. " - "Tap the text input. Type 'what is 2+2'. Press back. Use send action. Wait 5 seconds. " - "Then tap the text input again, type 'and 3+3?'. Press back. Use send action. Wait 5 seconds. " + "Tap the text input. Type 'what is 2+2'. Press back. Use send action. " + "Wait up to 30 seconds for an assistant reply (re-check every 10 seconds). " + "Then tap the text input again, type 'and 3+3?'. Press back. Use send action. " + "Wait up to 30 seconds for the second assistant reply (re-check every 10 seconds). " "Report success if you see two assistant reply bubbles." ), }, From 308f2f0535fa678d27990a6edff8dac39d40955e Mon Sep 17 00:00:00 2001 From: Dennis V <2119348+dzianisv@users.noreply.github.com> Date: Tue, 9 Jun 2026 17:19:13 +0000 Subject: [PATCH 5/5] fix(cua): screen-relative send button threshold for #22 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The send action's auto-locate filtered for y1 > 2200 and fell back to hardcoded (996, 2358) — both assume a 1080x2400 panel. The CI emulator (API 30 google_apis pixel profile) is 1080x1920, so: - the bottom_buttons filter never matched any clickable element - the fallback tap landed off-screen → 'ping' message never sent, scenario timed out with no bubbles. Switch to a screen-relative threshold (bottom 25%) and a fallback that uses get_screen_size() to land in the bottom-right corner regardless of device resolution. This was masked until now because send_message was gated by MODEL_CAPABLE=false in earlier CI runs. Refs: #22 --- scripts/android-cua-smoke.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/scripts/android-cua-smoke.py b/scripts/android-cua-smoke.py index 037a2d0..d44fa84 100755 --- a/scripts/android-cua-smoke.py +++ b/scripts/android-cua-smoke.py @@ -357,14 +357,16 @@ def execute_action(action: dict) -> str: return f"swiped ({x1},{y1})->({x2},{y2})" elif act == "send": - # Auto-locate send button: rightmost clickable ViewGroup in the bottom input bar + # Auto-locate send button: rightmost clickable ViewGroup in the bottom input bar. + # Threshold is screen-relative (bottom 25%) so it works on any emulator + # resolution — API 30 default profile is 1080x1920, not the 2400-tall pixel + # we previously hardcoded against. + screen_w, screen_h = get_screen_size() + bottom_threshold = int(screen_h * 0.75) xml = ui_dump() - # Find the EditText (message input) and the clickable element immediately after it - # The send button is the last clickable ViewGroup in the input row matches = re.findall(r'clickable="true"[^>]*bounds="\[(\d+),(\d+)\]\[(\d+),(\d+)\]"', xml) if matches: - # Find the rightmost clickable element near the bottom (y > 2200) - bottom_buttons = [(int(x1), int(y1), int(x2), int(y2)) for x1, y1, x2, y2 in matches if int(y1) > 2200] + bottom_buttons = [(int(x1), int(y1), int(x2), int(y2)) for x1, y1, x2, y2 in matches if int(y1) > bottom_threshold] if bottom_buttons: # Rightmost = highest x1 send_btn = max(bottom_buttons, key=lambda b: b[0]) @@ -372,9 +374,11 @@ def execute_action(action: dict) -> str: cy = (send_btn[1] + send_btn[3]) // 2 adb("shell", "input", "tap", str(cx), str(cy)) return f"send button tapped ({cx}, {cy})" - # Fallback: tap known location - adb("shell", "input", "tap", "996", "2358") - return "send button tapped (fallback 996, 2358)" + # Fallback: tap bottom-right corner of the screen, offset slightly inward + fx = screen_w - 80 + fy = screen_h - 120 + adb("shell", "input", "tap", str(fx), str(fy)) + return f"send button tapped (fallback {fx}, {fy})" elif act == "wait": secs = float(action.get("seconds", 2))