diff --git a/.github/workflows/cua-smoke.yml b/.github/workflows/cua-smoke.yml index 9b0aeb2..f876369 100644 --- a/.github/workflows/cua-smoke.yml +++ b/.github/workflows/cua-smoke.yml @@ -18,7 +18,15 @@ jobs: env: AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} + # CUA driver uses chat-completions; 2024-08-01-preview is sufficient. AZURE_OPENAI_API_VERSION: "2024-08-01-preview" + # opencode's @ai-sdk/azure provider hits the new /openai/v1/responses + # endpoint. That endpoint accepts ONLY api-version=preview or v1 — every + # date-based value (2024-*-preview, 2025-*-preview, 2025-04-01-preview) + # returns 400 "API version not supported" and forces MODEL_CAPABLE=false, + # which makes send_message/multi_turn scenarios get skipped (#22). + # `preview` is also the @ai-sdk/azure 3.x default. + OPENCODE_AZURE_API_VERSION: "preview" # Android emulator reaches the runner host loopback via 10.0.2.2. # A real `opencode serve` runs on the host (see steps below), making this a true E2E. OPENCODE_URL: "http://10.0.2.2:4096" @@ -92,7 +100,7 @@ jobs: "options": { "resourceName": "${RESOURCE_NAME}", "apiKey": "{env:AZURE_OPENAI_API_KEY}", - "apiVersion": "${AZURE_OPENAI_API_VERSION}" + "apiVersion": "${OPENCODE_AZURE_API_VERSION}" }, "models": { "${AZURE_OPENAI_MODEL}": { "name": "Azure ${AZURE_OPENAI_MODEL}" } @@ -107,18 +115,40 @@ jobs: - name: Start opencode server on runner host run: | + set -x # Bind all interfaces so the emulator can reach it via 10.0.2.2. nohup opencode serve --hostname 0.0.0.0 --port 4096 > /tmp/opencode-server.log 2>&1 & + SRV_PID=$! + echo "opencode pid=$SRV_PID" echo "Waiting for opencode server /global/health ..." + HEALTHY=0 for i in $(seq 1 60); do - if curl -sf http://127.0.0.1:4096/global/health > /dev/null; then - echo "opencode server healthy after ${i}s"; break + if curl -sf --connect-timeout 2 -m 5 http://127.0.0.1:4096/global/health > /dev/null; then + echo "opencode server healthy after ${i}s" + HEALTHY=1 + break + fi + # Periodic state dump every 10s while waiting + if [ $((i % 10)) -eq 0 ]; then + echo "--- @${i}s: server log so far ---" + tail -20 /tmp/opencode-server.log || true + echo "--- listening ports ---" + ss -tlnp 2>/dev/null | grep -E ':4096|opencode' || echo "no listener on 4096" + echo "--- pid alive? ---" + kill -0 $SRV_PID 2>/dev/null && echo "pid $SRV_PID alive" || echo "pid $SRV_PID DEAD" fi sleep 1 done - curl -sf http://127.0.0.1:4096/global/health || { - echo "::error::opencode server failed to become healthy"; cat /tmp/opencode-server.log; exit 1; - } + if [ "$HEALTHY" != "1" ]; then + echo "::error::opencode server failed to become healthy in 60s" + echo "--- final server log ---" + cat /tmp/opencode-server.log || true + echo "--- ss listing ---" + ss -tlnp 2>/dev/null || true + echo "--- ps tree ---" + ps -ef | grep -E 'opencode|node|npm' | head -20 || true + exit 1 + fi - name: Probe opencode model capability (can it reply?) id: probe diff --git a/scripts/android-cua-smoke.py b/scripts/android-cua-smoke.py index c0bc943..d44fa84 100755 --- a/scripts/android-cua-smoke.py +++ b/scripts/android-cua-smoke.py @@ -357,14 +357,16 @@ def execute_action(action: dict) -> str: return f"swiped ({x1},{y1})->({x2},{y2})" elif act == "send": - # Auto-locate send button: rightmost clickable ViewGroup in the bottom input bar + # Auto-locate send button: rightmost clickable ViewGroup in the bottom input bar. + # Threshold is screen-relative (bottom 25%) so it works on any emulator + # resolution — API 30 default profile is 1080x1920, not the 2400-tall pixel + # we previously hardcoded against. + screen_w, screen_h = get_screen_size() + bottom_threshold = int(screen_h * 0.75) xml = ui_dump() - # Find the EditText (message input) and the clickable element immediately after it - # The send button is the last clickable ViewGroup in the input row matches = re.findall(r'clickable="true"[^>]*bounds="\[(\d+),(\d+)\]\[(\d+),(\d+)\]"', xml) if matches: - # Find the rightmost clickable element near the bottom (y > 2200) - bottom_buttons = [(int(x1), int(y1), int(x2), int(y2)) for x1, y1, x2, y2 in matches if int(y1) > 2200] + bottom_buttons = [(int(x1), int(y1), int(x2), int(y2)) for x1, y1, x2, y2 in matches if int(y1) > bottom_threshold] if bottom_buttons: # Rightmost = highest x1 send_btn = max(bottom_buttons, key=lambda b: b[0]) @@ -372,9 +374,11 @@ def execute_action(action: dict) -> str: cy = (send_btn[1] + send_btn[3]) // 2 adb("shell", "input", "tap", str(cx), str(cy)) return f"send button tapped ({cx}, {cy})" - # Fallback: tap known location - adb("shell", "input", "tap", "996", "2358") - return "send button tapped (fallback 996, 2358)" + # Fallback: tap bottom-right corner of the screen, offset slightly inward + fx = screen_w - 80 + fy = screen_h - 120 + adb("shell", "input", "tap", str(fx), str(fy)) + return f"send button tapped (fallback {fx}, {fy})" elif act == "wait": secs = float(action.get("seconds", 2)) @@ -565,16 +569,21 @@ def run_cua(goal: str, max_steps: int = 30, model: str = "gpt-4o", "goal": ( "You see the OpenCode mobile app. Tap the '+' button (top-right) to create a new session. " "Tap the text input at the bottom. Type 'ping'. Press back to dismiss keyboard. " - "Use the send action. Wait 5 seconds. " - "Report success if you see both a 'You' bubble and an 'Assistant' bubble." + "Use the send action. Wait 5 seconds, then take another screenshot. " + "If you don't yet see an assistant reply, wait another 10 seconds and re-check (assistant replies can take 15+ seconds). " + "If still no assistant bubble, wait another 15 seconds and re-check one more time. " + "Report success if you see both a 'You' bubble and an 'Assistant' bubble. " + "Report failure only after at least 30 seconds of total waiting with no assistant bubble." ), }, { "name": "multi_turn", "goal": ( "You see the OpenCode mobile app. Tap '+' (top-right) to create a new session. " - "Tap the text input. Type 'what is 2+2'. Press back. Use send action. Wait 5 seconds. " - "Then tap the text input again, type 'and 3+3?'. Press back. Use send action. Wait 5 seconds. " + "Tap the text input. Type 'what is 2+2'. Press back. Use send action. " + "Wait up to 30 seconds for an assistant reply (re-check every 10 seconds). " + "Then tap the text input again, type 'and 3+3?'. Press back. Use send action. " + "Wait up to 30 seconds for the second assistant reply (re-check every 10 seconds). " "Report success if you see two assistant reply bubbles." ), },