Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 36 additions & 6 deletions .github/workflows/cua-smoke.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,15 @@ jobs:
env:
AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
# CUA driver uses chat-completions; 2024-08-01-preview is sufficient.
AZURE_OPENAI_API_VERSION: "2024-08-01-preview"
# opencode's @ai-sdk/azure provider calls the new /openai/v1/responses
# endpoint. That endpoint accepts ONLY api-version=preview or v1; every
# date-based value (any 2024-*-preview or 2025-*-preview, including
# 2025-04-01-preview) returns 400 "API version not supported". That forces
# MODEL_CAPABLE=false, which causes the send_message/multi_turn scenarios
# to be skipped (#22).
# `preview` is also the @ai-sdk/azure 3.x default.
OPENCODE_AZURE_API_VERSION: "preview"
# Android emulator reaches the runner host loopback via 10.0.2.2.
# A real `opencode serve` runs on the host (see steps below), making this a true E2E.
OPENCODE_URL: "http://10.0.2.2:4096"
Expand Down Expand Up @@ -92,7 +100,7 @@ jobs:
"options": {
"resourceName": "${RESOURCE_NAME}",
"apiKey": "{env:AZURE_OPENAI_API_KEY}",
"apiVersion": "${AZURE_OPENAI_API_VERSION}"
"apiVersion": "${OPENCODE_AZURE_API_VERSION}"
},
"models": {
"${AZURE_OPENAI_MODEL}": { "name": "Azure ${AZURE_OPENAI_MODEL}" }
Expand All @@ -107,18 +115,40 @@ jobs:

- name: Start opencode server on runner host
run: |
set -x
# Bind all interfaces so the emulator can reach it via 10.0.2.2.
nohup opencode serve --hostname 0.0.0.0 --port 4096 > /tmp/opencode-server.log 2>&1 &
SRV_PID=$!
echo "opencode pid=$SRV_PID"
echo "Waiting for opencode server /global/health ..."
HEALTHY=0
for i in $(seq 1 60); do
if curl -sf http://127.0.0.1:4096/global/health > /dev/null; then
echo "opencode server healthy after ${i}s"; break
if curl -sf --connect-timeout 2 -m 5 http://127.0.0.1:4096/global/health > /dev/null; then
echo "opencode server healthy after ${i}s"
HEALTHY=1
break
fi
# Periodic state dump every 10s while waiting
if [ $((i % 10)) -eq 0 ]; then
echo "--- @${i}s: server log so far ---"
tail -20 /tmp/opencode-server.log || true
echo "--- listening ports ---"
ss -tlnp 2>/dev/null | grep -E ':4096|opencode' || echo "no listener on 4096"
echo "--- pid alive? ---"
kill -0 $SRV_PID 2>/dev/null && echo "pid $SRV_PID alive" || echo "pid $SRV_PID DEAD"
fi
sleep 1
done
curl -sf http://127.0.0.1:4096/global/health || {
echo "::error::opencode server failed to become healthy"; cat /tmp/opencode-server.log; exit 1;
}
if [ "$HEALTHY" != "1" ]; then
echo "::error::opencode server failed to become healthy in 60s"
echo "--- final server log ---"
cat /tmp/opencode-server.log || true
echo "--- ss listing ---"
ss -tlnp 2>/dev/null || true
echo "--- ps tree ---"
ps -ef | grep -E 'opencode|node|npm' | head -20 || true
exit 1
fi

- name: Probe opencode model capability (can it reply?)
id: probe
Expand Down
33 changes: 21 additions & 12 deletions scripts/android-cua-smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,24 +357,28 @@ def execute_action(action: dict) -> str:
return f"swiped ({x1},{y1})->({x2},{y2})"

elif act == "send":
# Auto-locate send button: rightmost clickable ViewGroup in the bottom input bar
# Auto-locate send button: rightmost clickable ViewGroup in the bottom input bar.
# The threshold is screen-relative (bottom 25% of the screen) so it works on
# any emulator resolution: the API 30 default profile is 1080x1920, not the
# 2400-pixel-tall screen that the old hardcoded y > 2200 check assumed.
screen_w, screen_h = get_screen_size()
bottom_threshold = int(screen_h * 0.75)
xml = ui_dump()
# Collect the bounds of every clickable node in the UI dump.
# The send button is assumed to be the rightmost clickable element in the bottom input row.
matches = re.findall(r'clickable="true"[^>]*bounds="\[(\d+),(\d+)\]\[(\d+),(\d+)\]"', xml)
if matches:
# Find the rightmost clickable element near the bottom (y > 2200)
bottom_buttons = [(int(x1), int(y1), int(x2), int(y2)) for x1, y1, x2, y2 in matches if int(y1) > 2200]
bottom_buttons = [(int(x1), int(y1), int(x2), int(y2)) for x1, y1, x2, y2 in matches if int(y1) > bottom_threshold]
if bottom_buttons:
# Rightmost = highest x1
send_btn = max(bottom_buttons, key=lambda b: b[0])
cx = (send_btn[0] + send_btn[2]) // 2
cy = (send_btn[1] + send_btn[3]) // 2
adb("shell", "input", "tap", str(cx), str(cy))
return f"send button tapped ({cx}, {cy})"
# Fallback: tap known location
adb("shell", "input", "tap", "996", "2358")
return "send button tapped (fallback 996, 2358)"
# Fallback: tap bottom-right corner of the screen, offset slightly inward
fx = screen_w - 80
fy = screen_h - 120
adb("shell", "input", "tap", str(fx), str(fy))
return f"send button tapped (fallback {fx}, {fy})"

elif act == "wait":
secs = float(action.get("seconds", 2))
Expand Down Expand Up @@ -565,16 +569,21 @@ def run_cua(goal: str, max_steps: int = 30, model: str = "gpt-4o",
"goal": (
"You see the OpenCode mobile app. Tap the '+' button (top-right) to create a new session. "
"Tap the text input at the bottom. Type 'ping'. Press back to dismiss keyboard. "
"Use the send action. Wait 5 seconds. "
"Report success if you see both a 'You' bubble and an 'Assistant' bubble."
"Use the send action. Wait 5 seconds, then take another screenshot. "
"If you don't yet see an assistant reply, wait another 10 seconds and re-check (assistant replies can take 15+ seconds). "
"If still no assistant bubble, wait another 15 seconds and re-check one more time. "
"Report success if you see both a 'You' bubble and an 'Assistant' bubble. "
"Report failure only after at least 30 seconds of total waiting with no assistant bubble."
),
},
{
"name": "multi_turn",
"goal": (
"You see the OpenCode mobile app. Tap '+' (top-right) to create a new session. "
"Tap the text input. Type 'what is 2+2'. Press back. Use send action. Wait 5 seconds. "
"Then tap the text input again, type 'and 3+3?'. Press back. Use send action. Wait 5 seconds. "
"Tap the text input. Type 'what is 2+2'. Press back. Use send action. "
"Wait up to 30 seconds for an assistant reply (re-check every 10 seconds). "
"Then tap the text input again, type 'and 3+3?'. Press back. Use send action. "
"Wait up to 30 seconds for the second assistant reply (re-check every 10 seconds). "
"Report success if you see two assistant reply bubbles."
),
},
Expand Down