diff --git a/.github/workflows/canary.yml b/.github/workflows/canary.yml new file mode 100644 index 0000000..fc176b3 --- /dev/null +++ b/.github/workflows/canary.yml @@ -0,0 +1,48 @@ +name: Canary — plugin vs Claude Code latest + +on: + schedule: + # 05:13 & 17:13 America/Los_Angeles (PDT/UTC-7 basis; cron is UTC) — + # leads the fleet's hour-staggered canary waves so upstream breakage + # surfaces here first. + - cron: "13 12 * * *" + - cron: "13 0 * * *" + workflow_dispatch: + +jobs: + contract: + runs-on: ubuntu-latest + timeout-minutes: 15 + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - uses: actions/setup-node@v4 + with: + node-version: 22 + + - name: Install bridge + latest SDK + run: | + pip install -e . pytest + pip install -U claude-agent-sdk + + - name: Install latest Claude Code CLI + run: npm install -g @anthropic-ai/claude-code + + - name: Contract tests vs latest host + run: pytest -q tests/contract + + # Unattended runs page the team; manual dispatches don't. 
+ - name: Notify Google Chat on failure + if: failure() && github.event_name == 'schedule' + env: + WEBHOOK_URL: ${{ secrets.GOOGLE_CHAT_WEBHOOK_URL }} + run: | + [ -n "$WEBHOOK_URL" ] || exit 0 + curl -sS -X POST -H 'Content-Type: application/json' \ + -d "{\"text\": \"🚨 claude-code-plugin canary FAILED against the latest Claude Code host: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}\"}" \ + "$WEBHOOK_URL" || true diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index e12f42a..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: CI - -on: - push: - branches: [main] - pull_request: - -jobs: - test: - runs-on: ubuntu-latest - timeout-minutes: 5 - - steps: - - uses: actions/checkout@v4 - - - uses: actions/setup-python@v5 - with: - python-version: "3.12" - - # inkbox is mocked in the tests, so install only what they import. - - name: Install test deps - run: pip install pytest aiohttp segno claude-agent-sdk - - - name: Test - run: pytest -q diff --git a/.github/workflows/live-channels.yml b/.github/workflows/live-channels.yml new file mode 100644 index 0000000..72d2fc9 --- /dev/null +++ b/.github/workflows/live-channels.yml @@ -0,0 +1,165 @@ +name: Live channels e2e + +# Boots a REAL bridge (tunnel + webhooks + Claude Code sessions) with this +# checkout, then drives it from a remote Inkbox identity over email + SMS. +# Two legs, serialized (they share the AUT identity): +# * mock — the sessions "think" against a local deterministic model server +# (tests/live/mock_anthropic.py): free, proves the whole pipe. +# * real — a real Claude model: proves reasoning + tool use end to end. 
+ +on: + pull_request: + types: [opened, synchronize, reopened, ready_for_review] + workflow_dispatch: + inputs: + timeout_s: + description: "Per-question reply timeout (seconds)" + required: false + default: "150" + workflow_run: + workflows: ["Canary — plugin vs Claude Code latest"] + types: [completed] + +# Only one holder of the AUT identity's Inkbox tunnel at a time — the voice +# suite shares this group, so live runs queue instead of fighting the tunnel. +concurrency: + group: inkbox-live-aut-tunnel + cancel-in-progress: false + +jobs: + live: + # Draft PRs and fork PRs (no secrets) skip; chained runs only follow a + # PASSING canary. + if: > + (github.event_name == 'pull_request' && + github.event.pull_request.draft == false && + github.event.pull_request.head.repo.full_name == github.repository) || + github.event_name == 'workflow_dispatch' || + (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') + runs-on: ubuntu-latest + timeout-minutes: 25 + strategy: + max-parallel: 1 + fail-fast: false + matrix: + leg: [mock, real] + env: + INKBOX_API_KEY: ${{ secrets.CLAUDE_CODE_INKBOX_API_KEY }} + INKBOX_SIGNING_KEY: ${{ secrets.CLAUDE_CODE_INKBOX_SIGNING_KEY }} + CLAUDE_PROJECT_DIR: ${{ github.workspace }} + # A stray permission escalation should fail a test fast, not park the + # session for the default 10 minutes. + INKBOX_PERMISSION_TIMEOUT_S: "30" + DISABLE_AUTOUPDATER: "1" + CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: "1" + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - uses: actions/setup-node@v4 + with: + node-version: 22 + + - name: Install bridge + run: pip install -e . 
pytest + + - name: Install Claude Code CLI + run: npm install -g @anthropic-ai/claude-code + + - name: Derive AUT identity handle + run: | + HANDLE=$(python3 - <<'PY' + import os + from inkbox import Inkbox + c = Inkbox(api_key=os.environ["INKBOX_API_KEY"]) + print(c.mailboxes.list()[0].email_address.split("@", 1)[0]) + PY + ) + echo "INKBOX_IDENTITY=$HANDLE" >> "$GITHUB_ENV" + echo "AUT handle: $HANDLE" + + - name: Start mock model server + if: matrix.leg == 'mock' + run: | + nohup python3 tests/live/mock_anthropic.py 8089 > /tmp/mock_anthropic.log 2>&1 & + for i in $(seq 1 10); do + curl -fsS http://127.0.0.1:8089/ > /dev/null 2>&1 && exit 0 + sleep 1 + done + echo "mock model server never came up"; exit 1 + + - name: Point sessions at the mock model + if: matrix.leg == 'mock' + run: | + echo "ANTHROPIC_BASE_URL=http://127.0.0.1:8089" >> "$GITHUB_ENV" + echo "ANTHROPIC_API_KEY=sk-mock-not-used" >> "$GITHUB_ENV" + + - name: Point sessions at the real model + if: matrix.leg == 'real' + run: echo "ANTHROPIC_API_KEY=${{ secrets.ANTHROPIC_API_KEY }}" >> "$GITHUB_ENV" + + - name: Start bridge gateway + run: | + nohup inkbox-claude run > /tmp/gateway.log 2>&1 & + echo $! > /tmp/gateway.pid + for i in $(seq 1 36); do + if grep -q "\[bridge\] ready" /tmp/gateway.log; then + echo "gateway ready"; exit 0 + fi + if ! 
kill -0 "$(cat /tmp/gateway.pid)" 2>/dev/null; then + echo "gateway process died during startup"; tail -n 150 /tmp/gateway.log; exit 1 + fi + sleep 5 + done + echo "gateway never became ready"; tail -n 150 /tmp/gateway.log; exit 1 + + - name: Live channel tests (${{ matrix.leg }} model) + env: + REMOTE_INKBOX_API_KEY: ${{ secrets.REMOTE_INKBOX_API_KEY }} + CLAUDE_CODE_INKBOX_API_KEY: ${{ secrets.CLAUDE_CODE_INKBOX_API_KEY }} + LIVE_EMAIL_TIMEOUT: ${{ github.event.inputs.timeout_s || '150' }} + LIVE_REAL_MODEL: ${{ matrix.leg == 'real' && '1' || '' }} + LIVE_CONTACT_CRUD: ${{ matrix.leg == 'real' && '1' || '' }} + run: python3 -m pytest tests/live -v + + # Logs can carry live message content โ€” surface them only when needed. + - name: Dump logs on failure + if: failure() + run: | + echo "=== gateway.log ==="; tail -n 300 /tmp/gateway.log 2>/dev/null || true + echo "=== mock_anthropic.log ==="; tail -n 100 /tmp/mock_anthropic.log 2>/dev/null || true + + - name: Upload logs on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: live-channels-${{ matrix.leg }}-logs + path: | + /tmp/gateway.log + /tmp/mock_anthropic.log + if-no-files-found: ignore + + - name: Stop bridge gateway + if: always() + run: | + [ -f /tmp/gateway.pid ] && kill "$(cat /tmp/gateway.pid)" 2>/dev/null || true + + # Page the team only when an UNATTENDED run breaks (the canary chain); + # PR authors and manual dispatchers are already watching. 
+ notify: + needs: live + if: always() && needs.live.result == 'failure' && github.event_name == 'workflow_run' + runs-on: ubuntu-latest + steps: + - name: Notify Google Chat + env: + WEBHOOK_URL: ${{ secrets.GOOGLE_CHAT_WEBHOOK_URL }} + run: | + [ -n "$WEBHOOK_URL" ] || exit 0 + curl -sS -X POST -H 'Content-Type: application/json' \ + -d "{\"text\": \"🚨 claude-code-plugin live channels e2e FAILED (chained off the canary): ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}\"}" \ + "$WEBHOOK_URL" || true diff --git a/.github/workflows/live-voice.yml b/.github/workflows/live-voice.yml new file mode 100644 index 0000000..8b60708 --- /dev/null +++ b/.github/workflows/live-voice.yml @@ -0,0 +1,147 @@ +name: Live voice e2e + +# Real phone calls against a real bridge, one scenario per job: +# * inbound_inkbox — driver calls the agent; Inkbox STT/TTS answers. +# * outbound_realtime — driver texts "call me"; the agent dials back powered +# by the realtime voice API. +# A driver process (tests/live/voice_driver.py) is the peer on the other end +# of the call, bridged over the driver identity's own Inkbox tunnel. + +on: + pull_request: + types: [opened, synchronize, reopened, ready_for_review] + workflow_dispatch: + +# Shares the AUT tunnel lock with the channels suite. 
+concurrency: + group: inkbox-live-aut-tunnel + cancel-in-progress: false + +jobs: + voice: + if: > + (github.event_name == 'pull_request' && + github.event.pull_request.draft == false && + github.event.pull_request.head.repo.full_name == github.repository) || + github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + timeout-minutes: 20 + strategy: + max-parallel: 1 + fail-fast: false + matrix: + scenario: [inbound_inkbox, outbound_realtime] + env: + INKBOX_API_KEY: ${{ secrets.CLAUDE_CODE_INKBOX_API_KEY }} + INKBOX_SIGNING_KEY: ${{ secrets.CLAUDE_CODE_INKBOX_SIGNING_KEY }} + CLAUDE_PROJECT_DIR: ${{ github.workspace }} + INKBOX_PERMISSION_TIMEOUT_S: "30" + DISABLE_AUTOUPDATER: "1" + CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: "1" + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + VOICE_DRIVER_STATE: /tmp/voice_driver_state.json + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - uses: actions/setup-node@v4 + with: + node-version: 22 + + # uvicorn[standard] matters: the bare install can't accept WebSocket + # upgrades, and the driver's call-media endpoint is a WebSocket. + - name: Install bridge + driver deps + run: pip install -e . 
pytest fastapi 'uvicorn[standard]' + + - name: Install Claude Code CLI + run: npm install -g @anthropic-ai/claude-code + + - name: Derive AUT identity handle + run: | + HANDLE=$(python3 - <<'PY' + import os + from inkbox import Inkbox + c = Inkbox(api_key=os.environ["INKBOX_API_KEY"]) + print(c.mailboxes.list()[0].email_address.split("@", 1)[0]) + PY + ) + echo "INKBOX_IDENTITY=$HANDLE" >> "$GITHUB_ENV" + echo "AUT handle: $HANDLE" + + - name: Configure speech mode (${{ matrix.scenario }}) + run: | + if [ "${{ matrix.scenario }}" = "outbound_realtime" ]; then + echo "INKBOX_REALTIME_ENABLED=true" >> "$GITHUB_ENV" + echo "INKBOX_REALTIME_MODEL=gpt-realtime-2" >> "$GITHUB_ENV" + echo "INKBOX_REALTIME_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> "$GITHUB_ENV" + else + echo "INKBOX_REALTIME_ENABLED=false" >> "$GITHUB_ENV" + fi + + - name: Start voice driver + env: + REMOTE_INKBOX_API_KEY: ${{ secrets.REMOTE_INKBOX_API_KEY }} + run: | + nohup python3 tests/live/voice_driver.py > /tmp/voice_driver.log 2>&1 & + echo $! > /tmp/voice_driver.pid + for i in $(seq 1 30); do + [ -f "$VOICE_DRIVER_STATE" ] && { echo "driver ready"; exit 0; } + if ! kill -0 "$(cat /tmp/voice_driver.pid)" 2>/dev/null; then + echo "voice driver died during startup"; tail -n 100 /tmp/voice_driver.log; exit 1 + fi + sleep 3 + done + echo "voice driver never became ready"; tail -n 100 /tmp/voice_driver.log; exit 1 + + - name: Start bridge gateway + run: | + nohup inkbox-claude run > /tmp/gateway.log 2>&1 & + echo $! > /tmp/gateway.pid + for i in $(seq 1 36); do + if grep -q "\[bridge\] ready" /tmp/gateway.log; then + echo "gateway ready"; exit 0 + fi + if ! 
kill -0 "$(cat /tmp/gateway.pid)" 2>/dev/null; then + echo "gateway process died during startup"; tail -n 150 /tmp/gateway.log; exit 1 + fi + sleep 5 + done + echo "gateway never became ready"; tail -n 150 /tmp/gateway.log; exit 1 + + - name: Live voice test (${{ matrix.scenario }}) + env: + REMOTE_INKBOX_API_KEY: ${{ secrets.REMOTE_INKBOX_API_KEY }} + CLAUDE_CODE_INKBOX_API_KEY: ${{ secrets.CLAUDE_CODE_INKBOX_API_KEY }} + VOICE_SCENARIO: ${{ matrix.scenario }} + LIVE_REAL_MODEL: "1" + run: python3 -m pytest tests/live/test_voice.py -v + + # Logs can carry live call transcripts โ€” surface them only when needed. + - name: Dump logs on failure + if: failure() + run: | + echo "=== gateway.log ==="; tail -n 300 /tmp/gateway.log 2>/dev/null || true + echo "=== voice_driver.log ==="; tail -n 150 /tmp/voice_driver.log 2>/dev/null || true + + - name: Upload logs on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: live-voice-${{ matrix.scenario }}-logs + path: | + /tmp/gateway.log + /tmp/voice_driver.log + if-no-files-found: ignore + + # Driver first (SIGINT so its cleanup reverts the number's auto-accept), + # then a beat for the revert to land, then the gateway. + - name: Teardown + if: always() + run: | + [ -f /tmp/voice_driver.pid ] && kill -INT "$(cat /tmp/voice_driver.pid)" 2>/dev/null || true + sleep 3 + [ -f /tmp/gateway.pid ] && kill "$(cat /tmp/gateway.pid)" 2>/dev/null || true diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..7594643 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,60 @@ +name: Tests + +on: + push: + branches: [main] + pull_request: + +jobs: + # Offline unit suite โ€” inkbox is mocked in the tests, so install only what + # they import. Runs on every push/PR, drafts included. 
+ unit: + runs-on: ubuntu-latest + timeout-minutes: 10 + strategy: + matrix: + python-version: ["3.10", "3.12"] + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install test deps + run: pip install pytest aiohttp segno claude-agent-sdk + + # tests/contract runs in its own job against the LATEST host, not here. + # tests/live is collected but self-skips without the live API keys. + - name: Test + run: pytest -q --ignore=tests/contract + + # Contract suite โ€” the bridge's view of the host interface, checked against + # the latest published claude-agent-sdk + Claude Code CLI so upstream drift + # fails a PR here instead of a live gateway later. + contract-pr: + runs-on: ubuntu-latest + timeout-minutes: 15 + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - uses: actions/setup-node@v4 + with: + node-version: 22 + + - name: Install bridge + latest SDK + run: | + pip install -e . pytest + pip install -U claude-agent-sdk + + - name: Install latest Claude Code CLI + run: npm install -g @anthropic-ai/claude-code + + - name: Contract tests vs latest host + run: pytest -q tests/contract diff --git a/tests/contract/test_host_interface.py b/tests/contract/test_host_interface.py new file mode 100644 index 0000000..a6b3ba1 --- /dev/null +++ b/tests/contract/test_host_interface.py @@ -0,0 +1,100 @@ +"""Contract tests: the host interface this bridge depends on, against the +INSTALLED claude-agent-sdk + Claude Code CLI. + +Run in CI with the latest published SDK/CLI (not the pinned dev versions) so an +upstream rename, signature change, or removal fails HERE โ€” before it takes down +a live gateway. Everything asserted is something the bridge actually imports, +constructs, or invokes. 
+""" + +from __future__ import annotations + +import shutil +import subprocess +from unittest.mock import MagicMock + +import pytest + + +def test_sdk_exports_every_symbol_the_bridge_imports(): + # Mirrors the imports in sessions.py and tools.py, 1:1. + from claude_agent_sdk import ( # noqa: F401 + AssistantMessage, + ClaudeAgentOptions, + ClaudeSDKClient, + PermissionResultAllow, + PermissionResultDeny, + ResultMessage, + TextBlock, + create_sdk_mcp_server, + tool, + ) + + +def test_options_accept_the_kwargs_the_bridge_passes(): + """Constructing ClaudeAgentOptions with the exact kwargs sessions.py uses + fails loudly if the SDK renames or drops any of them.""" + from claude_agent_sdk import ClaudeAgentOptions, ClaudeSDKClient + + async def _can_use_tool(tool_name, input_data, context): # signature stand-in + raise NotImplementedError + + options = ClaudeAgentOptions( + cwd="/tmp", + model=None, + system_prompt={"type": "preset", "preset": "claude_code", "append": "extra"}, + setting_sources=["user", "project"], + allowed_tools=["Read", "mcp__inkbox__inkbox_whoami"], + mcp_servers={}, + can_use_tool=_can_use_tool, + resume=None, + ) + # The client must construct from those options without connecting. 
+ assert ClaudeSDKClient(options=options) is not None + + +def test_permission_results_construct_like_the_bridge_uses_them(): + from claude_agent_sdk import PermissionResultAllow, PermissionResultDeny + + PermissionResultAllow() + PermissionResultAllow(updated_input={"answers": {}}) + PermissionResultDeny(message="not approved") + + +def test_inkbox_mcp_server_builds_against_installed_sdk(): + """build_inkbox_mcp_server exercises the SDK's ``tool`` decorator and + ``create_sdk_mcp_server`` for every tool the bridge registers.""" + from inkbox_claude.tools import build_inkbox_mcp_server + + server, tool_names = build_inkbox_mcp_server(MagicMock(), "contract-test") + assert server is not None + expected = { + "mcp__inkbox__inkbox_whoami", + "mcp__inkbox__inkbox_send_email", + "mcp__inkbox__inkbox_send_sms", + "mcp__inkbox__inkbox_send_imessage", + "mcp__inkbox__inkbox_place_call", + "mcp__inkbox__inkbox_list_calls", + "mcp__inkbox__inkbox_get_call_transcript", + "mcp__inkbox__inkbox_list_text_conversations", + "mcp__inkbox__inkbox_get_text_conversation", + "mcp__inkbox__inkbox_list_imessage_conversations", + "mcp__inkbox__inkbox_get_imessage_conversation", + "mcp__inkbox__inkbox_lookup_contact", + "mcp__inkbox__inkbox_list_contacts", + "mcp__inkbox__inkbox_get_contact", + "mcp__inkbox__inkbox_create_contact", + "mcp__inkbox__inkbox_update_contact", + "mcp__inkbox__inkbox_delete_contact", + } + assert expected <= set(tool_names) + + +def test_claude_cli_installed_and_answers_version(): + """The SDK drives a ``claude`` subprocess; the CLI must be present and sane.""" + claude = shutil.which("claude") + if claude is None: + pytest.fail("claude CLI not on PATH — the bridge cannot start sessions without it") + out = subprocess.run([claude, "--version"], capture_output=True, text=True, timeout=60) + assert out.returncode == 0, f"claude --version failed: {out.stderr[:300]}" + assert out.stdout.strip(), "claude --version printed nothing" diff --git 
a/tests/live/mock_anthropic.py b/tests/live/mock_anthropic.py new file mode 100644 index 0000000..56f097c --- /dev/null +++ b/tests/live/mock_anthropic.py @@ -0,0 +1,94 @@ +"""Deterministic Anthropic-API mock for live agent tests. + +Claude Code honours ``ANTHROPIC_BASE_URL``, so pointing the bridged sessions at +this server makes the agent "think" here instead of against the real API: no +real key, no tokens, no flakiness, fully deterministic. We still exercise the +entire real pipeline (bridge, tunnel, inbound routing, Claude Code session, +Inkbox send + delivery) — only the LLM brain is faked. + +Every reply contains ``REPLY_OK`` plus, when present, the inbound's smoke nonce, +so a live test can assert the canned content travelled inbound → model → reply → +delivery end to end (and that the agent did NOT fall back to an error message). + +Serves the Messages API (``POST /v1/messages``, streaming and not) and the +token-count endpoint. Run: ``python mock_anthropic.py [port]`` (default 8089). +Stdlib only. +""" + +from __future__ import annotations + +import json +import re +import sys +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer + +_NONCE = re.compile(r"smoke-[0-9a-f]{6,}") + + +def _reply_text(req: dict) -> str: + m = _NONCE.search(json.dumps(req)) + tag = m.group(0) if m else "no-nonce" + return f"REPLY_OK {tag} — automated reachability reply from the agent." 
+ + +class Handler(BaseHTTPRequestHandler): + def log_message(self, *_args): # quiet + pass + + def _send_json(self, code: int, obj: dict) -> None: + body = json.dumps(obj).encode() + self.send_response(code) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def do_GET(self): # noqa: N802 (health / probes) + self._send_json(200, {"ok": True}) + + def _sse(self, event: str, data: dict) -> None: + self.wfile.write(f"event: {event}\ndata: {json.dumps(data)}\n\n".encode()) + + def do_POST(self): # noqa: N802 + n = int(self.headers.get("Content-Length") or 0) + try: + req = json.loads(self.rfile.read(n) or b"{}") + except ValueError: + req = {} + + if self.path.rstrip("/").endswith("/count_tokens"): + self._send_json(200, {"input_tokens": 1}) + return + + text = _reply_text(req) + model = req.get("model", "mock-model") + usage = {"input_tokens": 1, "output_tokens": 1} + if req.get("stream"): + self.send_response(200) + self.send_header("Content-Type", "text/event-stream") + self.end_headers() + self._sse("message_start", {"type": "message_start", "message": { + "id": "msg_mock", "type": "message", "role": "assistant", "model": model, + "content": [], "stop_reason": None, "stop_sequence": None, "usage": usage, + }}) + self._sse("content_block_start", {"type": "content_block_start", "index": 0, + "content_block": {"type": "text", "text": ""}}) + self._sse("content_block_delta", {"type": "content_block_delta", "index": 0, + "delta": {"type": "text_delta", "text": text}}) + self._sse("content_block_stop", {"type": "content_block_stop", "index": 0}) + self._sse("message_delta", {"type": "message_delta", + "delta": {"stop_reason": "end_turn", "stop_sequence": None}, + "usage": {"output_tokens": 1}}) + self._sse("message_stop", {"type": "message_stop"}) + self.wfile.flush() + else: + self._send_json(200, { + "id": "msg_mock", "type": "message", "role": "assistant", 
"model": model, + "content": [{"type": "text", "text": text}], + "stop_reason": "end_turn", "stop_sequence": None, "usage": usage, + }) + + +if __name__ == "__main__": + port = int(sys.argv[1]) if len(sys.argv) > 1 else 8089 + ThreadingHTTPServer(("127.0.0.1", port), Handler).serve_forever() diff --git a/tests/live/test_cross_channel.py b/tests/live/test_cross_channel.py new file mode 100644 index 0000000..fbb584b --- /dev/null +++ b/tests/live/test_cross_channel.py @@ -0,0 +1,196 @@ +"""Live cross-channel suite โ€” the agent answers on a DIFFERENT channel. + +Ask on one channel; the agent must figure out the sender's *other-channel* address +from the contact card and respond there. Each request carries a short token, and we +assert that token shows up on the other channel โ€” proving the response is tied to +the request. + + * email -> SMS : email asks for a text; we poll SMS for the token. + * SMS -> email: SMS asks for an email; we poll email for the token. + +Voice is the odd one out: an unanswered call carries no token, so instead of +matching content we assert that a *new inbound call from the AUT's number* lands +on the driver's number within the window โ€” proof the request reasoned its way to +``inkbox_place_call`` and Inkbox actually dialed the driver. + + * email -> call: email asks the agent to call; we poll the driver's calls. + * SMS -> call: SMS asks the agent to call; we poll the driver's calls. + +More channels (iMessage) get added here. Real-model only. 
+""" + +from __future__ import annotations + +import os +import re +import time +import uuid + +import pytest + +REMOTE_KEY = os.environ.get("REMOTE_INKBOX_API_KEY") +AUT_KEY = os.environ.get("CLAUDE_CODE_INKBOX_API_KEY") +BASE_URL = os.environ.get("INKBOX_BASE_URL", "https://inkbox.ai") +REAL = os.environ.get("LIVE_REAL_MODEL") == "1" +TIMEOUT_S = float(os.environ.get("LIVE_XCHANNEL_TIMEOUT", "200")) +POLL_EVERY_S = 6.0 + +pytestmark = pytest.mark.skipif( + not (REMOTE_KEY and AUT_KEY and REAL), + reason="cross-channel suite: needs both keys + LIVE_REAL_MODEL=1", +) + + +def _digits(s: str) -> str: + return re.sub(r"\D", "", s or "") + + +def _client(key): + from inkbox import Inkbox + + return Inkbox(api_key=key, base_url=BASE_URL) + + +def _token() -> str: + return uuid.uuid4().hex[:6] + + +@pytest.fixture(scope="module") +def xc(): + remote = _client(REMOTE_KEY) + aut = _client(AUT_KEY) + remote_email = remote.mailboxes.list()[0].email_address + aut_email = aut.mailboxes.list()[0].email_address + rnums = remote.phone_numbers.list() + anums = aut.phone_numbers.list() + assert rnums and anums, "both identities need a phone number for cross-channel" + remote_phone, remote_pid = rnums[0].number, str(rnums[0].id) + aut_phone = anums[0].number + + # The agent can only cross channels if the sender's card has BOTH an email and a + # phone. Ensure it does (merge in whatever is missing; never clobber existing data). 
+ from inkbox.contacts.types import ContactEmail, ContactPhone + matches = aut.contacts.lookup(email=remote_email) + if not matches: + aut.contacts.create( + given_name="Penny", family_name="Tester", + emails=[ContactEmail("work", remote_email)], + phones=[ContactPhone("mobile", remote_phone)], + ) + else: + c = matches[0] + emails = list(getattr(c, "emails", [])) + phones = list(getattr(c, "phones", [])) + changed = False + if not any((e.value or "").lower() == remote_email.lower() for e in emails): + emails.append(ContactEmail("work", remote_email)) + changed = True + if not any(_digits(p.value)[-10:] == _digits(remote_phone)[-10:] for p in phones): + phones.append(ContactPhone("mobile", remote_phone)) + changed = True + if changed: + aut.contacts.update(c.id, emails=emails, phones=phones) + + return { + "remote": remote, "aut": aut, + "remote_email": remote_email, "remote_pid": remote_pid, + "aut_email": aut_email, "aut_phone": aut_phone, + } + + +def test_email_request_gets_sms_response(xc): + """Email asks the agent to TEXT a code; the code must arrive over SMS.""" + remote, remote_pid, aut_phone = xc["remote"], xc["remote_pid"], xc["aut_phone"] + token = _token() + tail = _digits(aut_phone)[-10:] + + def _sms_from_aut(): + return [m for m in remote.texts.list(remote_pid, limit=30) + if (getattr(m, "direction", "") or "").lower() == "inbound" + and _digits(getattr(m, "remote_phone_number", "") or "")[-10:] == tail] + + before = {m.id for m in _sms_from_aut()} + remote.messages.send( + xc["remote_email"], to=[xc["aut_email"]], subject=f"[{token}] text me please", + body_text=f"Please send me a text message (SMS) that says: lalala {token}", + ) + + deadline = time.monotonic() + TIMEOUT_S + while time.monotonic() < deadline: + for m in _sms_from_aut(): + if m.id not in before and token in (getattr(m, "text", "") or "").lower(): + return # cross-channel confirmed: email request -> SMS response with the token + time.sleep(POLL_EVERY_S) + pytest.fail(f"agent did not 
send an SMS containing {token!r} within {TIMEOUT_S:.0f}s") + + +def test_sms_request_gets_email_response(xc): + """SMS asks the agent to EMAIL a code; the code must arrive over email.""" + from inkbox.mail.types import MessageDirection + + remote, remote_email, aut_email = xc["remote"], xc["remote_email"], xc["aut_email"] + token = _token() + + def _email_from_aut(): + return [m for m in remote.messages.list(remote_email, direction=MessageDirection.INBOUND) + if aut_email.lower() in (getattr(m, "from_address", "") or "").lower()] + + before = {m.id for m in _email_from_aut()} + remote.texts.send(xc["remote_pid"], to=xc["aut_phone"], text=f"Please email me the code {token}.") + + deadline = time.monotonic() + TIMEOUT_S + while time.monotonic() < deadline: + for m in _email_from_aut(): + if m.id in before: + continue + hay = (getattr(m, "subject", "") or "").lower() + if token not in hay: + body = getattr(remote.messages.get(remote_email, m.id), "body_text", "") or "" + hay = body.lower() + if token in hay: + return # cross-channel confirmed: SMS request -> email response with the token + time.sleep(POLL_EVERY_S) + pytest.fail(f"agent did not send an email containing {token!r} within {TIMEOUT_S:.0f}s") + + +def _inbound_calls_from_aut(remote, remote_pid: str, aut_phone: str): + """The driver's inbound calls originating from the AUT's number.""" + tail = _digits(aut_phone)[-10:] + return [c for c in remote.calls.list(remote_pid, limit=30) + if (getattr(c, "direction", "") or "").lower() == "inbound" + and _digits(getattr(c, "remote_phone_number", "") or "")[-10:] == tail] + + +def _wait_for_new_call(remote, remote_pid: str, aut_phone: str, before: set): + """Block until an inbound call from the AUT with an id not in ``before`` appears. + + ``before`` is the pre-request snapshot, so a stale call can't satisfy the + assertion โ€” same new-id correlation the SMS/email legs use. Fails on timeout. 
+ """ + deadline = time.monotonic() + TIMEOUT_S + while time.monotonic() < deadline: + for c in _inbound_calls_from_aut(remote, remote_pid, aut_phone): + if c.id not in before: + return # a fresh call from the AUT landed on the driver's number + time.sleep(POLL_EVERY_S) + pytest.fail(f"agent did not place a call to the driver within {TIMEOUT_S:.0f}s") + + +def test_email_request_gets_call(xc): + """Email asks the agent to CALL; a new inbound call must land on the driver.""" + remote, remote_pid, aut_phone = xc["remote"], xc["remote_pid"], xc["aut_phone"] + # Snapshot BEFORE sending so a pre-existing call can't be mistaken for the reply. + before = {c.id for c in _inbound_calls_from_aut(remote, remote_pid, aut_phone)} + remote.messages.send( + xc["remote_email"], to=[xc["aut_email"]], subject="please call me", + body_text="Please place a phone call to my number now — I'd rather talk than type.", + ) + _wait_for_new_call(remote, remote_pid, aut_phone, before) + + +def test_sms_request_gets_call(xc): + """SMS asks the agent to CALL; a new inbound call must land on the driver.""" + remote, remote_pid, aut_phone = xc["remote"], xc["remote_pid"], xc["aut_phone"] + before = {c.id for c in _inbound_calls_from_aut(remote, remote_pid, aut_phone)} + remote.texts.send(remote_pid, to=aut_phone, text="Call me please — give me a ring now.") + _wait_for_new_call(remote, remote_pid, aut_phone, before) diff --git a/tests/live/test_cross_channel.py b/tests/live/test_cross_channel.py new file mode 100644 index 0000000..fbb584b --- /dev/null +++ b/tests/live/test_cross_channel.py @@ -0,0 +1,196 @@ diff --git a/tests/live/test_email_intelligence.py b/tests/live/test_email_intelligence.py new file mode 100644 index 0000000..134b6b8 --- /dev/null +++ b/tests/live/test_email_intelligence.py @@ -0,0 +1,269 @@ +"""Live intelligence suite over email — the agent's REAL brain + tools. 
+ +Runs against a real Claude model (``LIVE_REAL_MODEL=1``, real key) so it proves +the agent actually reasons and uses its Inkbox tools/data — not a mock. 
A remote +identity emails questions; we verify the replies against values looked up live +through the API keys (NO hardcoded expectations): + + * basic — answers a simple question (sanity). + * own identity — reports its own handle / email / phone (looked up via the AUT key). + * sender — reports who the sender is, from the contact card it can see + (looked up via the AUT key). + * tools — names its real Inkbox tools (scraped from the tool sources). + * contact CRUD — with LIVE_CONTACT_CRUD=1, creates/updates/deletes a + temporary contact through the real agent loop. + +Skipped unless both keys + LIVE_REAL_MODEL=1 are set. +""" + +from __future__ import annotations + +import os +import re +import time +import uuid +from pathlib import Path + +import pytest + +REMOTE_KEY = os.environ.get("REMOTE_INKBOX_API_KEY") +AUT_KEY = os.environ.get("CLAUDE_CODE_INKBOX_API_KEY") +BASE_URL = os.environ.get("INKBOX_BASE_URL", "https://inkbox.ai") +TIMEOUT_S = float(os.environ.get("LIVE_EMAIL_TIMEOUT", "150")) +POLL_EVERY_S = 5.0 +# "i hit an error" is the bridge's canned failed-turn reply. +ERROR_MARKERS = ("non-retryable error", "missing authentication", "http 401", "http 403", "traceback", + "i hit an error") + +pytestmark = pytest.mark.skipif( + not (REMOTE_KEY and AUT_KEY and os.environ.get("LIVE_REAL_MODEL") == "1"), + reason="real-model intelligence suite: needs both keys + LIVE_REAL_MODEL=1", +) + + +def _digits(s: str) -> str: + return re.sub(r"\D", "", s or "") + + +def _phone_present(phone: str, body: str) -> bool: + """True if the agent reported ``phone`` in ``body``. + + Accepts either the full number (all digits present) or a privacy-masked + form the model tends to emit in formal identity listings, where it keeps a + leading prefix + the last 4 and masks the middle (e.g. ``+192****3235``). + The masked branch requires a run of mask chars immediately followed by the + real last-4, so it won't false-match on markdown bold (``**name:**``). 
+ """ + want = _digits(phone) + if want[-10:] in _digits(body): + return True + tail = re.escape(want[-4:]) + return bool(re.search(r"[*xXโ€ขยท]{2,}\D{0,2}" + tail, body)) + + +def _mailbox(client) -> str: + boxes = client.mailboxes.list() + assert boxes, "identity has no mailbox" + return boxes[0].email_address + + +def _first_phone(client) -> str | None: + nums = client.phone_numbers.list() + return nums[0].number if nums else None + + +def _client(key): + from inkbox import Inkbox + + return Inkbox(api_key=key, base_url=BASE_URL) + + +def _plugin_tool_names() -> list[str]: + """Tool names the bridge registers, scraped from the tool sources โ€” + tracks the code without a hand-kept list.""" + src = Path(__file__).resolve().parents[2] / "inkbox_claude" / "tools.py" + return sorted(set(re.findall(r'@tool\(\s*"(inkbox_[a-z0-9_]+)"', src.read_text()))) + + +def _ask(remote, aut_email: str, remote_email: str, question: str) -> str: + """Email the agent a question; return the reply body (lowercased).""" + from inkbox.mail.types import MessageDirection + + nonce = f"smoke-{uuid.uuid4().hex[:8]}" + sent = remote.messages.send(remote_email, to=[aut_email], subject=f"[{nonce}] {question[:40]}", body_text=question) + thread_id = str(getattr(sent, "thread_id", "") or "") + + def _is_reply(msg) -> bool: + if thread_id and str(getattr(msg, "thread_id", "") or "") == thread_id: + return True + frm = (getattr(msg, "from_address", "") or "").lower() + return aut_email.lower() in frm and nonce in (getattr(msg, "subject", "") or "") + + deadline = time.monotonic() + TIMEOUT_S + while time.monotonic() < deadline: + for msg in remote.messages.list(remote_email, direction=MessageDirection.INBOUND): + if _is_reply(msg): + body = getattr(remote.messages.get(remote_email, msg.id), "body_text", "") or "" + bad = [m for m in ERROR_MARKERS if m in body.lower()] + assert not bad, f"reply is an error, not a real answer: {bad}\n{body[:300]}" + return body.lower() + time.sleep(POLL_EVERY_S) + 
pytest.fail(f"no reply within {TIMEOUT_S:.0f}s to: {question!r}") + + +@pytest.fixture(scope="module") +def ctx(): + remote = _client(REMOTE_KEY) + aut = _client(AUT_KEY) + return { + "remote": remote, + "aut": aut, + "remote_email": _mailbox(remote), + "aut_email": _mailbox(aut), + } + + +def test_basic_reply(ctx): + body = _ask(ctx["remote"], ctx["aut_email"], ctx["remote_email"], + "Please reply with a one-sentence acknowledgement that you received this email.") + assert len(body.strip()) > 0, "empty reply" + + +def test_reports_own_identity(ctx): + aut = ctx["aut"] + handle = _mailbox(aut).split("@", 1)[0] + aut_email = ctx["aut_email"] + aut_phone = _first_phone(aut) + assert aut_phone, "AUT identity has no phone number to report" + + body = _ask(ctx["remote"], aut_email, ctx["remote_email"], + "What is your full Inkbox identity? Reply with your handle, display " + "name, email address, and phone number. Write the phone number in " + "full โ€” every digit, with no masking, asterisks, or abbreviation.") + assert handle in body, f"reply missing handle {handle!r}\n{body[:400]}" + assert aut_email in body, f"reply missing email {aut_email!r}\n{body[:400]}" + # Accept a privacy-masked phone (the model self-redacts the middle digits + # in formal listings) as well as full. + assert _phone_present(aut_phone, body), f"reply missing phone {aut_phone!r}\n{body[:400]}" + + +def test_reports_sender_details(ctx): + """The agent must report who the sender is, from the contact card it can see.""" + aut, remote = ctx["aut"], ctx["remote"] + remote_email = ctx["remote_email"] + + # Look up (or seed) the sender's contact in the AUT org โ€” the card the agent sees. 
+ matches = aut.contacts.lookup(email=remote_email) + if not matches: + from inkbox.contacts.types import ContactEmail, ContactPhone + rphone = _first_phone(remote) + aut.contacts.create( + given_name="Penny", + family_name="Tester", + emails=[ContactEmail(label="work", value=remote_email)], + phones=[ContactPhone(label="mobile", value=rphone)] if rphone else None, + ) + matches = aut.contacts.lookup(email=remote_email) + assert matches, "could not establish a contact card for the sender" + contact = matches[0] + name = (getattr(contact, "preferred_name", None) or getattr(contact, "given_name", None) or "") + emails = [e.value for e in getattr(contact, "emails", [])] + phones = [p.value for p in getattr(contact, "phones", [])] + + body = _ask(ctx["remote"], ctx["aut_email"], remote_email, + "Who am I to you? Tell me everything you have on file about me. " + "Include my phone number in full โ€” every digit, with no masking, " + "asterisks, or abbreviation.") + if name: + assert name.lower() in body, f"reply missing sender name {name!r}\n{body[:400]}" + assert any(e.lower() in body for e in emails), f"reply missing sender email {emails}\n{body[:400]}" + if phones: + # Accept full or privacy-masked (see _phone_present). 
+ assert any(_phone_present(p, body) for p in phones), \ + f"reply missing sender phone {phones}\n{body[:400]}" + + +def test_aware_of_inkbox_tools(ctx): + """Non-LLM proof the agent is wired with real tools: it names them.""" + tool_names = _plugin_tool_names() + assert tool_names, "no inkbox_* tool names found in inkbox_claude/tools.py" + contact_tools = { + "inkbox_lookup_contact", + "inkbox_list_contacts", + "inkbox_get_contact", + "inkbox_create_contact", + "inkbox_update_contact", + "inkbox_delete_contact", + } + assert contact_tools <= set(tool_names) + + body = _ask(ctx["remote"], ctx["aut_email"], ctx["remote_email"], + "List the exact names of all the Inkbox tools you have access to, one per line.") + hits = [t for t in tool_names if t.lower() in body] + assert len(hits) >= 3, f"agent named only {hits} of its tools {tool_names}\n{body[:500]}" + missing_contacts = sorted(t for t in contact_tools if t.lower() not in body) + assert not missing_contacts, f"agent did not name contact tools {missing_contacts}\n{body[:500]}" + + +def _contacts_by_email(client, email: str): + return list(client.contacts.lookup(email=email) or []) + + +def _delete_contacts_by_email(client, email: str) -> None: + for contact in _contacts_by_email(client, email): + contact_id = str(getattr(contact, "id", "") or "") + if contact_id: + client.contacts.delete(contact_id) + + +@pytest.mark.skipif( + os.environ.get("LIVE_CONTACT_CRUD") != "1", + reason="mutating contact CRUD live test: set LIVE_CONTACT_CRUD=1 to opt in", +) +def test_contact_crud_tool_use(ctx): + """The real agent can reason about and use contact write tools end to end.""" + aut = ctx["aut"] + nonce = f"cc-live-{uuid.uuid4().hex[:8]}" + contact_name = f"Claude Live {nonce}" + contact_email = f"{nonce}@example.com" + updated_notes = f"updated-notes-{nonce}" + + _delete_contacts_by_email(aut, contact_email) + try: + created = _ask( + ctx["remote"], + ctx["aut_email"], + ctx["remote_email"], + "Use inkbox_create_contact 
now. Create a new contact named " + f"{contact_name} with email {contact_email}. Do not just describe the action. " + f"After the tool succeeds, reply exactly: CREATED {nonce}", + ) + assert "created" in created and nonce in created, created[:500] + matches = _contacts_by_email(aut, contact_email) + assert matches, f"agent said it created {contact_email}, but lookup found nothing" + contact_id = str(getattr(matches[0], "id", "") or "") + assert contact_id, f"created contact has no id: {matches[0]!r}" + + updated = _ask( + ctx["remote"], + ctx["aut_email"], + ctx["remote_email"], + "Use inkbox_update_contact now. Update contactId " + f"{contact_id} and set notes to {updated_notes}. Do not create a second contact. " + f"After the tool succeeds, reply exactly: UPDATED {nonce}", + ) + assert "updated" in updated and nonce in updated, updated[:500] + fetched = aut.contacts.get(contact_id) + assert updated_notes.lower() in str(getattr(fetched, "notes", "") or "").lower() + + deleted = _ask( + ctx["remote"], + ctx["aut_email"], + ctx["remote_email"], + "I confirm this is a temporary test contact. Use inkbox_delete_contact now " + f"to delete contactId {contact_id}. After the tool succeeds, reply exactly: DELETED {nonce}", + ) + assert "deleted" in deleted and nonce in deleted, deleted[:500] + assert not _contacts_by_email(aut, contact_email) + finally: + _delete_contacts_by_email(aut, contact_email) diff --git a/tests/live/test_email_reply.py b/tests/live/test_email_reply.py new file mode 100644 index 0000000..6dda45c --- /dev/null +++ b/tests/live/test_email_reply.py @@ -0,0 +1,97 @@ +"""Live test: the agent emails back โ€” and the reply is real, not an error. + +A *remote* Inkbox identity emails the agent-under-test (AUT). The AUT's running +bridge routes it into a Claude Code session that "thinks" against a deterministic +mock model (see mock_anthropic.py โ€” no real LLM, so this is repeatable and free), +and emails a reply. 
+ +We assert two independent things so a broken setup can't pass: + 1. delivery โ€” a reply lands in the remote mailbox, tracked by thread_id; + 2. content โ€” the reply body carries the mock's ``REPLY_OK `` marker and + contains NO error strings (this is what catches the agent emailing + back a model-auth 401 instead of a real reply). + +Skipped unless both API keys are present, so it never runs in the offline suite. +Requires the AUT bridge to already be running (the workflow starts it). +""" + +from __future__ import annotations + +import os +import time +import uuid + +import pytest + +REMOTE_KEY = os.environ.get("REMOTE_INKBOX_API_KEY") +AUT_KEY = os.environ.get("CLAUDE_CODE_INKBOX_API_KEY") +BASE_URL = os.environ.get("INKBOX_BASE_URL", "https://inkbox.ai") +TIMEOUT_S = float(os.environ.get("LIVE_EMAIL_TIMEOUT", "120")) +POLL_EVERY_S = 5.0 + +# Strings that mean the agent replied with a failure instead of a real answer. +ERROR_MARKERS = ("non-retryable error", "missing authentication", "http 401", "http 403", "traceback", + "i hit an error") + +pytestmark = pytest.mark.skipif( + not (REMOTE_KEY and AUT_KEY) or os.environ.get("LIVE_REAL_MODEL") == "1", + reason="mock-model reachability test (needs both keys; skipped in real-model mode)", +) + + +def _mailbox(client) -> str: + boxes = client.mailboxes.list() + assert boxes, "identity has no mailbox" + return boxes[0].email_address + + +def test_email_reachability(): + from inkbox import Inkbox + from inkbox.mail.types import MessageDirection + + remote = Inkbox(api_key=REMOTE_KEY, base_url=BASE_URL) + aut = Inkbox(api_key=AUT_KEY, base_url=BASE_URL) + + remote_email = _mailbox(remote) + aut_email = _mailbox(aut) + assert remote_email.lower() != aut_email.lower(), "remote and AUT must be different identities" + + nonce = f"smoke-{uuid.uuid4().hex[:8]}" + subject = f"[{nonce}] are you there?" 
+ sent = remote.messages.send( + remote_email, + to=[aut_email], + subject=subject, + body_text="This is an automated reachability check โ€” please reply to this email to confirm.", + ) + thread_id = str(getattr(sent, "thread_id", "") or "") + + # Poll the remote mailbox for the AUT's reply โ€” match on thread_id (preferred), + # falling back to sender + nonce when the send didn't surface a thread id. + def _is_reply(msg) -> bool: + if thread_id and str(getattr(msg, "thread_id", "") or "") == thread_id: + return True + frm = (getattr(msg, "from_address", "") or "").lower() + subj = getattr(msg, "subject", "") or "" + return aut_email.lower() in frm and nonce in subj + + deadline = time.monotonic() + TIMEOUT_S + reply = None + while time.monotonic() < deadline and reply is None: + for msg in remote.messages.list(remote_email, direction=MessageDirection.INBOUND): + if _is_reply(msg): + reply = msg + break + if reply is None: + time.sleep(POLL_EVERY_S) + + # (1) delivery + assert reply is not None, f"no reply within {TIMEOUT_S:.0f}s โ€” inbound routing or reply send is broken" + + # (2) content is a real reply, not an error fallback + detail = remote.messages.get(remote_email, reply.id) + body = ((getattr(detail, "body_text", "") or "") + " " + (getattr(reply, "subject", "") or "")).lower() + bad = [m for m in ERROR_MARKERS if m in body] + assert not bad, f"reply delivered but the body is an error, not a real answer: {bad}\n{body[:300]}" + assert "reply_ok" in body, f"reply delivered but missing the mock marker REPLY_OK:\n{body[:300]}" + assert nonce in body, f"reply did not echo the request nonce {nonce} โ€” agent may not have read the inbound" diff --git a/tests/live/test_sms.py b/tests/live/test_sms.py new file mode 100644 index 0000000..8bc6988 --- /dev/null +++ b/tests/live/test_sms.py @@ -0,0 +1,159 @@ +"""Live SMS suite โ€” the same questions as the email suite, over real SMS. 
+ +SMS differs from email: agent-to-agent SMS skips the START opt-in (the server +bypasses it for inter-agent traffic), and outbound SMS is subject to carrier + +spam filtering โ€” so prompts ask for SHORT replies and avoid spammy content. + + * mock leg โ†’ reachability (deterministic ``REPLY_OK`` from the mock model). + * real leg โ†’ intelligence: basic, own identity, sender, tools. + +Skipped unless both keys are set. Replies are matched by *new* inbound message id +from the AUT's number (robust to clock skew). +""" + +from __future__ import annotations + +import os +import re +import time +from pathlib import Path + +import pytest + +REMOTE_KEY = os.environ.get("REMOTE_INKBOX_API_KEY") +AUT_KEY = os.environ.get("CLAUDE_CODE_INKBOX_API_KEY") +BASE_URL = os.environ.get("INKBOX_BASE_URL", "https://inkbox.ai") +REAL = os.environ.get("LIVE_REAL_MODEL") == "1" +TIMEOUT_S = float(os.environ.get("LIVE_SMS_TIMEOUT", "180")) +POLL_EVERY_S = 6.0 +# "i hit an error" is the bridge's canned failed-turn reply. 
+ERROR_MARKERS = ("non-retryable error", "missing authentication", "http 401", "http 403", "traceback", + "i hit an error") + +pytestmark = pytest.mark.skipif( + not (REMOTE_KEY and AUT_KEY), + reason="live SMS suite: needs REMOTE_INKBOX_API_KEY + CLAUDE_CODE_INKBOX_API_KEY", +) +real_only = pytest.mark.skipif(not REAL, reason="intelligence runs in the real-model leg") +mock_only = pytest.mark.skipif(REAL, reason="reachability runs in the mock-model leg") + + +def _digits(s: str) -> str: + return re.sub(r"\D", "", s or "") + + +def _client(key): + from inkbox import Inkbox + + return Inkbox(api_key=key, base_url=BASE_URL) + + +def _phone(client): + nums = client.phone_numbers.list() + assert nums, "identity has no phone number" + return nums[0].number, str(nums[0].id) + + +def _plugin_tool_names() -> list[str]: + """Tool names the bridge registers, scraped from the tool sources โ€” + tracks the code without a hand-kept list.""" + src = Path(__file__).resolve().parents[2] / "inkbox_claude" / "tools.py" + return sorted(set(re.findall(r'@tool\(\s*"(inkbox_[a-z0-9_]+)"', src.read_text()))) + + +@pytest.fixture(scope="module") +def sms(): + remote = _client(REMOTE_KEY) + aut = _client(AUT_KEY) + aut_phone, _aut_pid = _phone(aut) + _remote_phone, remote_pid = _phone(remote) + # No opt-in/START needed: the server bypasses the missing-opt-in gate for + # inter-agent traffic (the recipient is an Inkbox-managed number). Only an + # explicit STOP/opt-out would block. + return {"remote": remote, "aut": aut, "aut_phone": aut_phone, "remote_pid": remote_pid} + + +def _ask_sms(sms, text: str) -> str: + """Text the agent; return the reply body (lowercased), matched by new message id. + + The agent sometimes emits a trailing *second* SMS for the PREVIOUS question + (a duplicate "OK", or a masked + unmasked identity pair) that lands a few + seconds late. Matching on "any new inbound id after I sent" would let that + leftover leak into the next question's match. 
So before sending we first + drain the inbound conversation to a quiet state โ€” polling until the id-set + stops growing โ€” which folds any in-flight trailing reply into ``before``. + """ + remote, aut_phone, pid = sms["remote"], sms["aut_phone"], sms["remote_pid"] + tail = _digits(aut_phone)[-10:] + + def _inbound_from_aut(): + out = [] + for m in remote.texts.list(pid, limit=30): + if (getattr(m, "direction", "") or "").lower() == "inbound" \ + and _digits(getattr(m, "remote_phone_number", "") or "")[-10:] == tail: + out.append(m) + return out + + # Settle: wait until no new inbound arrives for one quiet poll, so a trailing + # reply to the prior question is captured in `before` instead of mis-matched. + before = {m.id for m in _inbound_from_aut()} + quiet_deadline = time.monotonic() + 2 * POLL_EVERY_S + while time.monotonic() < quiet_deadline: + time.sleep(POLL_EVERY_S) + now_ids = {m.id for m in _inbound_from_aut()} + if now_ids == before: + break + before = now_ids + + remote.texts.send(pid, to=aut_phone, text=text) + + deadline = time.monotonic() + TIMEOUT_S + while time.monotonic() < deadline: + for m in _inbound_from_aut(): + if m.id not in before: + body = getattr(m, "text", "") or "" + bad = [x for x in ERROR_MARKERS if x in body.lower()] + assert not bad, f"SMS reply is an error, not a real answer: {bad}\n{body[:200]}" + return body.lower() + time.sleep(POLL_EVERY_S) + pytest.fail(f"no SMS reply within {TIMEOUT_S:.0f}s to: {text!r}") + + +@mock_only +def test_sms_reachability(sms): + body = _ask_sms(sms, "ping") + assert "reply_ok" in body, f"mock reachability: missing REPLY_OK marker\n{body[:200]}" + + +@real_only +def test_sms_basic_reply(sms): + body = _ask_sms(sms, "Please reply OK to confirm you got this text.") + assert len(body.strip()) > 0, "empty reply" + + +@real_only +def test_sms_reports_own_identity(sms): + aut_email = sms["aut"].mailboxes.list()[0].email_address + body = _ask_sms(sms, "Reply with just your Inkbox email address and phone 
number โ€” short.") + assert aut_email in body, f"reply missing email {aut_email!r}\n{body[:200]}" + + +@real_only +def test_sms_reports_sender_details(sms): + aut, remote = sms["aut"], sms["remote"] + remote_email = remote.mailboxes.list()[0].email_address + matches = aut.contacts.lookup(email=remote_email) + if not matches: + pytest.skip("no contact card for the sender to report") + name = (getattr(matches[0], "preferred_name", None) or getattr(matches[0], "given_name", None) or "") + body = _ask_sms(sms, "Who am I to you? Tell me what you have on file about me.") + if name: + assert name.lower() in body, f"reply missing sender name {name!r}\n{body[:200]}" + + +@real_only +def test_sms_aware_of_inkbox_tools(sms): + tool_names = _plugin_tool_names() + body = _ask_sms(sms, "Name three of your Inkbox tools (exact names).") + hits = [t for t in tool_names if t.lower() in body] + assert len(hits) >= 2, f"agent named only {hits} of its tools\n{body[:300]}" diff --git a/tests/live/test_voice.py b/tests/live/test_voice.py new file mode 100644 index 0000000..ccd1824 --- /dev/null +++ b/tests/live/test_voice.py @@ -0,0 +1,153 @@ +"""Live voice-call suite โ€” real phone calls, real model, transcript-verified. + +Two scenarios, each run against a bridge booted in the matching speech mode (the +workflow sets that up and selects the scenario via VOICE_SCENARIO): + + * inbound_inkbox โ€” the driver calls the agent; the agent answers with Inkbox + STT/TTS and holds a turn. + * outbound_realtime โ€” the driver texts "call me"; the agent places a call back, + powered by the realtime API, and holds a turn. + +A companion driver process (voice_driver.py) bridges the driver's side of the call +over an Inkbox tunnel and speaks one line. We then read the stored call transcript +and assert both parties spoke โ€” proving the agent reached the caller out loud. 
+""" + +from __future__ import annotations + +import json +import os +import re +import time + +import pytest + +REMOTE_KEY = os.environ.get("REMOTE_INKBOX_API_KEY") +AUT_KEY = os.environ.get("CLAUDE_CODE_INKBOX_API_KEY") +BASE_URL = os.environ.get("INKBOX_BASE_URL", "https://inkbox.ai") +REAL = os.environ.get("LIVE_REAL_MODEL") == "1" +SCENARIO = os.environ.get("VOICE_SCENARIO", "") +STATE_FILE = os.environ.get("VOICE_DRIVER_STATE", "/tmp/voice_driver_state.json") +TIMEOUT_S = float(os.environ.get("LIVE_VOICE_TIMEOUT", "220")) +POLL_EVERY_S = 6.0 + +pytestmark = pytest.mark.skipif( + not (REMOTE_KEY and AUT_KEY and REAL), + reason="voice suite: needs both keys + LIVE_REAL_MODEL=1", +) + + +def _digits(s: str) -> str: + return re.sub(r"\D", "", s or "") + + +def _client(key): + from inkbox import Inkbox + + return Inkbox(api_key=key, base_url=BASE_URL) + + +def _driver_state() -> dict: + with open(STATE_FILE) as fh: + return json.load(fh) + + +def _aut_phone(aut) -> str: + nums = aut.phone_numbers.list() + assert nums, "AUT identity has no phone number" + return nums[0].number + + +def _segments(remote, number_id, call_id): + """Transcript segments for a call, split by who spoke.""" + segs = remote.transcripts.list(number_id, call_id) + rem = [s for s in segs if (getattr(s, "party", "") or "").lower() == "remote" and (s.text or "").strip()] + loc = [s for s in segs if (getattr(s, "party", "") or "").lower() == "local" and (s.text or "").strip()] + return segs, rem, loc + + +def _wait_for_two_way_call(remote, number_id, call_id): + """Block until the call transcript shows BOTH the agent and the driver spoke.""" + deadline = time.monotonic() + TIMEOUT_S + last = "" + while time.monotonic() < deadline: + try: + _all, rem, loc = _segments(remote, number_id, call_id) + except Exception as exc: # transcripts may 404 until the call is set up + last = f"transcripts not ready: {exc!r}" + time.sleep(POLL_EVERY_S) + continue + if rem and loc: + agent_said = " | 
".join(s.text.strip() for s in rem) + return agent_said # the agent reached the caller out loud, in a two-way call + last = f"segments so far: remote={len(rem)} local={len(loc)}" + time.sleep(POLL_EVERY_S) + pytest.fail(f"agent never held a two-way call within {TIMEOUT_S:.0f}s ({last})") + + +def _aut_speech_mode(aut, direction, driver_number): + """(use_inkbox_tts, use_inkbox_stt) of the agent's most recent answered call + in `direction` with the driver. Tells Inkbox STT/TTS (True/True) from realtime + (False/False), so each leg can prove it ran the speech path it claims.""" + num_id = str(aut.phone_numbers.list()[0].id) + tail = _digits(driver_number)[-10:] + answered = [c for c in aut.calls.list(num_id, limit=10) + if (getattr(c, "direction", "") or "").lower() == direction + and _digits(getattr(c, "remote_phone_number", "") or "")[-10:] == tail + and c.use_inkbox_tts is not None] + assert answered, f"no answered {direction} agent call with the driver found" + c = answered[0] # newest first + return c.use_inkbox_tts, c.use_inkbox_stt + + +@pytest.mark.skipif(SCENARIO != "inbound_inkbox", reason="inbound Inkbox STT/TTS leg only") +def test_inbound_call_inkbox_tts_stt(): + """Driver calls the agent; the agent answers via Inkbox STT/TTS and replies.""" + st = _driver_state() + remote, aut = _client(REMOTE_KEY), _client(AUT_KEY) + aut_phone = _aut_phone(aut) + + # Place the call to the agent, handing Inkbox the driver's own media WS. 
+ call = remote.calls.place( + from_number=st["number"], to_number=aut_phone, client_websocket_url=st["ws_url"], + ) + agent_said = _wait_for_two_way_call(remote, st["number_id"], call.id) + assert agent_said, "agent produced no speech on the inbound call" + + tts, stt = _aut_speech_mode(aut, "inbound", st["number"]) + assert tts and stt, f"inbound call should run Inkbox STT/TTS, got tts={tts} stt={stt}" + + +@pytest.mark.skipif(SCENARIO != "outbound_realtime", reason="outbound realtime leg only") +def test_outbound_call_realtime(): + """Driver texts 'call me'; the agent places a realtime-powered call and replies.""" + st = _driver_state() + remote, aut = _client(REMOTE_KEY), _client(AUT_KEY) + aut_phone = _aut_phone(aut) + tail = _digits(aut_phone)[-10:] + + def _inbound_from_aut(): + return [c for c in remote.calls.list(st["number_id"], limit=30) + if (getattr(c, "direction", "") or "").lower() == "inbound" + and _digits(getattr(c, "remote_phone_number", "") or "")[-10:] == tail] + + before = {c.id for c in _inbound_from_aut()} + remote.texts.send(st["number_id"], to=aut_phone, text="Please call me right now by phone โ€” give me a ring.") + + # Wait for the agent to dial back, then verify the call transcript. 
+ deadline = time.monotonic() + TIMEOUT_S + call_id = None + while time.monotonic() < deadline: + fresh = [c for c in _inbound_from_aut() if c.id not in before] + if fresh: + call_id = fresh[0].id + break + time.sleep(POLL_EVERY_S) + assert call_id, f"agent never placed a call back within {TIMEOUT_S:.0f}s" + + agent_said = _wait_for_two_way_call(remote, st["number_id"], call_id) + assert agent_said, "agent produced no speech on the outbound call" + + tts, stt = _aut_speech_mode(aut, "outbound", st["number"]) + assert tts is False and stt is False, \ + f"outbound call must be powered by the realtime API (Inkbox speech off), got tts={tts} stt={stt}" diff --git a/tests/live/voice_driver.py b/tests/live/voice_driver.py new file mode 100644 index 0000000..f1ff2a3 --- /dev/null +++ b/tests/live/voice_driver.py @@ -0,0 +1,172 @@ +"""Live voice-call driver: the peer on the other end of a real phone call. + +Opens an Inkbox tunnel for the driver identity, serves the call-media WebSocket +behind it, and bridges audio in Inkbox STT/TTS mode (text frames only โ€” no local +model). It speaks one scripted line so the agent under test gets a turn, and the +call transcript (read separately by the test) proves the agent replied. + +Run as a standalone process alongside the gateway. On startup it writes a small +JSON state file (its public WS URL + phone-number id) that the test reads to place +or expect a call. Two call directions are supported by the same bridge: + * the test places a call to the agent and passes this driver's WS URL, or + * the agent calls this driver's number, which is set to auto-accept onto the + same WS URL. 
+ +Env: + REMOTE_INKBOX_API_KEY driver identity key (identity-scoped) + INKBOX_BASE_URL API root (default https://inkbox.ai) + VOICE_DRIVER_PORT local port the tunnel forwards to (default 8090) + VOICE_DRIVER_STATE path to write the JSON state file + VOICE_DRIVER_LINE the one line the driver speaks (default below) +""" + +from __future__ import annotations + +import json +import logging +import os +import threading +import time +from pathlib import Path + +import uvicorn +from fastapi import FastAPI, WebSocket +from starlette.websockets import WebSocketState + +from inkbox import Inkbox +from inkbox.tunnels.client import connect as tunnel_connect + +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s driver %(message)s") +log = logging.getLogger("voice_driver") + +API_KEY = os.environ["REMOTE_INKBOX_API_KEY"] +BASE_URL = os.environ.get("INKBOX_BASE_URL", "https://inkbox.ai") +PORT = int(os.environ.get("VOICE_DRIVER_PORT", "8090")) +STATE_FILE = os.environ.get("VOICE_DRIVER_STATE", "/tmp/voice_driver_state.json") +LINE = os.environ.get( + "VOICE_DRIVER_LINE", + "Hi, this is a quick test call. Please reply out loud with one short sentence, then say goodbye.", +) +# Speak shortly after the pipeline is ready so the agent's greeting lands first. +SPEAK_AFTER_S = float(os.environ.get("VOICE_DRIVER_SPEAK_AFTER", "3")) +# Then give the agent a turn and hang up โ€” a dropped WS does NOT end the call, so we +# must send an explicit stop or the leg lingers until the server max-duration cap. +LISTEN_S = float(os.environ.get("VOICE_DRIVER_LISTEN", "12")) + +app = FastAPI() + + +@app.get("/health") +async def health() -> dict: + return {"ok": True} + + +@app.websocket("/phone/media/ws") +async def phone_media_ws(ws: WebSocket) -> None: + """Accept the call-media WS in Inkbox STT/TTS mode and run one scripted turn.""" + import asyncio + + # Opt into Inkbox-managed speech both ways โ†’ we exchange text, not audio. 
+ await ws.accept(headers=[ + (b"x-use-inkbox-text-to-speech", b"true"), + (b"x-use-inkbox-speech-to-text", b"true"), + ]) + log.info("call WS accepted") + spoke = asyncio.Event() + convo: asyncio.Task | None = None + + async def _speak(text: str) -> None: + if spoke.is_set(): + return + spoke.set() + await ws.send_text(json.dumps({"event": "text", "delta": text})) + await ws.send_text(json.dumps({"event": "text", "done": True})) + log.info("spoke: %s", text) + + async def _run_turn() -> None: + # Speak one line, give the agent a turn, then hang up so the call ends fast. + await asyncio.sleep(SPEAK_AFTER_S) + await _speak(LINE) + await asyncio.sleep(LISTEN_S) + try: + await ws.send_text(json.dumps({"event": "stop"})) + log.info("sent stop (hangup)") + except Exception: + pass + + try: + while True: + raw = await ws.receive_text() + ev = json.loads(raw) + kind = ev.get("event") + if kind == "start": + log.info("call start: %s", ev.get("stream_id")) + convo = asyncio.create_task(_run_turn()) + elif kind == "transcript" and ev.get("is_final"): + log.info("heard (final): %s", ev.get("text")) + await _speak(LINE) # speak now if the greeting beat our timer + elif kind == "stop": + log.info("call stop: %s", ev.get("reason")) + break + except Exception as exc: # noqa: BLE001 โ€” never let the bridge crash the process + log.info("WS loop ended: %r", exc) + finally: + if convo: + convo.cancel() + if ws.client_state != WebSocketState.DISCONNECTED: + try: + await ws.close() + except Exception: + pass + + +def _run_uvicorn() -> uvicorn.Server: + server = uvicorn.Server(uvicorn.Config(app, host="127.0.0.1", port=PORT, log_level="warning")) + threading.Thread(target=server.run, name="uvicorn", daemon=True).start() + deadline = time.monotonic() + 10 + while time.monotonic() < deadline: + if server.started: + return server + time.sleep(0.05) + raise RuntimeError("uvicorn did not start") + + +def main() -> None: + client = Inkbox(api_key=API_KEY, base_url=BASE_URL) + handle = 
client.mailboxes.list()[0].email_address.split("@", 1)[0] # tunnel name = handle + num = client.phone_numbers.list()[0] + log.info("driver identity %s number %s", handle, num.number) + + server = _run_uvicorn() + + listener = tunnel_connect( + client, name=handle, forward_to=f"http://127.0.0.1:{PORT}", + state_dir=f"/tmp/inkbox-tunnel-{handle}", + ) + public_host = listener.tunnel.public_host + ws_url = f"wss://{public_host}/phone/media/ws" + log.info("tunnel ready: %s", ws_url) + + # Auto-accept inbound calls (agent โ†’ driver) straight onto this WS. + prev_action = getattr(num, "incoming_call_action", None) + client.phone_numbers.update(num.id, incoming_call_action="auto_accept", client_websocket_url=ws_url) + + Path(STATE_FILE).write_text(json.dumps({ + "ws_url": ws_url, "number": num.number, "number_id": str(num.id), "handle": handle, + })) + log.info("state written to %s", STATE_FILE) + + try: + listener.wait() + finally: + # Leave the number as we found it so other suites aren't affected. + try: + client.phone_numbers.update(num.id, incoming_call_action=prev_action or "auto_reject") + except Exception as exc: # noqa: BLE001 + log.info("number revert failed: %r", exc) + listener.close() + server.should_exit = True + + +if __name__ == "__main__": + main()