diff --git a/.dev.vars.example b/.dev.vars.example index 5fc7dca6f..86c402367 100644 --- a/.dev.vars.example +++ b/.dev.vars.example @@ -1,42 +1,40 @@ # Copy this to .dev.vars and fill in your values # .dev.vars is gitignored and used by wrangler dev -# AI Provider (at least one required) -ANTHROPIC_API_KEY=sk-ant-... -# OPENAI_API_KEY=sk-... - -# Cloudflare AI Gateway (alternative to direct provider keys) -# CLOUDFLARE_AI_GATEWAY_API_KEY=your-provider-api-key -# CF_AI_GATEWAY_ACCOUNT_ID=your-account-id -# CF_AI_GATEWAY_GATEWAY_ID=your-gateway-id -# CF_AI_GATEWAY_MODEL=workers-ai/@cf/meta/llama-3.3-70b-instruct-fp8-fast - -# Legacy AI Gateway (still supported) -# AI_GATEWAY_API_KEY=your-key -# AI_GATEWAY_BASE_URL=https://gateway.ai.cloudflare.com/v1/{account_id}/{gateway_id}/anthropic - -# Local development mode - skips Cloudflare Access auth and bypasses device pairing +# ────────────────────────────────────────────── +# Mode A: Quick dev (skip login, straight to chat) +# ────────────────────────────────────────────── +DEV_MODE=true + +# ────────────────────────────────────────────── +# Mode B: Full Poe login flow (paste API key, get chat) +# Comment out DEV_MODE and uncomment the session secrets below. +# ────────────────────────────────────────────── # DEV_MODE=true - -# E2E test mode - skips Cloudflare Access auth but keeps device pairing enabled -# Use this for automated tests that need to test the real pairing flow -# E2E_TEST_MODE=true +# SESSION_SECRET=dev-session-secret-32-chars-minimum +# ENCRYPTION_SECRET=dev-encrypt-secret-32-chars-minimum # Enable debug routes at /debug/* (optional) # DEBUG_ROUTES=true -# Optional - set a fixed token instead of auto-generated +# Gateway token (any string works for local dev) MOLTBOT_GATEWAY_TOKEN=dev-token-change-in-prod -# Cloudflare Access configuration for /_admin and /api routes -# Required for admin UI and device pairing API in production -# CF_ACCESS_TEAM_DOMAIN=myteam.cloudflareaccess.com -# CF_ACCESS_AUD=your-application-audience-tag +# AI Provider (optional - users bring their own Poe keys in production) +# For local dev without Poe, set one of these: +# ANTHROPIC_API_KEY=sk-ant-... +# OPENAI_API_KEY=sk-... + +# Cloudflare AI Gateway (alternative to direct provider keys) +# CLOUDFLARE_AI_GATEWAY_API_KEY=your-provider-api-key +# CF_AI_GATEWAY_ACCOUNT_ID=your-account-id +# CF_AI_GATEWAY_GATEWAY_ID=your-gateway-id +# CF_AI_GATEWAY_MODEL=workers-ai/@cf/meta/llama-3.3-70b-instruct-fp8-fast # Optional chat channels # TELEGRAM_BOT_TOKEN=optional # DISCORD_BOT_TOKEN=optional -# CDP (Chrome DevTools Protocol) configuration for browser automation +# CDP (Chrome DevTools Protocol) for browser automation # CDP_SECRET=shared-secret-for-cdp-auth # WORKER_URL=https://your-worker.example.com diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..f7b343880 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,67 @@ +# Claude Code Instructions + +Guidelines for Claude Code when working on this codebase. + +## Project + +PoeClaw — a Cloudflare Worker that runs OpenClaw AI assistant in a Cloudflare Sandbox container. See [README.md](./README.md) for user-facing docs, [AGENTS.md](./AGENTS.md) for technical agent instructions. + +## Philosophy + +Read [`docs/PHILOSOPHY.md`](./docs/PHILOSOPHY.md) before making changes. It defines our layered defense strategy for quality: + +- **Layer 0: CI Guardrails** — lint, format, typecheck must pass. No `--no-verify`. +- **Layer 1: Types** — minimize `as any`, use exhaustive pattern matching, share types across server/client. +- **Layer 2: Unit Tests** — test pure logic and security boundaries. Don't test route registration or HTML templates. +- **Layer 3: Property Tests** — use for combinatorial config (`buildEnvVars`), security code, classification logic. +- **Layer 4: Contract Tests** — define API boundary promises before implementing. +- **Layer 5: Integration Tests** — real dependencies for boundary behavior. Mocks are necessary lies. +- **Layer 6: Deploy Safety** — health checks and post-deploy verification. + +### Key Principles + +1. **Test behavior, not implementation.** Tests verify *what* code promises, not *how* it works. +2. **Security code gets the full treatment.** Auth, credentials, secrets — no shortcuts. +3. **Mocks are necessary lies.** Use for fast feedback on pure logic. Don't trust for boundary behavior. +4. **Pragmatism is not laziness.** Use the Severity x Detectability matrix: test where severity is high and detectability is low. + +## Commands + +Use `make` as the primary interface. Run `make help` to see all targets. + +```bash +make check # Run ALL checks: typecheck, lint, format, test (use before completing work) +make test # Run tests (vitest) +make test-watch # Watch mode +make typecheck # TypeScript strict check +make lint # Run linter (oxlint) +make fix # Auto-fix: lint + format +make build # Build worker + client +make deploy # Build and deploy to Cloudflare +make dev # Clean start: kill stale containers, clear state, run dev server +make clean # Kill sandbox containers and remove wrangler state +make status # Show sandbox containers, images, and listening ports +``` + +
+npm equivalents (for reference) + +```bash +npm test # Run tests +npm run test:watch # Watch mode +npm run typecheck # TypeScript strict check +npm run lint # oxlint +npm run format:check # oxfmt +npm run build # Build worker + client +npm run deploy # Build and deploy to Cloudflare +``` +
+ +## Working Here + +- Read [AGENTS.md](./AGENTS.md) for project structure, patterns, and common tasks. +- Read [CONTRIBUTING.md](./CONTRIBUTING.md) for contribution rules and AI policy. +- Tests are colocated: `foo.ts` → `foo.test.ts` in the same directory. +- Use the existing mock infrastructure in `src/test-utils.ts` for new tests. +- When adding a test, include a threat-model comment: what failure mode does it catch? +- Run `make check` before considering work complete. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 16ccfceb7..5a3c62809 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -6,7 +6,7 @@ We welcome contributions, but with a few short rules: - **You cannot be offended if we close a PR** or otherwise decide not to merge your work. We're maintaining Moltworker for (many, many) others, and we're ultimately the ones that have to maintain the code. This is especially true if we believe your PR to be AI driven without any human-in-the-loop review or explanation. Not all ideas or work makes it. If it's critical to your workflow, you can fork it! -- **Demonstrate that you've tested your work** - whether via manual testing, automated tests, or a mix of both. You may be quizzed here. +- **Demonstrate that you've tested your work** - run `make check` (typecheck, lint, format, tests) before submitting. You may be quizzed here. ## AI Contributions diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..1b7b9745b --- /dev/null +++ b/Makefile @@ -0,0 +1,122 @@ +.DEFAULT_GOAL := help +.PHONY: help dev dev-fast clean nuke build \ + check test test-watch test-cov typecheck lint fix \ + deploy logs secret secrets status \ + docker-build docker-shell docker-logs docker-prune + +# ────────────────────────────────────────────── +# Help +# ────────────────────────────────────────────── + +help: ## Show all available commands + @echo "Usage: make " + @echo "" + @grep -E '^[a-zA-Z_-]+:.*##' $(MAKEFILE_LIST) \ + | awk -F ':.*## ' '{ printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 }' + +# ────────────────────────────────────────────── +# Core Dev Workflow +# ────────────────────────────────────────────── + +dev: clean ## Clean start: kill stale containers, clear state, run dev server + @npx wrangler dev + +dev-fast: ## Skip cleanup, just run wrangler dev + @npx wrangler dev + +clean: ## Kill sandbox containers and remove wrangler state + @echo "Stopping sandbox containers..." + @-docker ps -q --filter ancestor=cloudflare-dev/sandbox | xargs -r docker kill 2>/dev/null + @-docker ps -q --filter name=sandbox | xargs -r docker kill 2>/dev/null + @-docker ps -q | xargs -r -I{} sh -c 'docker inspect --format="{{.Id}} {{.Config.Image}}" {} | grep sandbox | cut -d" " -f1' | xargs -r docker kill 2>/dev/null + @echo "Removing wrangler state..." + @rm -rf .wrangler/state + @echo "Clean complete." + +nuke: clean ## Full reset: clean + remove dist, node_modules, images, reinstall + @echo "Removing build artifacts..." + @rm -rf dist/ + @echo "Removing node_modules..." + @rm -rf node_modules/ + @echo "Removing sandbox docker images..." + @-docker images --format '{{.Repository}}:{{.Tag}} {{.ID}}' | grep sandbox | awk '{print $$2}' | xargs -r docker rmi -f 2>/dev/null + @echo "Reinstalling dependencies..." + @npm install + @echo "Nuke complete." + +build: ## Build worker + client + @npx vite build + +# ────────────────────────────────────────────── +# Quality & Testing +# ────────────────────────────────────────────── + +check: typecheck lint fix-format-check test ## Run ALL checks: typecheck, lint, format-check, test (mirrors CI) + +fix-format-check: + @npx oxfmt --check src/ + +test: ## Run tests + @npx vitest run + +test-watch: ## Run tests in watch mode + @npx vitest + +test-cov: ## Run tests with coverage + @npx vitest run --coverage + +typecheck: ## TypeScript strict check + @npx tsc --noEmit + +lint: ## Run linter + @npx oxlint src/ + +fix: ## Auto-fix: lint + format + @npx oxlint --fix src/ + @npx oxfmt --write src/ + +# ────────────────────────────────────────────── +# Deployment & Ops +# ────────────────────────────────────────────── + +deploy: build ## Build and deploy to Cloudflare + @npx wrangler deploy + +logs: ## Tail production logs + @npx wrangler tail + +secret: ## Set a secret (usage: make secret KEY=FOO VALUE=bar) + @echo "$(VALUE)" | npx wrangler secret put $(KEY) + +secrets: ## List all secrets + @npx wrangler secret list + +status: ## Show sandbox containers, images, and listening ports + @echo "=== Sandbox Containers ===" + @docker ps --filter name=sandbox --format 'table {{.ID}}\t{{.Image}}\t{{.Status}}\t{{.Ports}}' 2>/dev/null || true + @echo "" + @echo "=== Sandbox Images ===" + @docker images --format 'table {{.Repository}}\t{{.Tag}}\t{{.Size}}' | grep -E 'REPOSITORY|sandbox' || true + @echo "" + @echo "=== Listening Ports (8787, 5173, 18789) ===" + @lsof -iTCP:8787 -iTCP:5173 -iTCP:18789 -sTCP:LISTEN -P -n 2>/dev/null || echo " (none)" + +# ────────────────────────────────────────────── +# Docker / Container Helpers +# ────────────────────────────────────────────── + +docker-build: ## Force rebuild sandbox image (no cache) + @docker build --no-cache -t cloudflare-dev/sandbox:local . + +docker-shell: ## Shell into running sandbox container + @docker exec -it $$(docker ps --filter name=sandbox -q | head -1) bash + +docker-logs: ## Tail logs from running sandbox container + @docker logs -f $$(docker ps --filter name=sandbox -q | head -1) + +docker-prune: ## Remove old/dangling sandbox images + @echo "Removing dangling images..." + @-docker image prune -f + @echo "Removing old sandbox images..." + @-docker images --format '{{.Repository}}:{{.Tag}} {{.ID}}' | grep sandbox | awk '{print $$2}' | xargs -r docker rmi -f 2>/dev/null + @echo "Prune complete." diff --git a/README.md b/README.md index ea82f03af..eb63fd102 100644 --- a/README.md +++ b/README.md @@ -1,281 +1,172 @@ -# OpenClaw on Cloudflare Workers +# PoeClaw -Run [OpenClaw](https://github.com/openclaw/openclaw) (formerly Moltbot, formerly Clawdbot) personal AI assistant in a [Cloudflare Sandbox](https://developers.cloudflare.com/sandbox/). +Multi-tenant [OpenClaw](https://github.com/openclaw/openclaw) platform powered by [Poe API keys](https://poe.com/api_key), running in [Cloudflare Sandbox](https://developers.cloudflare.com/sandbox/) containers. -![moltworker architecture](./assets/logo.png) +![PoeClaw logo](./assets/logo.png) -> **Experimental:** This is a proof of concept demonstrating that OpenClaw can run in Cloudflare Sandbox. It is not officially supported and may break without notice. Use at your own risk. +> **Experimental:** Proof of concept. Not officially supported — may break without notice. -[![Deploy to Cloudflare](https://deploy.workers.cloudflare.com/button)](https://deploy.workers.cloudflare.com/?url=https://github.com/cloudflare/moltworker) +## How It Works + +1. You deploy PoeClaw to Cloudflare Workers +2. Share the URL with your team +3. Each person enters their own [Poe API key](https://poe.com/api_key) +4. They get a private sandbox with access to all models on their Poe subscription +5. Each user's data and API key are isolated — separate containers, separate encrypted storage ## Requirements -- [Workers Paid plan](https://www.cloudflare.com/plans/developer-platform/) ($5 USD/month) — required for Cloudflare Sandbox containers -- [Anthropic API key](https://console.anthropic.com/) — for Claude access, or you can use AI Gateway's [Unified Billing](https://developers.cloudflare.com/ai-gateway/features/unified-billing/) +- [Workers Paid plan](https://www.cloudflare.com/plans/developer-platform/) ($5 USD/month) — required for Cloudflare Sandbox +- Each user needs a [Poe API key](https://poe.com/api_key) — provides access to Claude, GPT, Gemini, and other models through Poe's subscription The following Cloudflare features used by this project have free tiers: -- Cloudflare Access (authentication) - Browser Rendering (for browser navigation) - AI Gateway (optional, for API routing/analytics) - R2 Storage (optional, for persistence) -## Container Cost Estimate - -This project uses a `standard-1` Cloudflare Container instance (1/2 vCPU, 4 GiB memory, 8 GB disk). Below are approximate monthly costs assuming the container runs 24/7, based on [Cloudflare Containers pricing](https://developers.cloudflare.com/containers/pricing/): +## Cost Estimate -| Resource | Provisioned | Monthly Usage | Included Free | Overage | Approx. Cost | -|----------|-------------|---------------|---------------|---------|--------------| -| Memory | 4 GiB | 2,920 GiB-hrs | 25 GiB-hrs | 2,895 GiB-hrs | ~$26/mo | -| CPU (at ~10% utilization) | 1/2 vCPU | ~2,190 vCPU-min | 375 vCPU-min | ~1,815 vCPU-min | ~$2/mo | -| Disk | 8 GB | 5,840 GB-hrs | 200 GB-hrs | 5,640 GB-hrs | ~$1.50/mo | -| Workers Paid plan | | | | | $5/mo | -| **Total** | | | | | **~$34.50/mo** | +PoeClaw uses `basic` Cloudflare Container instances (1 vCPU, 1 GiB memory, 2 GB disk) with a default sleep timeout of 1 hour. This keeps per-user costs low for multi-tenant deployments. -Notes: -- CPU is billed on **active usage only**, not provisioned capacity. The 10% utilization estimate is a rough baseline for a lightly-used personal assistant; your actual cost will vary with usage. -- Memory and disk are billed on **provisioned capacity** for the full time the container is running. -- To reduce costs, configure `SANDBOX_SLEEP_AFTER` (e.g., `10m`) so the container sleeps when idle. A container that only runs 4 hours/day would cost roughly ~$5-6/mo in compute on top of the $5 plan fee. -- Network egress, Workers/Durable Objects requests, and logs are additional but typically minimal for personal use. -- See the [instance types table](https://developers.cloudflare.com/containers/pricing/) for other options (e.g., `lite` at 256 MiB/$0.50/mo memory or `standard-4` at 12 GiB for heavier workloads). +| Resource | Per User/Month (1h sleep) | Notes | +|----------|--------------------------|-------| +| Memory | ~$0.50 | 1 GiB, billed only while awake | +| CPU | ~$0.20 | Billed on active usage (~10% utilization) | +| Disk | ~$0.22 | 2 GB provisioned | +| **Per-user total** | **~$0.92** | Assumes ~4 hrs active/day | -## What is OpenClaw? +Plus the $5/month Workers Paid plan (flat, not per-user). A team of 10 costs roughly **$14/month** total. -[OpenClaw](https://github.com/openclaw/openclaw) (formerly Moltbot, formerly Clawdbot) is a personal AI assistant with a gateway architecture that connects to multiple chat platforms. Key features: +To adjust the sleep timeout, set `SANDBOX_SLEEP_AFTER` (e.g., `10m`, `1h`, `never`). The default is `1h`. Setting `never` keeps containers alive indefinitely but increases cost. -- **Control UI** - Web-based chat interface at the gateway -- **Multi-channel support** - Telegram, Discord, Slack -- **Device pairing** - Secure DM authentication requiring explicit approval -- **Persistent conversations** - Chat history and context across sessions -- **Agent runtime** - Extensible AI capabilities with workspace and skills - -This project packages OpenClaw to run in a [Cloudflare Sandbox](https://developers.cloudflare.com/sandbox/) container, providing a fully managed, always-on deployment without needing to self-host. Optional R2 storage enables persistence across container restarts. - -## Architecture - -![moltworker architecture](./assets/architecture.png) +See the [instance types table](https://developers.cloudflare.com/containers/pricing/) for other options. ## Quick Start -_Cloudflare Sandboxes are available on the [Workers Paid plan](https://dash.cloudflare.com/?to=/:account/workers/plans)._ - ```bash -# Install dependencies +# Clone and install +git clone https://github.com/garethpaul/poeclaw +cd poeclaw npm install -# Set your API key (direct Anthropic access) -npx wrangler secret put ANTHROPIC_API_KEY +# Generate session secrets (required for multi-tenant auth) +echo "$(openssl rand -hex 32)" | npx wrangler secret put SESSION_SECRET +echo "$(openssl rand -hex 32)" | npx wrangler secret put ENCRYPTION_SECRET -# Or use Cloudflare AI Gateway instead (see "Optional: Cloudflare AI Gateway" below) -# npx wrangler secret put CLOUDFLARE_AI_GATEWAY_API_KEY -# npx wrangler secret put CF_AI_GATEWAY_ACCOUNT_ID -# npx wrangler secret put CF_AI_GATEWAY_GATEWAY_ID - -# Generate and set a gateway token (required for remote access) -# Save this token - you'll need it to access the Control UI -export MOLTBOT_GATEWAY_TOKEN=$(openssl rand -hex 32) -echo "Your gateway token: $MOLTBOT_GATEWAY_TOKEN" -echo "$MOLTBOT_GATEWAY_TOKEN" | npx wrangler secret put MOLTBOT_GATEWAY_TOKEN +# Generate gateway token (secures Worker-to-container communication) +echo "$(openssl rand -hex 32)" | npx wrangler secret put MOLTBOT_GATEWAY_TOKEN # Deploy -npm run deploy +make deploy ``` -After deploying, open the Control UI with your token: +> Run `make help` to see all available commands. + +After deploying, visit your worker URL: ``` -https://your-worker.workers.dev/?token=YOUR_GATEWAY_TOKEN +https://poeclaw.your-subdomain.workers.dev ``` -Replace `your-worker` with your actual worker subdomain and `YOUR_GATEWAY_TOKEN` with the token you generated above. - -**Note:** The first request may take 1-2 minutes while the container starts. - -> **Important:** You will not be able to use the Control UI until you complete the following steps. You MUST: -> 1. [Set up Cloudflare Access](#setting-up-the-admin-ui) to protect the admin UI -> 2. [Pair your device](#device-pairing) via the admin UI at `/_admin/` - -You'll also likely want to [enable R2 storage](#persistent-storage-r2) so your paired devices and conversation history persist across container restarts (optional but recommended). - -## Setting Up the Admin UI - -To use the admin UI at `/_admin/` for device management, you need to: -1. Enable Cloudflare Access on your worker -2. Set the Access secrets so the worker can validate JWTs +You'll see a login page. Enter a [Poe API key](https://poe.com/api_key) to start chatting. -### 1. Enable Cloudflare Access on workers.dev +**Note:** The first request after login may take 1-2 minutes while the per-user container starts. -The easiest way to protect your worker is using the built-in Cloudflare Access integration for workers.dev: - -1. Go to the [Workers & Pages dashboard](https://dash.cloudflare.com/?to=/:account/workers-and-pages) -2. Select your Worker (e.g., `moltbot-sandbox`) -3. In **Settings**, under **Domains & Routes**, in the `workers.dev` row, click the meatballs menu (`...`) -4. Click **Enable Cloudflare Access** -5. Copy the values shown in the dialog (you'll need the AUD tag later). **Note:** The "Manage Cloudflare Access" link in the dialog may 404 — ignore it. -6. To configure who can access, go to **Zero Trust** in the Cloudflare dashboard sidebar → **Access** → **Applications**, and find your worker's application: - - Add your email address to the allow list - - Or configure other identity providers (Google, GitHub, etc.) -7. Copy the **Application Audience (AUD)** tag from the Access application settings. This will be your `CF_ACCESS_AUD` in Step 2 below - -### 2. Set Access Secrets +## Architecture -After enabling Cloudflare Access, set the secrets so the worker can validate JWTs: +PoeClaw adds a multi-tenant session layer on top of OpenClaw: -```bash -# Your Cloudflare Access team domain (e.g., "myteam.cloudflareaccess.com") -npx wrangler secret put CF_ACCESS_TEAM_DOMAIN - -# The Application Audience (AUD) tag from your Access application that you copied in the step above -npx wrangler secret put CF_ACCESS_AUD +``` +User (browser) + │ + ├─ POST /api/auth/login → Validates Poe API key, creates session cookie + │ Encrypts key with AES-GCM, stores in per-user sandbox + │ + ├─ Chat UI (React) → Model selector, SSE streaming, dark theme + │ + └─ /v1/chat/completions → Proxied to per-user OpenClaw gateway + Each user gets their own Sandbox container + resolved by: getSandbox(env.Sandbox, userHash) ``` -You can find your team domain in the [Zero Trust Dashboard](https://one.dash.cloudflare.com/) under **Settings** > **Custom Pages** (it's the subdomain before `.cloudflareaccess.com`). +### Security -### 3. Redeploy +- **Session cookies**: HMAC-SHA256 signed, `HttpOnly; Secure; SameSite=Lax`, 24h expiry +- **API key storage**: AES-GCM encrypted with random IV, stored in per-user sandbox +- **User isolation**: Each user gets a separate Durable Object / container, keyed by SHA-256 hash of their API key +- **Rate limiting**: 10 login attempts per IP per minute +- **CSP headers**: `default-src 'self'; style-src 'self' 'unsafe-inline'; frame-ancestors 'none'`, plus `X-Frame-Options: DENY`, `X-Content-Type-Options: nosniff` +- **R2 namespacing**: Per-user paths (`users/{userHash}/openclaw/`) prevent cross-tenant data access -```bash -npm run deploy -``` - -Now visit `/_admin/` and you'll be prompted to authenticate via Cloudflare Access before accessing the admin UI. +## Authentication -### Alternative: Manual Access Application +PoeClaw uses **Poe API key authentication** instead of Cloudflare Access: -If you prefer more control, you can manually create an Access application: +1. User enters their Poe API key on the login page +2. Key is validated against `api.poe.com/v1/models` — the available models list is returned +3. A session cookie is set (HMAC-SHA256 signed, 24h TTL) +4. The API key is AES-GCM encrypted and stored in the user's sandbox +5. On subsequent requests, the session cookie is verified and the per-user sandbox is resolved -1. Go to [Cloudflare Zero Trust Dashboard](https://one.dash.cloudflare.com/) -2. Navigate to **Access** > **Applications** -3. Create a new **Self-hosted** application -4. Set the application domain to your Worker URL (e.g., `moltbot-sandbox.your-subdomain.workers.dev`) -5. Add paths to protect: `/_admin/*`, `/api/*`, `/debug/*` -6. Configure your desired identity providers (e.g., email OTP, Google, GitHub) -7. Copy the **Application Audience (AUD)** tag and set the secrets as shown above +No Cloudflare Access setup is needed. Each user authenticates with their own Poe key. ### Local Development -For local development, create a `.dev.vars` file with: +Start the dev server with `make dev` (cleans stale containers first) or `make dev-fast` (skip cleanup): ```bash -DEV_MODE=true # Skip Cloudflare Access auth + bypass device pairing -DEBUG_ROUTES=true # Enable /debug/* routes (optional) +make dev ``` -## Authentication - -By default, moltbot uses **device pairing** for authentication. When a new device (browser, CLI, etc.) connects, it must be approved via the admin UI at `/_admin/`. +Create a `.dev.vars` file for local config: -### Device Pairing - -1. A device connects to the gateway -2. The connection is held pending until approved -3. An admin approves the device via `/_admin/` -4. The device is now paired and can connect freely - -This is the most secure option as it requires explicit approval for each device. - -### Gateway Token (Required) - -A gateway token is required to access the Control UI when hosted remotely. Pass it as a query parameter: - -``` -https://your-worker.workers.dev/?token=YOUR_TOKEN -wss://your-worker.workers.dev/ws?token=YOUR_TOKEN +```bash +DEV_MODE=true # Skip session auth, use a single sandbox +DEBUG_ROUTES=true # Enable /debug/* routes (optional) ``` -**Note:** Even with a valid token, new devices still require approval via the admin UI at `/_admin/` (see Device Pairing above). - -For local development only, set `DEV_MODE=true` in `.dev.vars` to skip Cloudflare Access authentication and enable `allowInsecureAuth` (bypasses device pairing entirely). - ## Persistent Storage (R2) -By default, moltbot data (configs, paired devices, conversation history) is lost when the container restarts. To enable persistent storage across sessions, configure R2: +By default, user data is lost when a container sleeps. To enable persistence, configure R2: ### 1. Create R2 API Token 1. Go to **R2** > **Overview** in the [Cloudflare Dashboard](https://dash.cloudflare.com/) 2. Click **Manage R2 API Tokens** -3. Create a new token with **Object Read & Write** permissions +3. Create a token with **Object Read & Write** permissions 4. Select the `moltbot-data` bucket (created automatically on first deploy) 5. Copy the **Access Key ID** and **Secret Access Key** ### 2. Set Secrets ```bash -# R2 Access Key ID npx wrangler secret put R2_ACCESS_KEY_ID - -# R2 Secret Access Key npx wrangler secret put R2_SECRET_ACCESS_KEY - -# Your Cloudflare Account ID npx wrangler secret put CF_ACCOUNT_ID ``` -To find your Account ID: Go to the [Cloudflare Dashboard](https://dash.cloudflare.com/), click the three dots menu next to your account name, and select "Copy Account ID". - ### How It Works -R2 storage uses a backup/restore approach for simplicity: - -**On container startup:** -- If R2 is mounted and contains backup data, it's restored to the moltbot config directory -- OpenClaw uses its default paths (no special configuration needed) - -**During operation:** -- A cron job runs every 5 minutes to sync the moltbot config to R2 -- You can also trigger a manual backup from the admin UI at `/_admin/` - -**In the admin UI:** -- When R2 is configured, you'll see "Last backup: [timestamp]" -- Click "Backup Now" to trigger an immediate sync - -Without R2 credentials, moltbot still works but uses ephemeral storage (data lost on container restart). - -## Container Lifecycle - -By default, the sandbox container stays alive indefinitely (`SANDBOX_SLEEP_AFTER=never`). This is recommended because cold starts take 1-2 minutes. - -To reduce costs for infrequently used deployments, you can configure the container to sleep after a period of inactivity: - -```bash -npx wrangler secret put SANDBOX_SLEEP_AFTER -# Enter: 10m (or 1h, 30m, etc.) -``` - -When the container sleeps, the next request will trigger a cold start. If you have R2 storage configured, your paired devices and data will persist across restarts. - -## Admin UI - -![admin ui](./assets/adminui.png) - -Access the admin UI at `/_admin/` to: -- **R2 Storage Status** - Shows if R2 is configured, last backup time, and a "Backup Now" button -- **Restart Gateway** - Kill and restart the moltbot gateway process -- **Device Pairing** - View pending requests, approve devices individually or all at once, view paired devices +Each user's data is stored under `users/{userHash}/openclaw/` in R2, preventing cross-tenant access. -The admin UI requires Cloudflare Access authentication (or `DEV_MODE=true` for local development). - -## Debug Endpoints - -Debug endpoints are available at `/debug/*` when enabled (requires `DEBUG_ROUTES=true` and Cloudflare Access): - -- `GET /debug/processes` - List all container processes -- `GET /debug/logs?id=` - Get logs for a specific process -- `GET /debug/version` - Get container and moltbot version info +- **On container startup:** Data is restored from R2 to the container +- **During operation:** A sync loop watches for file changes and backs up to R2 +- **Without R2:** PoeClaw still works, but data is lost when the container sleeps ## Optional: Chat Channels +Chat platform tokens are set as worker-level secrets and shared across all user instances. Per-user channel configuration is not currently supported. + ### Telegram ```bash npx wrangler secret put TELEGRAM_BOT_TOKEN -npm run deploy ``` ### Discord ```bash npx wrangler secret put DISCORD_BOT_TOKEN -npm run deploy ``` ### Slack @@ -283,201 +174,97 @@ npm run deploy ```bash npx wrangler secret put SLACK_BOT_TOKEN npx wrangler secret put SLACK_APP_TOKEN -npm run deploy ``` ## Optional: Browser Automation (CDP) -This worker includes a Chrome DevTools Protocol (CDP) shim that enables browser automation capabilities. This allows OpenClaw to control a headless browser for tasks like web scraping, screenshots, and automated testing. - -### Setup - -1. Set a shared secret for authentication: +The worker includes a Chrome DevTools Protocol (CDP) shim for browser automation (screenshots, scraping, etc.). ```bash -npx wrangler secret put CDP_SECRET -# Enter a secure random string +npx wrangler secret put CDP_SECRET # Shared secret for authentication +npx wrangler secret put WORKER_URL # https://poeclaw.your-subdomain.workers.dev +make deploy ``` -2. Set your worker's public URL: - -```bash -npx wrangler secret put WORKER_URL -# Enter: https://your-worker.workers.dev -``` - -3. Redeploy: - -```bash -npm run deploy -``` - -### Endpoints - | Endpoint | Description | |----------|-------------| -| `GET /cdp/json/version` | Browser version information | -| `GET /cdp/json/list` | List available browser targets | -| `GET /cdp/json/new` | Create a new browser target | -| `WS /cdp/devtools/browser/{id}` | WebSocket connection for CDP commands | - -All endpoints require authentication via the `?secret=` query parameter. - -## Built-in Skills - -The container includes pre-installed skills in `/root/clawd/skills/`: - -### cloudflare-browser - -Browser automation via the CDP shim. Requires `CDP_SECRET` and `WORKER_URL` to be set (see [Browser Automation](#optional-browser-automation-cdp) above). +| `GET /cdp/json/version` | Browser version info | +| `GET /cdp/json/list` | List browser targets | +| `GET /cdp/json/new` | Create new target | +| `WS /cdp/devtools/browser/{id}` | WebSocket CDP connection | -**Scripts:** -- `screenshot.js` - Capture a screenshot of a URL -- `video.js` - Create a video from multiple URLs -- `cdp-client.js` - Reusable CDP client library - -**Usage:** -```bash -# Screenshot -node /root/clawd/skills/cloudflare-browser/scripts/screenshot.js https://example.com output.png - -# Video from multiple URLs -node /root/clawd/skills/cloudflare-browser/scripts/video.js "https://site1.com,https://site2.com" output.mp4 --scroll -``` - -See `skills/cloudflare-browser/SKILL.md` for full documentation. +All endpoints require `?secret=`. ## Optional: Cloudflare AI Gateway -You can route API requests through [Cloudflare AI Gateway](https://developers.cloudflare.com/ai-gateway/) for caching, rate limiting, analytics, and cost tracking. OpenClaw has native support for Cloudflare AI Gateway as a first-class provider. - -AI Gateway acts as a proxy between OpenClaw and your AI provider (e.g., Anthropic). Requests are sent to `https://gateway.ai.cloudflare.com/v1/{account_id}/{gateway_id}/anthropic` instead of directly to `api.anthropic.com`, giving you Cloudflare's analytics, caching, and rate limiting. You still need a provider API key (e.g., your Anthropic API key) — the gateway forwards it to the upstream provider. - -### Setup - -1. Create an AI Gateway in the [AI Gateway section](https://dash.cloudflare.com/?to=/:account/ai/ai-gateway/create-gateway) of the Cloudflare Dashboard. -2. Set the three required secrets: - -```bash -# Your AI provider's API key (e.g., your Anthropic API key). -# This is passed through the gateway to the upstream provider. -npx wrangler secret put CLOUDFLARE_AI_GATEWAY_API_KEY - -# Your Cloudflare account ID -npx wrangler secret put CF_AI_GATEWAY_ACCOUNT_ID - -# Your AI Gateway ID (from the gateway overview page) -npx wrangler secret put CF_AI_GATEWAY_GATEWAY_ID -``` - -All three are required. OpenClaw constructs the gateway URL from the account ID and gateway ID, and passes the API key to the upstream provider through the gateway. - -3. Redeploy: +Route API requests through [Cloudflare AI Gateway](https://developers.cloudflare.com/ai-gateway/) for caching, rate limiting, and analytics. ```bash -npm run deploy +npx wrangler secret put CLOUDFLARE_AI_GATEWAY_API_KEY # Your provider's API key +npx wrangler secret put CF_AI_GATEWAY_ACCOUNT_ID # Cloudflare account ID +npx wrangler secret put CF_AI_GATEWAY_GATEWAY_ID # AI Gateway ID +make deploy ``` -When Cloudflare AI Gateway is configured, it takes precedence over direct `ANTHROPIC_API_KEY` or `OPENAI_API_KEY`. - -### Choosing a Model - -By default, AI Gateway uses Anthropic's Claude Sonnet 4.5. To use a different model or provider, set `CF_AI_GATEWAY_MODEL` with the format `provider/model-id`: - -```bash -npx wrangler secret put CF_AI_GATEWAY_MODEL -# Enter: workers-ai/@cf/meta/llama-3.3-70b-instruct-fp8-fast -``` - -This works with any [AI Gateway provider](https://developers.cloudflare.com/ai-gateway/usage/providers/): - -| Provider | Example `CF_AI_GATEWAY_MODEL` value | API key is... | -|----------|-------------------------------------|---------------| -| Workers AI | `workers-ai/@cf/meta/llama-3.3-70b-instruct-fp8-fast` | Cloudflare API token | -| OpenAI | `openai/gpt-4o` | OpenAI API key | -| Anthropic | `anthropic/claude-sonnet-4-5` | Anthropic API key | -| Groq | `groq/llama-3.3-70b` | Groq API key | - -**Note:** `CLOUDFLARE_AI_GATEWAY_API_KEY` must match the provider you're using — it's your provider's API key, forwarded through the gateway. You can only use one provider at a time through the gateway. For multiple providers, use direct keys (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`) alongside the gateway config. - -#### Workers AI with Unified Billing - -With [Unified Billing](https://developers.cloudflare.com/ai-gateway/features/unified-billing/), you can use Workers AI models without a separate provider API key — Cloudflare bills you directly. Set `CLOUDFLARE_AI_GATEWAY_API_KEY` to your [AI Gateway authentication token](https://developers.cloudflare.com/ai-gateway/configuration/authentication/) (the `cf-aig-authorization` token). - -### Legacy AI Gateway Configuration - -The previous `AI_GATEWAY_API_KEY` + `AI_GATEWAY_BASE_URL` approach is still supported for backward compatibility but is deprecated in favor of the native configuration above. +When configured, AI Gateway takes precedence over direct API keys. See [AI Gateway docs](https://developers.cloudflare.com/ai-gateway/) for provider options. ## All Secrets Reference | Secret | Required | Description | |--------|----------|-------------| -| `CLOUDFLARE_AI_GATEWAY_API_KEY` | Yes* | Your AI provider's API key, passed through the gateway (e.g., your Anthropic API key). Requires `CF_AI_GATEWAY_ACCOUNT_ID` and `CF_AI_GATEWAY_GATEWAY_ID` | -| `CF_AI_GATEWAY_ACCOUNT_ID` | Yes* | Your Cloudflare account ID (used to construct the gateway URL) | -| `CF_AI_GATEWAY_GATEWAY_ID` | Yes* | Your AI Gateway ID (used to construct the gateway URL) | -| `CF_AI_GATEWAY_MODEL` | No | Override default model: `provider/model-id` (e.g. `workers-ai/@cf/meta/llama-3.3-70b-instruct-fp8-fast`). See [Choosing a Model](#choosing-a-model) | -| `ANTHROPIC_API_KEY` | Yes* | Direct Anthropic API key (alternative to AI Gateway) | -| `ANTHROPIC_BASE_URL` | No | Direct Anthropic API base URL | -| `OPENAI_API_KEY` | No | OpenAI API key (alternative provider) | -| `AI_GATEWAY_API_KEY` | No | Legacy AI Gateway API key (deprecated, use `CLOUDFLARE_AI_GATEWAY_API_KEY` instead) | -| `AI_GATEWAY_BASE_URL` | No | Legacy AI Gateway endpoint URL (deprecated) | -| `CF_ACCESS_TEAM_DOMAIN` | Yes* | Cloudflare Access team domain (required for admin UI) | -| `CF_ACCESS_AUD` | Yes* | Cloudflare Access application audience (required for admin UI) | -| `MOLTBOT_GATEWAY_TOKEN` | Yes | Gateway token for authentication (pass via `?token=` query param) | -| `DEV_MODE` | No | Set to `true` to skip CF Access auth + device pairing (local dev only) | -| `DEBUG_ROUTES` | No | Set to `true` to enable `/debug/*` routes | -| `SANDBOX_SLEEP_AFTER` | No | Container sleep timeout: `never` (default) or duration like `10m`, `1h` | -| `R2_ACCESS_KEY_ID` | No | R2 access key for persistent storage | -| `R2_SECRET_ACCESS_KEY` | No | R2 secret key for persistent storage | -| `CF_ACCOUNT_ID` | No | Cloudflare account ID (required for R2 storage) | +| `SESSION_SECRET` | **Yes** | HMAC-SHA256 key for session cookies (32+ chars) | +| `ENCRYPTION_SECRET` | **Yes** | AES-GCM key for encrypting stored API keys (32+ chars) | +| `MOLTBOT_GATEWAY_TOKEN` | **Yes** | Token to secure the internal container gateway | +| `ANTHROPIC_API_KEY` | No | Direct Anthropic API key (alternative to Poe) | +| `OPENAI_API_KEY` | No | Direct OpenAI API key (alternative to Poe) | +| `CLOUDFLARE_AI_GATEWAY_API_KEY` | No | API key for AI Gateway | +| `CF_AI_GATEWAY_ACCOUNT_ID` | No | Cloudflare account ID for AI Gateway | +| `CF_AI_GATEWAY_GATEWAY_ID` | No | AI Gateway ID | +| `CF_AI_GATEWAY_MODEL` | No | Override model: `provider/model-id` | +| `DEV_MODE` | No | `true` to skip auth (local dev only) | +| `DEBUG_ROUTES` | No | `true` to enable `/debug/*` routes | +| `SANDBOX_SLEEP_AFTER` | No | Container sleep timeout: `1h` (default), `10m`, `never` | +| `R2_ACCESS_KEY_ID` | No | R2 access key for persistence | +| `R2_SECRET_ACCESS_KEY` | No | R2 secret key for persistence | +| `CF_ACCOUNT_ID` | No | Cloudflare account ID (for R2) | | `TELEGRAM_BOT_TOKEN` | No | Telegram bot token | -| `TELEGRAM_DM_POLICY` | No | Telegram DM policy: `pairing` (default) or `open` | | `DISCORD_BOT_TOKEN` | No | Discord bot token | -| `DISCORD_DM_POLICY` | No | Discord DM policy: `pairing` (default) or `open` | | `SLACK_BOT_TOKEN` | No | Slack bot token | | `SLACK_APP_TOKEN` | No | Slack app token | -| `CDP_SECRET` | No | Shared secret for CDP endpoint authentication (see [Browser Automation](#optional-browser-automation-cdp)) | -| `WORKER_URL` | No | Public URL of the worker (required for CDP) | +| `CDP_SECRET` | No | Shared secret for CDP authentication | +| `WORKER_URL` | No | Public URL of the worker (for CDP) | -## Security Considerations - -### Authentication Layers - -OpenClaw in Cloudflare Sandbox uses multiple authentication layers: - -1. **Cloudflare Access** - Protects admin routes (`/_admin/`, `/api/*`, `/debug/*`). Only authenticated users can manage devices. +## Debug Endpoints -2. **Gateway Token** - Required to access the Control UI. Pass via `?token=` query parameter. Keep this secret. +Available at `/debug/*` when `DEBUG_ROUTES=true` (requires valid session): -3. **Device Pairing** - Each device (browser, CLI, chat platform DM) must be explicitly approved via the admin UI before it can interact with the assistant. This is the default "pairing" DM policy. +- `GET /debug/processes` - List container processes +- `GET /debug/logs?id=` - Process logs +- `GET /debug/version` - Container and gateway version info ## Troubleshooting -**`npm run dev` fails with an `Unauthorized` error:** You need to enable Cloudflare Containers in the [Containers dashboard](https://dash.cloudflare.com/?to=/:account/workers/containers) - -**Gateway fails to start:** Check `npx wrangler secret list` and `npx wrangler tail` - -**Config changes not working:** Edit the `# Build cache bust:` comment in `Dockerfile` and redeploy +**Login fails with "Invalid API key":** Ensure your Poe API key starts with `pb-` and is valid at [poe.com/api_key](https://poe.com/api_key). -**Slow first request:** Cold starts take 1-2 minutes. Subsequent requests are faster. +**"Server configuration error" on login:** `SESSION_SECRET` or `ENCRYPTION_SECRET` is not set. Run the secret setup commands from Quick Start. -**R2 not mounting:** Check that all three R2 secrets are set (`R2_ACCESS_KEY_ID`, `R2_SECRET_ACCESS_KEY`, `CF_ACCOUNT_ID`). Note: R2 mounting only works in production, not with `wrangler dev`. +**Slow first request after login:** Cold starts take 1-2 minutes while the per-user container boots. Subsequent requests are fast. -**Access denied on admin routes:** Ensure `CF_ACCESS_TEAM_DOMAIN` and `CF_ACCESS_AUD` are set, and that your Cloudflare Access application is configured correctly. +**Data lost after inactivity:** Containers sleep after 1 hour by default. Configure R2 storage for persistence across restarts. -**Devices not appearing in admin UI:** Device list commands take 10-15 seconds due to WebSocket connection overhead. Wait and refresh. +**`npm run dev` fails with "Unauthorized":** Enable Cloudflare Containers in the [Containers dashboard](https://dash.cloudflare.com/?to=/:account/workers/containers). -**WebSocket issues in local development:** `wrangler dev` has known limitations with WebSocket proxying through the sandbox. HTTP requests work but WebSocket connections may fail. Deploy to Cloudflare for full functionality. +**WebSocket issues in local dev:** `wrangler dev` has known limitations with WebSocket proxying through the sandbox. Deploy to Cloudflare for full functionality. ## Known Issues -### Windows: Gateway fails to start with exit code 126 (permission denied) +### Windows: Gateway fails to start with exit code 126 -On Windows, Git may check out shell scripts with CRLF line endings instead of LF. This causes `start-openclaw.sh` to fail with exit code 126 inside the Linux container. Ensure your repository uses LF line endings — configure Git with `git config --global core.autocrlf input` or add a `.gitattributes` file with `* text=auto eol=lf`. See [#64](https://github.com/cloudflare/moltworker/issues/64) for details. +Git may check out shell scripts with CRLF line endings. Ensure LF line endings: `git config --global core.autocrlf input` or add `.gitattributes` with `* text=auto eol=lf`. ## Links - [OpenClaw](https://github.com/openclaw/openclaw) -- [OpenClaw Docs](https://docs.openclaw.ai/) +- [Poe API Keys](https://poe.com/api_key) - [Cloudflare Sandbox Docs](https://developers.cloudflare.com/sandbox/) -- [Cloudflare Access Docs](https://developers.cloudflare.com/cloudflare-one/policies/access/) +- [Cloudflare Containers Pricing](https://developers.cloudflare.com/containers/pricing/) diff --git a/docs/GETTING_STARTED.md b/docs/GETTING_STARTED.md new file mode 100644 index 000000000..23204258b --- /dev/null +++ b/docs/GETTING_STARTED.md @@ -0,0 +1,278 @@ +# Getting Started with PoeClaw + +This guide walks you through deploying PoeClaw from scratch. By the end, you'll have a URL you can share with your team where each person logs in with their own [Poe API key](https://poe.com/api_key) and gets a private AI assistant. + +## Prerequisites + +- **Node.js 22+** ([download](https://nodejs.org/)) +- **Cloudflare account** with [Workers Paid plan](https://www.cloudflare.com/plans/developer-platform/) ($5/month) — required for Sandbox containers +- **Wrangler CLI** — installed automatically via npm +- Each user needs a [Poe API key](https://poe.com/api_key) (starts with `pb-`) + +### Enable Cloudflare Containers + +Before your first deploy, enable Containers in your Cloudflare dashboard: + +1. Go to [Workers & Pages](https://dash.cloudflare.com/?to=/:account/workers-and-pages) +2. Click **Containers** in the sidebar +3. Enable the feature if prompted + +## Step 1: Clone and Install + +```bash +git clone https://github.com/garethpaul/poeclaw +cd poeclaw +npm install +``` + +## Step 2: Set Required Secrets + +PoeClaw needs three secrets. Generate them with `openssl` and store them via Wrangler: + +```bash +# 1. Session signing key (HMAC-SHA256 for session cookies) +echo "$(openssl rand -hex 32)" | npx wrangler secret put SESSION_SECRET + +# 2. Encryption key (AES-GCM for encrypting stored API keys) +echo "$(openssl rand -hex 32)" | npx wrangler secret put ENCRYPTION_SECRET + +# 3. Gateway token (secures Worker ↔ container communication) +echo "$(openssl rand -hex 32)" | npx wrangler secret put MOLTBOT_GATEWAY_TOKEN +``` + +Each command will prompt you to confirm. That's it for required setup — users bring their own Poe API keys, so no AI provider key is needed at the platform level. + +## Step 3: Deploy + +```bash +make deploy +``` + +This builds the React client, bundles the Worker, builds the container image, and deploys everything to Cloudflare. First deploy takes a few minutes while the container image uploads. + +> Run `make help` to see all available commands. + +Your app is now live at: + +``` +https://poeclaw..workers.dev +``` + +Find your subdomain in the [Workers dashboard](https://dash.cloudflare.com/?to=/:account/workers-and-pages) under your worker's settings. + +## Step 4: First Login + +1. Open your worker URL in a browser +2. You'll see the PoeClaw login page +3. Paste a [Poe API key](https://poe.com/api_key) (starts with `pb-`) +4. The key is validated against the Poe API — available models are fetched automatically +5. On success, a session cookie is set and you're taken to the chat interface + +**First request takes 1-2 minutes** while your per-user container boots. After that, responses are fast. + +## Step 5: Share with Your Team + +Send your team the URL. Each person: +1. Gets their own Poe API key from [poe.com/api_key](https://poe.com/api_key) +2. Visits the URL and enters their key +3. Gets a private sandbox with their own model access, chat history, and encrypted key storage + +Users are isolated — each gets a separate container keyed by a SHA-256 hash of their API key. + +--- + +## Local Development + +For local development without deploying: + +### 1. Create `.dev.vars` + +```bash +cp .dev.vars.example .dev.vars +``` + +Edit `.dev.vars`: + +```bash +DEV_MODE=true +MOLTBOT_GATEWAY_TOKEN=dev-token +``` + +`DEV_MODE=true` skips session auth and uses a single shared sandbox — no Poe key needed locally. + +### 2. Start the Worker + +```bash +make dev +``` + +This kills stale containers and clears wrangler state before starting. Use `make dev-fast` to skip cleanup. + +Open http://localhost:8787. In dev mode, you skip the login page and go straight to the chat interface. + +> **Note:** WebSocket connections have known limitations with `wrangler dev`. Deploy to Cloudflare for full functionality. + +### 3. Client-Only Dev (Optional) + +For faster React iteration with hot reload: + +```bash +npm run dev +``` + +This runs the Vite dev server on http://localhost:5173, proxying API requests to the Worker. + +--- + +## Optional: Persistent Storage (R2) + +Without R2, user data is lost when containers sleep (default: after 1 hour of inactivity). To persist chat history and settings across restarts: + +### 1. Create R2 API Token + +1. Go to **R2 > Overview** in the [Cloudflare Dashboard](https://dash.cloudflare.com/) +2. Click **Manage R2 API Tokens** +3. Create a token with **Object Read & Write** permissions for the `moltbot-data` bucket +4. Copy the **Access Key ID** and **Secret Access Key** + +### 2. Set R2 Secrets + +```bash +npx wrangler secret put R2_ACCESS_KEY_ID +# Paste your access key ID + +npx wrangler secret put R2_SECRET_ACCESS_KEY +# Paste your secret access key + +npx wrangler secret put CF_ACCOUNT_ID +# Paste your Cloudflare Account ID +# (Found at: Dashboard → any zone → Overview → API section, or URL bar) +``` + +### 3. Redeploy + +```bash +make deploy +``` + +R2 is now active. Each user's data is stored under `users/{userHash}/openclaw/` in the `moltbot-data` bucket, synced every 30 seconds via rclone inside the container. + +--- + +## Optional: Container Sleep Timeout + +Containers sleep after 1 hour of inactivity by default. To change this: + +```bash +npx wrangler secret put SANDBOX_SLEEP_AFTER +# Enter: 10m, 30m, 1h, or never +``` + +- `10m` — saves cost for rarely-used instances +- `1h` — default, good balance of cost and responsiveness +- `never` — always-on, fastest response but highest cost + +With R2 configured, data persists across sleep/wake cycles. + +--- + +## Optional: Debug Routes + +Enable debug endpoints for troubleshooting container issues: + +```bash +npx wrangler secret put DEBUG_ROUTES +# Enter: true +``` + +Then access (requires valid session): +- `GET /debug/processes` — list container processes +- `GET /debug/logs?id=` — view process logs +- `GET /debug/version` — container and gateway version info + +--- + +## Testing and Development + +```bash +# Run ALL checks (typecheck, lint, format, test) — mirrors CI +make check + +# Run tests only +make test + +# Watch mode for TDD +make test-watch + +# Type checking (TypeScript strict mode) +make typecheck + +# Lint +make lint + +# Auto-fix lint + format issues +make fix +``` + +Run `make check` before considering work complete. + +--- + +## Troubleshooting + +| Problem | Solution | +|---------|----------| +| "Invalid API key" on login | Key must start with `pb-`. Verify at [poe.com/api_key](https://poe.com/api_key) | +| "Server configuration error" | `SESSION_SECRET` or `ENCRYPTION_SECRET` not set. Re-run Step 2 | +| Slow first request (1-2 min) | Normal — container is booting. Subsequent requests are fast | +| Data lost after inactivity | Configure R2 storage (see above) or set `SANDBOX_SLEEP_AFTER=never` | +| `make dev` fails with "Unauthorized" | Enable Containers in [dashboard](https://dash.cloudflare.com/?to=/:account/workers/containers) | +| Gateway exit code 126 (Windows) | CRLF line endings. Run `git config --global core.autocrlf input` and re-clone | + +--- + +## All Secrets Reference + +| Secret | Required | Description | +|--------|----------|-------------| +| `SESSION_SECRET` | **Yes** | HMAC-SHA256 key for session cookies | +| `ENCRYPTION_SECRET` | **Yes** | AES-GCM key for encrypting stored API keys | +| `MOLTBOT_GATEWAY_TOKEN` | **Yes** | Token securing Worker-to-container communication | +| `R2_ACCESS_KEY_ID` | No | R2 access key (enables persistence) | +| `R2_SECRET_ACCESS_KEY` | No | R2 secret key (enables persistence) | +| `CF_ACCOUNT_ID` | No | Cloudflare account ID (required for R2) | +| `SANDBOX_SLEEP_AFTER` | No | Sleep timeout: `1h` (default), `10m`, `never` | +| `DEBUG_ROUTES` | No | Set to `true` for `/debug/*` endpoints | +| `DEV_MODE` | No | Set to `true` for local dev (skip auth) | +| `CDP_SECRET` | No | Shared secret for browser automation | +| `WORKER_URL` | No | Public worker URL (required for CDP) | +| `TELEGRAM_BOT_TOKEN` | No | Telegram bot integration | +| `DISCORD_BOT_TOKEN` | No | Discord bot integration | +| `SLACK_BOT_TOKEN` | No | Slack bot integration | +| `SLACK_APP_TOKEN` | No | Slack app token (required with Slack bot) | + +## Using Direct API Keys (Without Poe) + +If you want to use direct API keys instead of Poe, set them as worker secrets: + +```bash +# Anthropic +npx wrangler secret put ANTHROPIC_API_KEY + +# OpenAI +npx wrangler secret put OPENAI_API_KEY + +# Or Cloudflare AI Gateway (see README.md for details) +npx wrangler secret put CLOUDFLARE_AI_GATEWAY_API_KEY +npx wrangler secret put CF_AI_GATEWAY_ACCOUNT_ID +npx wrangler secret put CF_AI_GATEWAY_GATEWAY_ID +``` + +See the full secrets reference in [README.md](../README.md) for all available options. + +## Next Steps + +- **Add R2** for persistent storage across container restarts +- **Connect chat channels** (Telegram, Discord, Slack) — see [README.md](../README.md) +- **Enable browser automation** via CDP — see [README.md](../README.md) +- **Configure AI Gateway** for analytics and caching — see [README.md](../README.md) diff --git a/docs/PHILOSOPHY.md b/docs/PHILOSOPHY.md new file mode 100644 index 000000000..d4d37e699 --- /dev/null +++ b/docs/PHILOSOPHY.md @@ -0,0 +1,259 @@ +# Development Philosophy + +**How we build, test, and ship PoeClaw.** + +> This document emerged from a structured debate between ten competing philosophical positions on test-driven development, type safety, and deployment guardrails — each arguing for a different approach to software quality in this codebase. The positions attacked each other's weaknesses and defended their own. What follows is the consensus that survived the debate. The original position papers are preserved in [`docs/philosophy/positions/`](./philosophy/positions/) for reference. + +--- + +## The Core Insight: PoeClaw Is a Gateway + +PoeClaw is not a computation engine. It is a **proxy, orchestrator, and boundary-crossing system**. Its value is entirely in the promises it makes at its interfaces: browser clients, the Poe API, the Cloudflare Sandbox container, R2 storage, and chat platform integrations. + +This architectural fact determines everything about our quality strategy. The bugs that ship to production live at the seams between systems, not inside pure functions. Our testing and quality approach must reflect this. + +--- + +## Layered Defense + +Quality is not one thing. It is a stack of defenses, each catching different classes of defects at different costs. We invest in all layers, in order of cost-effectiveness: + +``` +Layer 0: CI Guardrails ........... cheapest, broadest, always on +Layer 1: Type System ............. compile-time structural safety +Layer 2: Unit Tests .............. pure logic verification +Layer 3: Property Tests .......... combinatorial & security invariants +Layer 4: Contract Tests .......... API boundary verification +Layer 5: Integration Tests ....... real dependency behavior +Layer 6: Deploy Safety ........... production health verification +``` + +Lower layers are prerequisites for higher layers. There is no value running integration tests against code that doesn't type-check. There is no value deploying code that fails contract tests. + +--- + +## Layer 0: CI Guardrails + +**Principle: Raise the floor before raising the ceiling.** + +Static analysis prevents entire categories of defects at near-zero ongoing cost. The CI pipeline gates (lint → format → typecheck → test) are the most cost-effective quality investment we make. + +### Rules + +- **No code merges without green guardrails.** Lint, format, and typecheck gates are non-negotiable. No `--no-verify` bypasses. +- **Warnings are technical debt.** Every oxlint warning should be promoted to error or explicitly suppressed with a comment explaining why. The `warn` level is for evaluating new rules, not a permanent state. +- **Guardrails run first in CI.** Cheap checks gate expensive checks. The current pipeline order (lint → format → typecheck → test) must be preserved. +- **Security lint rules are errors.** Enable oxlint's `security` category at error severity for a codebase that handles JWT authentication, API credentials, and WebSocket proxying. + +--- + +## Layer 1: Type System + +**Principle: Types eliminate categories; tests eliminate instances.** + +A single type annotation prevents a class of bugs across every call site. A single test prevents one bug in one scenario. For structural correctness, types win on cost-effectiveness. + +### Rules + +- **`typecheck` is the first CI gate.** Type errors indicate structural problems that make test results unreliable. +- **Minimize `as any` and `as unknown as`.** Each type assertion is an unverified assumption. Prefer runtime validation + type narrowing. When assertions are necessary, comment why. +- **Use exhaustive pattern matching.** For discriminated unions (process status, middleware response type), use `switch` + `never` default so the compiler flags unhandled cases. +- **Share types across boundaries.** Server response types and client API types should be imported from a shared module, not independently maintained duplicates. +- **Type-safe test utilities.** Mock factories should implement typed interfaces. No `as any` in test infrastructure — it defeats the purpose of the type system. + +### Honest Limitation + +Types have no runtime presence. They cannot validate that `ANTHROPIC_API_KEY` contains a valid key, only that it's `string | undefined`. Runtime validation is essential at system boundaries. + +--- + +## Layer 2: Unit Tests + +**Principle: Test the contract, not the implementation.** + +Unit tests verify pure logic — functions whose correctness depends on their inputs, not on external system behavior. For orchestration and boundary-crossing code, unit tests with mocks provide fast feedback but limited confidence. + +### What to Unit Test (Non-Negotiable) + +| Category | Why | Examples | +|----------|-----|---------| +| Security boundaries | Failures are silent and critical | JWT verification, credential redaction | +| Data integrity invariants | Failures cause data loss | R2 sync direction, exclusion patterns | +| Complex discrimination logic | Non-obvious edge cases | Process matching (4 gateway patterns, 5 CLI patterns, 2 statuses) | +| Configuration building | Conditional precedence with many optional fields | `buildEnvVars` env var mapping | + +### What NOT to Unit Test + +| Category | Why | +|----------|-----| +| Route registration | Tests Hono's router, not your code. A 404 is immediately visible. | +| Static HTML/templates | String literal assertions test nothing. Visual bugs are caught visually. | +| One-line delegations | The type system validates the call. The test is a tautology. | +| Code slated for replacement | The design document marks it for rewrite. Don't test throwaway code. | + +### Test Quality Over Quantity + +- **Test files are colocated** with source files (`foo.ts` → `foo.test.ts`). This pattern is established; maintain it. +- **Each test protects one invariant.** If you can't name the invariant in the `describe` block, the test is probably testing implementation details. +- **Threat model comments.** When adding a test, include a one-line comment stating what failure mode it catches: + ```typescript + // Threat: CLI commands incorrectly matched as gateway process + it('returns null when only CLI commands are running', ...) + ``` +- **No coverage targets.** Coverage percentages incentivize low-value tests. Instead, review *which* code is covered and ask: is the uncovered code high-severity and low-detectability? + +--- + +## Layer 3: Property Tests + +**Principle: Invariants over examples.** + +For functions with combinatorial input spaces or security-critical algebraic properties, property-based tests (fast-check) are strictly superior to hand-written examples. They explore the full input space and find edge cases humans systematically miss. + +### When to Use Properties + +- **Combinatorial configuration:** `buildEnvVars` has 20+ optional fields. Properties verify precedence invariants across all 2^20 combinations. Examples cover O(n) cases. +- **Security-critical code:** `timingSafeEqual` must be reflexive, symmetric, and agree with strict equality. Properties encode these universally. +- **Classification logic:** `findExistingMoltbotProcess` classifies command strings. Properties verify that CLI commands are never misclassified as gateway processes for all strings, not just the ones someone thought to try. +- **Failure isolation:** `syncToR2` has fatal vs. non-fatal failure semantics. Properties verify the 2^n failure matrix implicitly. + +### When NOT to Use Properties + +- Simple CRUD or data transformation where examples are clearer +- Integration-level behavior that requires real system interaction +- UI components + +### Rules + +- Properties complement examples, never replace them. Every property test file should include at least one example test as documentation. +- Invest in generators as reusable project assets (`src/test-generators.ts`). +- Start with the three highest-value targets: `buildEnvVars`, `findExistingMoltbotProcess`, and any timing-safe comparison functions. Expand reactively when bugs surface that properties would have caught. + +--- + +## Layer 4: Contract Tests + +**Principle: The interfaces ARE the product.** + +PoeClaw is a gateway. Its correctness is determined by whether it honors the promises it makes at its boundaries. Contract tests verify those promises explicitly. + +### Boundaries That Need Contracts + +1. **Authentication** — login, session cookies, logout +2. **Gateway status** — health checks, process state +3. **Chat completions proxy** — streaming, error handling +4. **Device management** — pairing, approval +5. **Storage/sync** — R2 backup status, manual sync +6. **WebSocket proxy** — message relay, error transformation, close propagation + +### Rules + +- **Define before implement.** Write the contract test first. The implementation is done when the contract passes. +- **Contracts are versioned promises.** Changing a response shape requires updating the contract explicitly. The diff is reviewable. +- **Use Hono's test client.** No external HTTP servers or Pact brokers needed. `app.request()` exercises the real route handlers. +- **No contracts for internal interfaces.** Contracts are for external boundaries where a different system is on the other side. + +--- + +## Layer 5: Integration Tests + +**Principle: The bugs that matter live at the seams.** + +For a proxy architecture, the traditional test pyramid is inverted. Most logic IS boundary crossing. Integration tests that exercise real dependencies catch the bugs that unit tests structurally cannot. + +### The Mock Problem (Honestly Stated) + +The AGENTS.md documents four behavioral quirks of the Cloudflare Sandbox API that were all discovered in production, not in tests: + +1. `proc.status` doesn't update immediately after process completion +2. s3fs doesn't support setting timestamps (caused rsync failures) +3. Mount state detection requires `mount | grep`, not error message parsing +4. `waitForPort` race conditions require 3-minute timeouts + +Our mocks return instant success with perfect behavior. The real system has opinions. Every mock is a bet that the real system works as you imagine — and in this codebase, that bet has been wrong at least four documented times. + +### Rules + +- **Every system boundary gets at least one integration test** that exercises the real protocol. No mocks at the boundary. +- **Prefer record-replay over hand-written mocks.** Captured real `ExecResult` output carries actual stdout format, actual timing characteristics, and actual error messages. +- **Pure logic needs no mocks.** `buildEnvVars`, config validation, string parsing — test these with real data structures, not mocked services. +- **Integration tests are first-class CI citizens.** They run on every PR, not as a nightly job. +- **Flaky tests get investigated, not deleted.** A flaky integration test is detecting a real timing-dependent behavior. + +--- + +## Layer 6: Deploy Safety + +**Principle: If you can't ship with confidence, your tests are wrong.** + +Testing exists to enable safe deployment. A test suite that passes but leaves you afraid to deploy has failed at its actual job. + +### The Deployment Reality + +`npm run deploy` = `wrangler deploy` = instant 100% production rollout. No staging. No canary. No automated rollback. The blast radius is every connected client across every channel (HTTP, Telegram, Discord, Slack, CDP). + +### Rules + +- **Every deploy is verified.** Post-deploy smoke tests against the production health endpoint are not optional. +- **Health checks are test artifacts.** A `/healthz` endpoint that verifies worker response, sandbox availability, and R2 accessibility is maintained with the same rigor as unit tests. +- **Deploy events are observable.** Every deploy is logged, timestamped, and correlated with the git commit. +- **Rollback is practiced.** The team knows how to execute `wrangler rollback` and has verified it works. + +--- + +## The Decision Framework + +Before writing a test, answer two questions: + +1. **If this code breaks, how bad is it?** (Critical / High / Medium / Low) +2. **If this code breaks, how quickly will someone notice?** (Seconds / Hours / Days) + +| | Noticed in seconds | Noticed in hours+ | +|---|---|---| +| **Critical severity** | Contract test | Unit test + property test + integration test | +| **High severity** | Contract test | Unit test + integration test | +| **Medium severity** | No test needed | Unit test | +| **Low severity** | No test needed | No test needed | + +Security-critical code (auth, credentials, secrets) is always "Critical severity, noticed in hours+" — it gets the full treatment regardless of detectability estimates, because the consequences of being wrong are catastrophic. + +--- + +## What We Believe (Consensus) + +These principles survived the debate: + +1. **Quality is a stack, not a choice.** Types, tests, properties, contracts, integration tests, and deploy safety are complementary layers. No single approach is sufficient. + +2. **Test behavior, not implementation.** Tests should verify *what* code promises, not *how* it works internally. Implementation changes should not break tests unless behavior changes. + +3. **The type system is the first line of defense.** It's the cheapest, broadest quality tool we have. Invest in precise types. Don't bypass them with `as any`. + +4. **Mocks are necessary lies.** Use them for fast feedback on pure logic. Don't trust them for boundary behavior. Verify mock assumptions with periodic integration tests. + +5. **Security code gets the full treatment.** JWT verification, credential handling, secret comparison, and auth middleware deserve unit tests, property tests, integration tests, AND mutation testing verification. No shortcuts. + +6. **Pragmatism is not laziness.** Not testing route registration is engineering judgment. Not testing JWT verification is negligence. Know the difference. + +7. **The deployment boundary is part of the testing story.** "Tests pass" and "production is healthy" are different claims. Bridge the gap with health checks and post-deploy verification. + +8. **Testing effort follows a power law.** A small number of well-chosen tests catch a disproportionate share of real bugs. Invest testing effort where severity is high and detectability is low. + +--- + +## Further Reading + +The ten position papers that produced this consensus are preserved for reference. Each contains detailed analysis of specific code patterns, concrete examples, and honest assessments of its own weaknesses: + +| Position | Paper | Key Insight | +|----------|-------|-------------| +| Strict TDD | [`01-strict-tdd.md`](./philosophy/positions/01-strict-tdd.md) | TDD's design pressure drives extraction of shared logic (e.g., CDP auth duplication) | +| Type System First | [`02-type-system.md`](./philosophy/positions/02-type-system.md) | `as unknown as` casts and duplicated client/server types reveal under-investment in types | +| Integration First | [`03-integration-first.md`](./philosophy/positions/03-integration-first.md) | For a proxy architecture, the traditional test pyramid should be inverted | +| Pragmatic Minimalist | [`04-pragmatic-minimalist.md`](./philosophy/positions/04-pragmatic-minimalist.md) | The Severity x Detectability matrix determines what's worth testing | +| Property-Based Testing | [`05-property-based.md`](./philosophy/positions/05-property-based.md) | `buildEnvVars` has 2^20 input combinations; properties test them all implicitly | +| Mutation Testing | [`06-mutation-testing.md`](./philosophy/positions/06-mutation-testing.md) | The `\|\|` to `&&` mutation in R2 credential validation survives the current test suite | +| Contract/API-First | [`07-contract-first.md`](./philosophy/positions/07-contract-first.md) | Gateway interfaces ARE the product; contracts test what users care about | +| Deployment Safety | [`08-deploy-safety.md`](./philosophy/positions/08-deploy-safety.md) | `wrangler deploy` is instant 100% rollout with no automated rollback | +| Anti-Mock Realism | [`09-anti-mock.md`](./philosophy/positions/09-anti-mock.md) | Four documented sandbox API quirks were all discovered in production, not tests | +| CI Guardrails | [`10-guardrails.md`](./philosophy/positions/10-guardrails.md) | Guardrails are O(1) to add; tests are O(n). The cost curves are fundamentally different | diff --git a/docs/bugs/login-page-never-shown.md b/docs/bugs/login-page-never-shown.md new file mode 100644 index 000000000..3fd76b884 --- /dev/null +++ b/docs/bugs/login-page-never-shown.md @@ -0,0 +1,103 @@ +# Bug: Login page never shown + +**Date:** 2026-02-15 +**Status:** Fixed + +## Symptoms + +The PoeClaw login page (where users enter their Poe API key) was never displayed in any configuration: + +- **DEV_MODE=true**: Showed "waiting for poeclaw to load" then OpenClaw's own UI. Login was completely bypassed. +- **DEV_MODE=off**: Redirect loop (401/302). SPA HTML never reached the browser. + +## Root Causes + +### 1. `/api/auth/me` faked authentication in DEV_MODE + +**File:** `src/routes/auth.ts` (former lines 120-128) + +The `GET /api/auth/me` endpoint had a DEV_MODE early-return that told the SPA the user was already authenticated: + +```typescript +if (c.env.DEV_MODE === 'true') { + return c.json({ + authenticated: true, // ← Told SPA user was already logged in + userHash: 'dev-user', + keyLast4: '0000', + models: [{ id: 'default', name: 'Default Model' }], + }); +} +``` + +The SPA flow (`src/client/App.tsx`): +1. App mounts → `fetch('/api/auth/me')` +2. Response says `authenticated: true` → SPA sets session → shows ChatPage +3. LoginPage is **never rendered** + +### 2. ASSETS.fetch() was fragile + +**Files:** `src/routes/public.ts`, `src/index.ts` + +Two issues combined: + +- **No try-catch** around `ASSETS.fetch()` — if it threw, the request fell through silently to the next handler (the catch-all proxy, which served the container's own UI). +- **Requesting `/` instead of `/index.html`** — with `html_handling: "auto-trailing-slash"` in wrangler config, ASSETS could return a 3xx redirect for the root path, causing a redirect loop. + +### Contributing: Catch-all proxy serves container UI + +**File:** `src/index.ts` (catch-all `app.all('*', ...)`) + +Once the OpenClaw container started, the catch-all proxy forwarded `GET /` to the container, which served its own web UI. If the public route or session middleware failed to serve the SPA, users saw the container UI (no login) or a redirect loop. + +## Investigation Method + +Five parallel investigation agents explored: + +1. **SPA routing agent**: Found `/api/auth/me` DEV_MODE bypass fakes authentication, preventing LoginPage from rendering. +2. **ASSETS binding agent**: Found `ASSETS.fetch()` with `/` path could trigger `auto-trailing-slash` redirects; no error handling. +3. **DEV_MODE proxy agent**: Found catch-all proxy serves container UI, loading.html flow replaces SPA. +4. **Hono routing agent**: Confirmed no Hono routing bug; the issue was ASSETS.fetch() failure causing silent fall-through. +5. **Miniflare agent**: Confirmed miniflare's ASSETS should return 200 for `/` when `index.html` exists, but redirect behavior depends on config. + +## Fixes Applied + +### Fix 1: Remove DEV_MODE bypass from `/api/auth/me` + +**File:** `src/routes/auth.ts` + +Removed the DEV_MODE early-return block. The endpoint now always checks for a valid session cookie, regardless of DEV_MODE. Without a cookie, it returns `{ authenticated: false }` with 401, which causes the SPA to show the login page. + +### Fix 2: Harden ASSETS.fetch() in public route and session middleware + +**Files:** `src/routes/public.ts`, `src/index.ts` + +- Changed `ASSETS.fetch()` to request `/index.html` explicitly (avoids `auto-trailing-slash` redirect) +- Wrapped in try-catch — if ASSETS fails, returns 500 with helpful message ("SPA not available. Run: make build") +- Removed debug `console.log` statements from session middleware + +## Tests Added + +### `src/routes/auth.test.ts` + +- "returns 401 without session cookie even in DEV_MODE" — verifies DEV_MODE cannot bypass login +- "returns session info with valid cookie in DEV_MODE" — verifies authenticated sessions work in DEV_MODE +- "returns 401 with expired session cookie" — verifies expired sessions are rejected +- "POST /api/auth/login creates session that GET /api/auth/me accepts" — full login→me flow + +### `src/routes/public.test.ts` (new file) + +- "GET / returns 200 with HTML content" — SPA shell is served +- "GET / requests /index.html explicitly from ASSETS" — avoids redirect +- "GET / returns 500 if ASSETS binding fails" — error is caught, not silent +- "GET / does not redirect" — no 3xx responses +- "GET /assets/* serves static files" — passthrough works +- "GET /sandbox-health returns health check JSON" — health endpoint works +- "Session middleware serves SPA for unauthenticated HTML requests" — no redirect loop + +## Lessons Learned + +1. **DEV_MODE should never fake auth status.** Dev convenience shortcuts in auth endpoints mask real bugs and prevent testing the actual login flow. DEV_MODE can skip sandbox auth (session middleware), but must not fake API responses that drive SPA navigation. + +2. **ASSETS.fetch() needs defensive coding.** Always request explicit filenames (`/index.html`) instead of paths that may trigger server-side redirects. Always wrap in try-catch since the binding may not be available (pre-build, misconfiguration). + +3. **Silent fall-through is the enemy of debugging.** When a route handler fails silently (no try-catch, error swallowed), the request falls through to the next handler, producing confusing behavior. Explicit error responses are always better than silent fall-through. diff --git a/docs/philosophy/positions/01-strict-tdd.md b/docs/philosophy/positions/01-strict-tdd.md new file mode 100644 index 000000000..a890b5546 --- /dev/null +++ b/docs/philosophy/positions/01-strict-tdd.md @@ -0,0 +1,210 @@ +# Position 01: Strict Test-Driven Development + +**Author:** strict-tdd +**Position:** Red-Green-Refactor is non-negotiable for all production code + +--- + +## Core Thesis + +Every line of production code in this project must be justified by a failing test that was written first. The Red-Green-Refactor cycle is not a nice-to-have or a team preference — it is the only reliable method for producing code whose behavior is fully specified, whose design is driven by usage rather than implementation, and whose correctness can be verified in seconds rather than minutes of manual testing against a live Cloudflare Sandbox container. + +--- + +## Key Arguments + +### 1. This codebase already proves TDD works here — the tested modules are the most reliable + +The modules with thorough unit tests — `src/auth/jwt.ts`, `src/auth/middleware.ts`, `src/gateway/env.ts`, `src/gateway/process.ts`, `src/gateway/r2.ts`, `src/gateway/sync.ts`, `src/utils/logging.ts` — are the most maintainable code in the project. Consider `buildEnvVars()` in `src/gateway/env.ts`: its 17 test cases (`src/gateway/env.test.ts`) document every mapping, every edge case (trailing slashes on URLs, legacy gateway overrides, the `MOLTBOT_GATEWAY_TOKEN` -> `OPENCLAW_GATEWAY_TOKEN` rename). When a new env var like `CF_AI_GATEWAY_MODEL` was added, the test came first, the mapping was trivial to implement, and the test suite immediately confirmed no regressions. Compare this to the *untested* modules — `src/routes/api.ts` (302 lines, 0 tests), `src/routes/cdp.ts` (1919 lines, 0 tests), `src/routes/debug.ts` (389 lines, 0 tests), `src/routes/public.ts` (70 lines, 0 tests), `src/index.ts` (450 lines, 0 tests). These are where bugs hide. + +### 2. Process-finding logic is a textbook TDD success story + +`findExistingMoltbotProcess()` in `src/gateway/process.ts` must distinguish between gateway processes (`openclaw gateway`, `start-openclaw.sh`) and CLI commands (`openclaw devices list`, `openclaw --version`, `openclaw onboard`). It also maintains backward compatibility with legacy command names (`clawdbot gateway`, `start-moltbot.sh`). The 10 test cases in `src/gateway/process.test.ts` were written to spec this behavior precisely. Without TDD, a developer adding legacy compatibility might have written a regex that accidentally matches `openclaw onboard` — the test at line 135 (`'does not match openclaw onboard as a gateway process'`) exists precisely because someone thought about the edge case *before* writing code. TDD forces you to enumerate the boundary conditions up front, not discover them in production. + +### 3. The `timingSafeEqual` function in `cdp.ts` is untested — and it has a real bug pattern + +The CDP route (`src/routes/cdp.ts:1907-1916`) implements timing-safe string comparison for secret authentication. This function has a subtle but well-known vulnerability: the early return on `a.length !== b.length` (line 1908) leaks the length of the secret via timing, partially defeating the purpose of the constant-time comparison. A TDD approach would have required writing tests that specify the contract: "comparison must not reveal information about the secret via timing." This would have forced the developer to research the correct implementation (pad both strings to the same length, or use a hash-based comparison). Instead, the function was written implementation-first and never tested, so the bug stands. + +Furthermore, the entire CDP authentication pattern — checking `providedSecret`, handling missing `CDP_SECRET`, validating `BROWSER` binding — is duplicated across three route handlers (`/cdp`, `/json/version`, `/json/list`, `/json`) with no tests. TDD would have driven extraction of a shared auth guard, because writing the same test four times is painful enough to demand refactoring. + +### 4. Env var validation logic in `index.ts` is complex and entirely untested + +`validateRequiredEnv()` (`src/index.ts:56-92`) implements a non-trivial decision tree: it checks for `MOLTBOT_GATEWAY_TOKEN`, conditionally requires CF Access vars (skipping them in dev/test mode), and validates that at least one AI provider is configured (Cloudflare AI Gateway *or* legacy gateway *or* direct Anthropic *or* OpenAI). This function has 2^7+ possible input combinations. Without tests, we have no idea if it correctly handles the case where *only* OpenAI is configured (no Anthropic key, no gateway). We don't know if it correctly skips CF Access validation when `E2E_TEST_MODE` is set but `DEV_MODE` is not. TDD would have produced a test matrix covering these combinations *before* the function was written, guaranteeing correctness by construction. + +### 5. The WebSocket proxy in the catch-all route is the highest-risk untested code + +The catch-all route in `src/index.ts:229-445` is the most complex handler in the entire codebase. It: +- Detects WebSocket upgrades +- Shows a loading page if the gateway isn't ready +- Injects gateway tokens into WebSocket requests +- Proxies bidirectional WebSocket messages +- Transforms error messages in transit +- Truncates close reasons to 123 bytes for WebSocket spec compliance +- Handles close and error events on both sides + +This is 216 lines of intricate stateful logic with zero tests. The `transformErrorMessage()` function (`src/index.ts:38-48`) is also untested. In a TDD approach, each of these behaviors would have a failing test *before* implementation. The token injection logic (line 296-299) is particularly critical — it silently adds authentication credentials to requests. If this logic is wrong (e.g., if it double-adds tokens when `token` is already in the URL), it could cause authentication failures that are extremely hard to debug in production. + +--- + +## Rebuttals to Counterarguments + +### vs. "Types are tests" (TypeScript type system as primary defense) + +Types catch a specific class of error — shape mismatches, null access on non-optional fields, wrong argument order when types differ. But this project's most dangerous bugs are *behavioral*, not structural. Consider `buildEnvVars()`: TypeScript confirms that the return type is `Record`, but it cannot verify that `AI_GATEWAY_API_KEY` *overrides* `ANTHROPIC_API_KEY` when both are set (the legacy gateway behavior at `src/gateway/env.ts:29-34`). That's a behavioral contract that only a test can specify. TypeScript also cannot catch the `timingSafeEqual` timing leak — the types are correct; the *algorithm* is wrong. + +Where types genuinely help in this codebase: the `MoltbotEnv` interface (`src/types.ts:6-45`) documents all 25+ env vars with their optionality. The `AppEnv` type ensures middleware sets `sandbox` and `accessUser` correctly. I acknowledge this — types and tests are complementary, not competing. But types alone are insufficient. This project has full type coverage *and* bugs in untested code. + +**Honest concession:** For pure data-transformation functions where the logic is a direct mapping (like a subset of `buildEnvVars`), types do reduce the marginal value of tests. But TDD still provides documentation and regression protection that types cannot. + +### vs. "Integration tests matter more than unit tests" + +Integration tests are valuable, but they are *slow*, *flaky*, and *coarse-grained* in this project's context. Running an actual Cloudflare Sandbox container takes minutes for cold start (`STARTUP_TIMEOUT_MS = 180_000` — a 3-minute timeout in `src/config.ts:9`). You cannot run a red-green-refactor cycle with a 3-minute feedback loop. The existing test suite (`vitest run`) completes in seconds because it uses mocks (`src/test-utils.ts`) that simulate sandbox behavior. + +Integration tests also suffer from the "test gap" problem: when an integration test fails, you don't know *which* unit is broken. If a sync-to-R2 integration test fails, is it the credential validation? The rclone config generation? The rclone command flags? The sync ordering? The unit tests in `src/gateway/r2.test.ts` and `src/gateway/sync.test.ts` pinpoint the failure location immediately. + +**Honest concession:** There are real classes of bugs that only integration tests catch — the interaction between `ensureMoltbotGateway()` and the real Sandbox API, the actual WebSocket proxying behavior, race conditions between concurrent requests. I advocate for a testing pyramid: many unit tests (TDD), fewer integration tests (after implementation), rare E2E tests. But the *foundation* must be TDD'd unit tests. + +### vs. "Only test critical paths" (pragmatist) + +This position sounds reasonable until you ask: "Which paths are critical?" In this codebase, the answer is *almost all of them*: +- Auth middleware? Critical (security). +- JWT verification? Critical (security). +- Env var building? Critical (misconfiguration = broken containers). +- Process finding? Critical (wrong match = killing the wrong process). +- R2 sync? Critical (data loss). +- CDP secret auth? Critical (security). +- WebSocket proxy? Critical (core user-facing feature). +- Config validation? Critical (broken deployments). + +The "pragmatist" ends up testing almost everything anyway, but without the discipline of TDD, they write tests *after* implementation — tests that confirm the code works as written rather than tests that specify how the code *should* work. Post-hoc tests are weaker because they're influenced by implementation knowledge. A developer who has already written the `timingSafeEqual` function will write a test that passes for the current (buggy) implementation rather than a test that specifies the correct security contract. + +**Honest concession:** There is genuinely low-value test territory: the static route handlers in `src/routes/public.ts` (lines 15-21, 24-31) that simply proxy to ASSETS or return a hardcoded JSON object. Testing `return c.json({ status: 'ok' })` adds no value. I accept a minimal exception for trivially correct code — but the bar for "trivially correct" must be very high. + +### vs. "Mocks lie — test with real dependencies" + +The mocks in this project (`src/test-utils.ts`) are well-designed and honest. `createMockSandbox()` exposes the same interface as a real Cloudflare Sandbox: `listProcesses()`, `startProcess()`, `exec()`, `writeFile()`, `containerFetch()`, `wsConnect()`. The mock contracts are verified by the fact that the tested modules (`process.ts`, `r2.ts`, `sync.ts`) work correctly in production — the mocks match reality. + +But this objection has real weight in one area: the `waitForProcess()` utility (`src/gateway/utils.ts`) polls `proc.getStatus()` in a loop. A mock can't truly test the timing behavior — it resolves instantly. And the `ensureMoltbotGateway()` function (`src/gateway/process.ts:56-138`) has a complex interaction pattern (find process -> wait for port -> kill if dead -> start new -> wait again) that mocks can only partially simulate. + +**Honest concession:** Mocks *can* diverge from reality, especially for the Cloudflare Sandbox API which is proprietary and evolving. When mock behavior drifts from real behavior, tests give false confidence. The mitigation is (a) keep mocks minimal — mock interfaces, not implementations, and (b) validate mock contracts with occasional integration tests against a real sandbox. But this doesn't invalidate TDD — it means TDD needs a mock-verification layer. + +### vs. "Linting and CI guardrails prevent more bugs than tests" + +This project uses `oxlint` (`src/` scope) and `oxfmt` for formatting. These catch stylistic issues and some correctness problems (unused variables, no-await-in-loop without justification). But linters cannot catch: +- The `timingSafeEqual` timing leak (it's syntactically and semantically valid JS) +- The env var override precedence (linters don't understand business logic) +- The process-matching logic edge cases (no lint rule for "don't match `openclaw onboard`") +- The WebSocket token injection correctness (linters can't verify URL manipulation logic) + +Linting is a complement to TDD, not a substitute. The `eslint-disable-next-line` comments scattered through the codebase (e.g., `src/gateway/utils.ts:21`, `src/routes/api.ts:163`) show where lint rules are *intentionally* violated — a test would verify the *reason* for the violation is still valid. + +**Honest concession:** The `typecheck` script (`tsc --noEmit`) catches a meaningful class of integration-level errors (mismatched function signatures across modules) that unit tests don't target. CI-enforced type checking is genuinely valuable and catches bugs that TDD alone would miss. + +--- + +## Specific Example: How Strict TDD Would Have Caught a Real Bug Class + +### The CDP Authentication Duplication Problem + +`src/routes/cdp.ts` contains four route handlers (`/`, `/json/version`, `/json/list`, `/json`) that each independently implement the same authentication sequence: + +```typescript +// This pattern is repeated 4 times (lines 156-170, 211-227, 262-278, 318-334): +const providedSecret = url.searchParams.get('secret'); +const expectedSecret = c.env.CDP_SECRET; + +if (!expectedSecret) { + return c.json({ error: 'CDP endpoint not configured' }, 503); +} + +if (!providedSecret || !timingSafeEqual(providedSecret, expectedSecret)) { + return c.json({ error: 'Unauthorized' }, 401); +} + +if (!c.env.BROWSER) { + return c.json({ error: 'Browser Rendering not configured' }, 503); +} +``` + +In a TDD approach, the developer would write tests for the first route: + +```typescript +it('returns 503 when CDP_SECRET is not set', ...) +it('returns 401 when secret is missing', ...) +it('returns 401 when secret is wrong', ...) +it('returns 503 when BROWSER binding is missing', ...) +it('returns 401 for timing-safe comparison', ...) +``` + +When they reach the second route handler, they would need to write the same five tests again. At this point, the TDD cycle creates natural pressure to extract a shared `cdpAuth` middleware — because *writing duplicate tests is painful*. This is TDD's design pressure at work. The developer extracts a `verifyCDPSecret()` middleware, writes tests for it once, and all four routes share the same tested auth logic. + +Without TDD, the developer copies the auth block four times (as happened here), and any future change to the auth logic (e.g., fixing the timing-safe comparison) must be applied in four places. If one copy is missed, one route has a security vulnerability while the other three are patched — a silent, partial fix that's extremely hard to detect. + +Additionally, if TDD had been applied, the developer would have been forced to write a test for `timingSafeEqual` itself. A proper test spec would include: + +```typescript +it('returns true for equal strings', ...) +it('returns false for different strings of same length', ...) +it('returns false for different strings of different length', ...) +it('does not short-circuit on length difference (timing safety)', ...) +``` + +The fourth test is the key one. It forces the developer to think about *why* they're using timing-safe comparison. The current implementation fails this contract because `if (a.length !== b.length) return false` is an early exit that leaks length information. A TDD developer would either (a) fix the implementation to not leak length, or (b) explicitly document and test the accepted risk, or (c) use the Web Crypto API's `crypto.subtle.timingSafeEqual` (available in Workers runtime) instead of a hand-rolled implementation. + +--- + +## Proposed Rules for This Project + +### Rule 1: No production code without a failing test first + +Every new function, route handler, middleware, and utility must begin with a test that fails. The test describes the desired behavior; the implementation makes it pass. This applies to: +- Gateway logic (`src/gateway/`) +- Auth middleware (`src/auth/`) +- Route handlers (`src/routes/`) +- Utility functions (`src/utils/`) +- Configuration validation (`src/index.ts`) + +### Rule 2: Test file co-location + +Tests live next to their source files (e.g., `foo.ts` and `foo.test.ts` in the same directory). This pattern is already established in the project. New routes like `src/routes/api.ts` must have a corresponding `src/routes/api.test.ts`. + +### Rule 3: Use the existing mock infrastructure + +`src/test-utils.ts` provides `createMockEnv()`, `createMockSandbox()`, `createMockExecResult()`, and `suppressConsole()`. All new tests should use these utilities. Extend them when new sandbox capabilities are needed (e.g., mock `wsConnect()` for WebSocket proxy tests). + +### Rule 4: Test the contract, not the implementation + +Tests should verify *what* a function does, not *how* it does it. For example, `buildEnvVars()` tests verify output key-value pairs, not whether the function uses `if` statements or a mapping table. This allows refactoring without breaking tests. + +### Rule 5: Minimum coverage targets for new code + +- Security-critical code (auth, secret handling, token validation): 100% branch coverage +- Core business logic (env building, process management, sync): 90%+ branch coverage +- Route handlers: test all status codes and error paths +- Pure utilities: 100% coverage (they're easy to test) + +### Rule 6: Extract shared logic when tests reveal duplication + +If writing tests for a new route handler requires duplicating test setups from another handler's tests, extract the shared logic into a testable middleware or utility function. This is TDD's design feedback — listen to it. + +### Rule 7: Acknowledge the testing pyramid + +TDD produces the *base* of the pyramid: fast, isolated unit tests. The project should also have: +- **Integration tests** (slower, test module interactions, run in CI) +- **E2E tests** (slowest, test the full Worker against a real sandbox, run pre-deploy) + +But these are in addition to, not instead of, TDD'd unit tests. + +--- + +## Weaknesses in This Position (Intellectual Honesty) + +1. **Cold start testing is genuinely hard to TDD.** The 3-minute sandbox boot process, race conditions between concurrent requests, and Durable Object state management are inherently integration-level concerns. Unit-testing `ensureMoltbotGateway()` with mocks can verify the happy path and error handling, but cannot catch real timing bugs. + +2. **The Cloudflare Sandbox API is a moving target.** Mock contracts (`src/test-utils.ts`) may drift from the real API as `@cloudflare/sandbox` evolves. Tests can pass while production breaks. This is a real risk that requires periodic integration test validation. + +3. **TDD adds upfront time.** For a project in rapid prototyping phase (which PoeClaw is, per the design doc), strict TDD can slow initial velocity. The counter-argument is that untested code accrues tech debt that slows future velocity — but the honest truth is that for a project that might pivot significantly, some of those tests become throwaway work. + +4. **Some code in this project is genuinely hard to unit test.** The WebSocket proxy (`src/index.ts:283-429`) involves bidirectional event listeners, WebSocketPair construction, and `executionCtx.waitUntil()`. Mocking all of this is possible but produces tests that are fragile and tightly coupled to implementation details — exactly what TDD should avoid. + +5. **The React admin UI (`src/client/`) is excluded from this analysis.** Frontend component testing follows different patterns (component testing, snapshot testing) that don't map cleanly to classical TDD. This position focuses on the Worker backend. diff --git a/docs/philosophy/positions/02-type-system.md b/docs/philosophy/positions/02-type-system.md new file mode 100644 index 000000000..afc9cfd8d --- /dev/null +++ b/docs/philosophy/positions/02-type-system.md @@ -0,0 +1,235 @@ +# Position 02: The Type System Is the First Line of Defense + +**Advocate:** Type System Advocate +**Date:** 2026-02-15 +**Status:** Position Paper for Scientific Debate + +--- + +## Core Thesis + +TypeScript's type system is the most cost-effective bug prevention mechanism available to this project. A well-designed type system eliminates *entire categories* of defects at compile time -- classes of bugs that tests can only catch one instance at a time, and only when someone thinks to write the test. For PoeClaw/OpenClaw, `typecheck` should be the first gate in CI, and investment in precise types (branded types, discriminated unions, exhaustive matching) will yield compounding returns as the codebase grows. + +--- + +## Key Arguments + +### 1. Types Enforce Invariants Exhaustively; Tests Enforce Them Anecdotally + +A type constraint applies to *every* call site in the codebase simultaneously. A test applies to the specific scenario the author imagined. Consider the `MoltbotEnv` interface in `src/types.ts`. It declares 30+ environment bindings, most marked as `string | undefined` (via `?`). The type system *already* forces every consumer of `c.env.ANTHROPIC_API_KEY` to handle the `undefined` case -- `strictNullChecks` makes this a compile error, not a runtime surprise. + +Now look at `gateway/env.ts`: the `buildEnvVars` function checks each variable with `if (env.X)` before assigning it. This is correct, but it's correct *because the types forced it*. If someone added a new variable and forgot the guard, TypeScript would emit an error at `envVars[key] = env[key]` due to the `string | undefined` not being assignable to `string`. No test suite catches that kind of omission unless someone specifically writes a test for the new variable -- and they usually don't. + +**Evidence from this codebase:** The project has 7 test files covering auth, gateway, and logging. There is no test that verifies `buildEnvVars` correctly handles a *newly added* environment variable. The types do this automatically. + +### 2. Hono's Type-Safe Routing Is a Force Multiplier -- But Only If We Invest in It + +The project already benefits from Hono's generic type parameter `Hono`. Every route handler gets typed access to `c.env` (bindings) and `c.get('sandbox')` / `c.get('accessUser')` (variables). This is excellent, but we're leaving value on the table. + +Currently, route parameters are implicitly `string`: +```typescript +// routes/api.ts +const requestId = c.req.param('requestId'); // string -- but is it validated? +``` + +Hono supports typed route parameters via path generics. We could define: +```typescript +adminApi.post('/devices/:requestId/approve', async (c) => { + const requestId = c.req.param('requestId'); // Typed by path +}); +``` + +More importantly, Hono supports typed *response* schemas. Today, our API responses are ad-hoc objects -- `c.json({ success: true }, 200)` -- with no shared contract between server and client. The client-side types in `src/client/api.ts` (e.g., `DeviceListResponse`, `ApproveResponse`) are *independently maintained duplicates* of the server's response shapes. Nothing enforces that they stay in sync. A type-first approach would define shared response types imported by both sides, making desynchronization a compile error. + +### 3. The `as unknown as` Pattern Is a Type System Smell That Reveals Missing Abstractions + +The JWT verification in `auth/jwt.ts:34` performs a double cast: +```typescript +return payload as unknown as JWTPayload; +``` + +This is not a failure of the type system -- it's a failure to *use* the type system. The `jose` library returns a `JWTPayload` type from its own module, which has a different shape from our `JWTPayload`. Rather than bridging these types properly (via a mapping function that validates the expected fields), we bypass the type checker entirely. + +This pattern appears throughout the CDP route (`routes/cdp.ts`) with 100+ type assertions like: +```typescript +const format = (params.format as string) || 'png'; +const quality = params.quality as number | undefined; +const clip = params.clip as { x: number; y: number; w: number; h: number } | undefined; +``` + +Every `as` assertion is an unverified assumption. Each one is a potential runtime error that the type system *could* prevent if we invested in proper typed parameter extraction. A disciplined type-first approach would replace these with a validated parser (e.g., Zod schemas), making the CDP route both type-safe *and* runtime-safe. + +### 4. Discriminated Unions Prevent Impossible States + +The project already uses a simple discriminated union in `AccessMiddlewareOptions`: +```typescript +type: 'json' | 'html' +``` + +This drives conditional logic in `middleware.ts` where `json` routes return `c.json()` and `html` routes return `c.html()`. But the implementation doesn't use exhaustive matching -- it uses `if/else`, meaning the compiler won't warn if a third type (e.g., `'redirect'`) is added to the union. + +More critically, the process management types from `@cloudflare/sandbox` use implicit string literals (`'running' | 'starting' | 'completed' | 'failed'`) but the codebase treats them as `string` in several places (e.g., `test-utils.ts:32`: `status: status as Process['status']`). A proper discriminated union with exhaustive `switch` statements would make adding a new process state (e.g., `'suspended'`) a compile-time task -- the compiler would flag every handler that doesn't account for it. + +### 5. The Boolean String Anti-Pattern Is a Type System Opportunity + +Throughout the codebase, boolean configuration is represented as `string | undefined` and checked with `=== 'true'`: +```typescript +// middleware.ts:19 +return env.DEV_MODE === 'true'; + +// index.ts:58 +const isTestMode = env.DEV_MODE === 'true' || env.E2E_TEST_MODE === 'true'; +``` + +This is fragile. Setting `DEV_MODE='yes'` or `DEV_MODE='1'` silently fails. The type system can't prevent this because the type is `string | undefined`, not `'true' | 'false' | undefined`. While Cloudflare Workers environment variables *are* strings at runtime, we could introduce a branded type or wrapper: + +```typescript +type BooleanEnvVar = 'true' | 'false'; +``` + +This narrows the type and makes the intent explicit. Tests for this behavior would need to enumerate every truthy/falsy string someone might pass; the type system prevents the problem structurally. + +--- + +## Anticipated Counterarguments and Rebuttals + +### Against Strict TDD Purists: "Tests First, Always" + +**Their argument:** Tests are executable specifications. Types are just constraints -- they don't prove behavior. You should write the test first, see it fail, then implement. Types are a nice-to-have, not the driver. + +**Rebuttal:** I agree that tests specify *behavior*. But types specify *structure* and *contracts*, which is a different (and complementary) concern. The TDD cycle assumes you can enumerate the important cases. For a function like `buildEnvVars` that maps 20+ optional variables, exhaustive testing requires 2^20 combinations. The type system handles this with one `string | undefined` annotation. + +Moreover, TDD in this project's test setup reveals a practical problem: the mock utilities in `test-utils.ts` use `as any` to create mock environments. This means the test infrastructure *itself* undermines type safety. A type-first approach would make test mocks type-safe, ensuring tests don't pass against incorrect shapes. + +**Where I concede:** TDD excels at specifying *behavioral* contracts -- e.g., "when the JWT is expired, the middleware returns 401." Types cannot express temporal logic, ordering constraints, or side effects. For those, tests are irreplaceable. + +### Against Integration Testers: "Test the System, Not the Types" + +**Their argument:** What matters is whether the deployed Worker actually handles requests correctly. Integration tests against the real Cloudflare Sandbox give confidence that the system works end-to-end. Types only tell you about one module's internal consistency. + +**Rebuttal:** Integration tests are essential but *expensive*. They require a running Sandbox, network access, and real (or mocked) R2 buckets. They're slow, flaky, and test one scenario at a time. Types, by contrast, are checked in <2 seconds (`tsc --noEmit`), run locally without infrastructure, and cover every code path simultaneously. + +The real risk in this project is the *boundary* between the Worker and the container -- the environment variable mapping in `gateway/env.ts`. An integration test verifies that one specific variable passes through correctly. The type system verifies that *the interface between Worker and container is structurally sound*. These are complementary, but types give more coverage per unit of effort. + +**Where I concede:** Types cannot catch misconfigurations (wrong AI Gateway URL, invalid R2 credentials, network failures). The Worker-to-container boundary involves string serialization that the type system cannot validate at compile time. Integration tests are the right tool there. + +### Against Pragmatists: "Only Test What Matters" + +**Their argument:** Most bugs come from a few critical paths. Focus testing effort on auth, payment, and data integrity. Don't waste time on fancy types for configuration plumbing. + +**Rebuttal:** I agree with the prioritization principle, but types aren't "waste." They're *amortized* prevention. The time to define `MoltbotEnv` was spent once; it prevents misconfiguration bugs on every future code change. The cost of a branded type for API keys is 10 minutes; the cost of deploying with `ANTHROPIC_API_KEY` set to a Discord token is an outage. + +More importantly, the auth path in this project is *exactly* where types matter most. The `JWTPayload` interface defines the shape of security-critical data. The `AccessUser` type flows through middleware into route handlers. If these types are wrong, the auth system silently passes invalid data. Types are not opposed to pragmatism -- they're the *most pragmatic* way to prevent structural errors in critical paths. + +**Where I concede:** There is a real cost to over-typing. Branded types for every string field, phantom types for state machines, HKTs for middleware composition -- these can make the codebase harder to read and maintain. The pragmatist's instinct to avoid accidental complexity is sound. The question is where the line is, and I argue it should be drawn further toward "more types" than this codebase currently sits. + +### Against Anti-Mock Realists: "Types Don't Catch Runtime Behavior" + +**Their argument:** The type system says `env.ANTHROPIC_API_KEY` is `string | undefined`. At runtime, it might be an empty string `""`, a malformed key, or a key for the wrong environment. Types don't check *values*, only *shapes*. Runtime validation (and tests that exercise it) are what actually prevent bugs. + +**Rebuttal:** This is the strongest counterargument, and I take it seriously. Types *cannot* distinguish `""` from a valid API key. Types *cannot* verify that a JWT's `exp` field is in the future. Types *cannot* ensure that `MOLTBOT_GATEWAY_TOKEN` matches what the container expects. + +But this argument proves too much. By this logic, we should also abandon linting, code review, and static analysis -- none of them check runtime values. The question is not whether types are sufficient (they aren't), but whether they're *cost-effective* (they are). A typed environment interface costs minutes to maintain and prevents an entire category of "undefined is not a function" errors. Runtime validation *on top of* that catches value-level bugs. They compose; they don't compete. + +Furthermore, the anti-mock position actually *supports* stronger types. If you want to test against real systems instead of mocks, you need clean interfaces between components. Well-typed interfaces (e.g., a properly typed `CDPRequest` instead of `Record`) make it easier to build realistic test fixtures without `as any` escape hatches. + +**Where I concede:** For the CDP shim route, where the protocol is inherently dynamic (arbitrary Chrome DevTools Protocol messages), heavy typing may be counterproductive. The protocol has hundreds of commands with different parameter shapes. Attempting to type them all statically may create more maintenance burden than bugs prevented. A runtime validation layer (Zod, Valibot) might be more appropriate there, with types *derived from* the validation schemas rather than the reverse. + +--- + +## Specific Examples: How Better Types Would Prevent Bugs + +### Example 1: Environment Variable Desynchronization + +**Current state:** `MoltbotEnv` defines variables, `buildEnvVars` manually maps them, and `validateRequiredEnv` checks a subset. Nothing connects these three -- adding a new required variable to `MoltbotEnv` doesn't force updates to `validateRequiredEnv` or `buildEnvVars`. + +**With better types:** +```typescript +// Define which env vars are required vs optional +type RequiredEnvKeys = 'MOLTBOT_GATEWAY_TOKEN'; +type RequiredInProdKeys = 'CF_ACCESS_TEAM_DOMAIN' | 'CF_ACCESS_AUD'; + +// The validate function's return type would be derived from the interface, +// making omissions a compile error. +``` + +### Example 2: API Response Contract Drift + +**Current state:** Server returns `c.json({ success: true, requestId }, 200)`. Client expects `ApproveResponse` with `success: boolean; requestId: string; message?: string`. If the server adds a field or changes a name, the client type is silently wrong. + +**With better types:** Shared response types imported by both `routes/api.ts` and `client/api.ts`. The `apiRequest` generic would reference the shared type, and any server-side change would cause a client-side compile error. + +### Example 3: Process Status Exhaustiveness + +**Current state:** Process status is checked with string comparisons: +```typescript +if (proc.status === 'starting' || proc.status === 'running') { ... } +``` + +If `@cloudflare/sandbox` adds a `'paused'` status, this code silently ignores paused processes. No test catches this unless someone writes one for the new status. + +**With better types:** An exhaustive `switch` with a `never` default would make the compiler flag unhandled statuses: +```typescript +switch (proc.status) { + case 'running': case 'starting': return proc; + case 'completed': case 'failed': continue; + default: const _exhaustive: never = proc.status; // Compile error on new status +} +``` + +### Example 4: The `as any` Mock Problem + +**Current state:** Test utilities create mocks with `{} as any`: +```typescript +Sandbox: {} as any, +ASSETS: {} as any, +MOLTBOT_BUCKET: {} as any, +``` + +If a test calls `c.env.MOLTBOT_BUCKET.get('key')`, it gets a runtime error because the mock has no `get` method -- but TypeScript thinks it's fine because `as any` suppressed the check. + +**With better types:** Typed mock factories that implement the required interface methods, or using `satisfies` to ensure mock shapes match expected types without losing type checking. + +--- + +## Proposed Rules for This Project + +1. **`typecheck` runs first in CI.** Before lint, before tests. Rationale: type errors indicate structural problems that make test results unreliable. + +2. **Zero tolerance for `as any`.** Use `as unknown as T` when absolutely necessary (external library boundaries), and prefer runtime validation + type narrowing over type assertions. Each `as` assertion should have a comment explaining why it's necessary. + +3. **Shared response types between server and client.** Create a `src/shared/api-types.ts` module imported by both `routes/` and `client/`. The client's `apiRequest` generic should reference these shared types. + +4. **Exhaustive pattern matching for discriminated unions.** Use `switch` + `never` default for all union types: process status, middleware response type, and any future discriminated unions. + +5. **Branded types for security-sensitive strings.** API keys, JWT tokens, and gateway tokens should use branded types to prevent accidental mixing: + ```typescript + type APIKey = string & { readonly __brand: 'APIKey' }; + type GatewayToken = string & { readonly __brand: 'GatewayToken' }; + ``` + +6. **Derive, don't duplicate.** When runtime validation is needed (e.g., CDP params, env vars), use a schema library (Zod/Valibot) and derive TypeScript types from schemas -- not the reverse. This ensures types and validation stay in sync. + +7. **Type-safe test utilities.** Mock factories should implement typed interfaces, not use `as any`. If a mock is partial, use `Partial` or a dedicated mock type -- never `any`. + +--- + +## Honest Assessment of Weaknesses + +1. **Types have no runtime presence.** TypeScript types are erased at compile time. In a Cloudflare Workers environment where environment variables arrive as untyped strings from `wrangler secret`, the type system cannot validate actual values. Runtime validation is essential at system boundaries. + +2. **Diminishing returns on type complexity.** Branded types, conditional types, and mapped types increase cognitive load. Junior contributors may struggle with `Type>` patterns. There is a real trade-off between type safety and accessibility. + +3. **External library types are often wrong.** The `jose` library's types don't match our `JWTPayload`. Cloudflare's `@cloudflare/sandbox` types may lag behind runtime behavior. When third-party types are incorrect, the type system provides *false confidence* -- arguably worse than no types at all. + +4. **Types cannot express temporal or behavioral properties.** "The JWT must be verified before the route handler runs" is a *sequencing* constraint that types cannot enforce. Middleware ordering in Hono is a runtime concern. Tests (or careful code review) are the only way to verify these properties. + +5. **The cost is real.** Maintaining precise types requires ongoing effort. When the `@cloudflare/sandbox` SDK changes, type definitions must be updated. When the API schema evolves, shared types must be versioned. This is maintenance burden that tests don't carry (tests just break visibly). + +--- + +## Conclusion + +Types and tests are not opponents -- they are complementary tools that operate at different levels of abstraction. Types prevent *structural* defects (wrong shapes, missing fields, impossible states). Tests prevent *behavioral* defects (wrong logic, incorrect sequences, integration failures). But in terms of cost-effectiveness, types win decisively: one type annotation prevents a class of bugs across every call site, while one test prevents one bug in one scenario. + +For PoeClaw/OpenClaw, the evidence from the codebase -- the `as unknown as` casts, the duplicated client/server types, the `as any` test mocks, the string-typed booleans -- shows that we are *under-invested* in types. The type system is already doing significant work (strict mode, Hono generics, `MoltbotEnv`), but we're leaving value on the table by not pushing it further. Investing in precise types will pay compounding dividends as this codebase grows. diff --git a/docs/philosophy/positions/03-integration-first.md b/docs/philosophy/positions/03-integration-first.md new file mode 100644 index 000000000..7d83616ab --- /dev/null +++ b/docs/philosophy/positions/03-integration-first.md @@ -0,0 +1,224 @@ +# Position 03: Integration-First Testing + +**Author:** Integration-First Testing Advocate +**Date:** 2026-02-15 +**Status:** Position paper for scientific debate + +--- + +## Core Thesis + +In a system whose primary purpose is to proxy, orchestrate, and relay between external boundaries — a Cloudflare Worker bridging clients, sandbox containers, R2 storage, and authentication providers — **the bugs that ship to production live at the seams, not in the functions**. Unit tests that mock `sandbox.exec()`, `sandbox.wsConnect()`, and `sandbox.containerFetch()` verify that your code calls the right mock in the right order; they do not verify that the system works. Integration tests that exercise real boundary crossings are the minimum viable correctness guarantee for this architecture. + +--- + +## Key Arguments + +### 1. The Mock Fidelity Problem Is Structural, Not Incidental + +Examine `sync.test.ts`. The test for a successful R2 sync chains **seven sequential mock return values**: + +```typescript +execMock + .mockResolvedValueOnce(createMockExecResult('yes')) // rclone configured + .mockResolvedValueOnce(createMockExecResult('openclaw')) // config detect + .mockResolvedValueOnce(createMockExecResult()) // rclone sync config + .mockResolvedValueOnce(createMockExecResult()) // rclone sync workspace + .mockResolvedValueOnce(createMockExecResult()) // rclone sync skills + .mockResolvedValueOnce(createMockExecResult()) // date > last-sync + .mockResolvedValueOnce(createMockExecResult(timestamp)) // cat last-sync +``` + +This test asserts that `syncToR2` calls `sandbox.exec()` seven times with the right strings. But the actual bug surface is: does rclone with these flags, these paths, and these credentials actually sync data to R2? Does the `--exclude='.git/**'` flag work with rclone's glob syntax or does it silently fail? Does `rclone sync` with `--s3-no-check-bucket` behave correctly when the bucket doesn't exist yet? The mock returns success unconditionally. The real system has opinions. + +This is not a failure of test-writing discipline — it is a structural property of testing code whose job is to orchestrate external systems via shell commands. When the function under test is essentially a shell script builder, mocking the shell is mocking the thing you need to test. + +### 2. The WebSocket Proxy Cannot Be Meaningfully Unit Tested + +The WebSocket proxy in `src/index.ts:282-429` is 150 lines of bidirectional event relay with error transformation. It: + +1. Creates a `WebSocketPair` (Workers runtime API) +2. Calls `sandbox.wsConnect()` (Sandbox Durable Object API) +3. Accepts both WebSockets (runtime method) +4. Attaches event listeners for message, close, and error on both sides +5. Parses JSON messages from the container, transforms error messages, re-serializes +6. Handles `readyState` checks before sending +7. Truncates close reasons to 123 bytes (WebSocket spec limit) + +To unit test this, you would need to mock: `WebSocketPair`, `sandbox.wsConnect()`, WebSocket `addEventListener`, `readyState`, `send`, `close`, `accept`, and `JSON.parse` behavior on binary vs. string messages. The resulting test would be a specification of the implementation, not a specification of the behavior. Change the implementation (e.g., switch from event listeners to async iteration), and the test breaks even if the behavior is identical. + +An integration test that opens a real WebSocket to a running worker, sends a message through to a real container, and verifies the response exercises the actual behavior. It would catch: WebSocket upgrade failures, event ordering bugs, readyState race conditions, binary message handling, and error transformation on real gateway responses. + +### 3. The Auth Flow Requires Real Token Verification + +The auth middleware (`src/auth/middleware.ts`) has two bypass modes (`DEV_MODE`, `E2E_TEST_MODE`) and a production path that: + +1. Extracts JWT from `CF-Access-JWT-Assertion` header or `CF_Authorization` cookie +2. Verifies the JWT signature against Cloudflare Access's JWKS endpoint +3. Validates the audience claim +4. Returns 401/403 with appropriate content types (JSON vs HTML) + +The unit tests verify JWT extraction logic and bypass mode detection — useful, but not the bug surface. The bugs that ship are: JWKS key rotation causing verification failures, audience mismatch between environments, cookie parsing edge cases in different browsers, and the interaction between CF Access's redirect flow and the WebSocket token injection (`src/index.ts:296-300`). Specifically, line 296-300 shows that when CF Access strips query params during redirect, the worker re-injects the gateway token. This interaction between auth redirect behavior and token injection is invisible to any unit test. + +### 4. The Gateway Lifecycle Has Concurrency Bugs That Only Manifest Under Load + +`ensureMoltbotGateway()` in `src/gateway/process.ts` implements a check-then-act pattern: + +1. Check for existing process (`findExistingMoltbotProcess`) +2. If found, wait for port readiness (3-minute timeout) +3. If timeout, kill and restart +4. If not found, start new process + +The comment on line 71-73 is telling: + +> *"Always use full startup timeout — a process can be 'running' but not ready yet (e.g., just started by another concurrent request). Using a shorter timeout causes race conditions where we kill processes that are still initializing."* + +This is a concurrency bug that was discovered in production, not in unit tests. The unit test for `findExistingMoltbotProcess` verifies pattern matching on command strings — it cannot exercise the race condition where two concurrent requests both find no existing process and both start a new gateway. An integration test that sends two concurrent requests to a cold worker would expose this class of bug immediately. + +### 5. The CDP Shim Has Zero Unit Tests Because It Can't Be Meaningfully Unit Tested + +The CDP (Chrome DevTools Protocol) shim in `src/routes/cdp.ts` implements 50+ CDP methods by translating CDP JSON-RPC over WebSocket into Puppeteer calls against Cloudflare Browser Rendering. It has zero unit tests. This is not an oversight — it's an admission that mocking Puppeteer's behavior (page navigation timing, DOM mutation, screenshot encoding, request interception) would produce tests that verify nothing about whether the CDP translation is correct. + +The only meaningful test for this module is an integration test that connects a CDP client (like Playwright or chrome-remote-interface), sends real CDP commands, and verifies real browser behavior. The existing E2E tests partially cover this (they use Playwright through the CDP shim), but targeted integration tests for individual CDP domains would catch protocol translation bugs that neither unit tests nor full E2E tests efficiently surface. + +--- + +## Counterarguments and Rebuttals + +### Against TDD Purists: "You need fast feedback loops" + +**The argument:** Unit tests run in milliseconds. Integration tests take seconds to minutes. Fast feedback loops are essential for developer productivity and TDD's red-green-refactor cycle. + +**The rebuttal:** I concede the speed advantage entirely. Unit tests are faster. But speed is not the only dimension of a feedback loop — *signal quality* matters too. A test suite that runs in 200ms and tells you "your mocks behave as expected" provides fast feedback about the wrong thing. A test suite that runs in 30 seconds and tells you "your worker correctly proxies a WebSocket connection through a real sandbox container" provides slower feedback about the right thing. + +The pragmatic synthesis: run integration tests on save for the module you're editing (scoped, not full suite). Use `vitest --watch` with a test filter. The feedback loop for a single integration test hitting a local miniflare instance is 2-5 seconds — fast enough for productive iteration. + +**Honest concession:** For pure logic functions (JWT parsing, command string building, config validation), unit tests are the right tool. I don't advocate eliminating them. I advocate demoting them from the primary testing strategy to a supplementary one. + +### Against Type Advocates: "Types catch it at compile time" + +**The argument:** TypeScript's type system catches interface mismatches, missing fields, wrong argument types. With strict types, many integration bugs become compile errors. + +**The rebuttal:** Types operate within a single compilation unit. They cannot span the worker-container boundary. The `Sandbox` type says `exec()` returns `Promise` — it cannot express that `rclone sync` with `--s3-no-check-bucket` silently succeeds but syncs zero files when the endpoint URL is wrong. The `WebSocket` type says `send()` accepts `string | ArrayBuffer` — it cannot express that the container's WebSocket sends binary frames for certain message types that the JSON parse in the error transformer will silently skip. + +Types are a necessary condition for correctness at boundaries, not a sufficient one. The type of `sandbox.exec(command)` is always `Promise` regardless of whether `command` is a valid rclone invocation or gibberish. The boundary is stringly-typed by nature. + +**Honest concession:** For the Hono route definitions, TypeScript's type system does catch real bugs (wrong response types, missing middleware). Types and integration tests are complementary, not competing. + +### Against Pragmatists: "Integration tests are slow and flaky" + +**The argument:** Integration tests require infrastructure setup, are sensitive to timing, and produce intermittent failures that erode trust in the test suite. + +**The rebuttal:** This is the strongest counterargument and I take it seriously. Integration test flakiness is a real engineering problem, not a strawman. My response has three parts: + +1. **Flakiness is a solvable engineering problem, not an inherent property.** The existing E2E tests in `test/e2e/` use Terraform to deploy real infrastructure and run Playwright against it. That's maximally flaky. I'm not proposing that. I'm proposing integration tests against `miniflare` (Cloudflare's local simulator) with a real sandbox container, which eliminates network variance and deployment timing. + +2. **The alternative is worse.** The current test suite has 130+ unit tests and zero integration tests (the E2E tests are a separate, heavy-weight suite). This means there is no automated verification that the worker correctly proxies requests, handles auth, or syncs to R2. The choice is not "flaky integration tests vs. reliable unit tests" — it's "flaky integration tests vs. no boundary testing at all." + +3. **Flaky tests that catch real bugs are more valuable than stable tests that catch hypothetical bugs.** A flaky WebSocket proxy integration test that fails 5% of the time due to timing — but catches real relay bugs the other 95% — provides more value than a stable unit test that verifies mock call ordering. + +**Honest concession:** Integration test infrastructure requires maintenance. Someone has to keep miniflare configs working, manage test container images, and handle the inevitable "works on my machine" problems. This is real cost that must be budgeted. + +### Against Anti-Mock Realists: Distinguishing Our Position + +**The argument:** Anti-mock advocates say "don't mock what you don't own." We largely agree but draw a different conclusion. + +**The distinction:** Anti-mock realism says: don't mock external interfaces, write thin wrappers, and test your logic in isolation. This is correct as far as it goes, but for PoeClaw it doesn't go far enough. The worker's logic *is* the orchestration of external interfaces. There is no "business logic" to isolate behind thin wrappers — the business logic is: "proxy this WebSocket, inject this token, transform this error, sync these files." If you extract the orchestration into thin wrappers and test the wrappers' callers in isolation, you've just moved the mocks one level up. + +The integration-first position says: stop trying to isolate the logic from the boundaries. The logic is the boundaries. Test the boundaries. + +**Point of agreement:** We share the core insight that mocks create a parallel universe that diverges from reality. Where we diverge is the remedy: anti-mock realists restructure code to minimize the need for mocks; integration-first advocates structure *tests* to minimize the need for mocks by testing against real systems. + +--- + +## Concrete Examples: Bugs Integration Tests Would Catch + +### Example 1: The rclone `--exclude` Flag Syntax Bug + +`syncToR2` passes `--exclude='*.lock'` to rclone via `sandbox.exec()`. But rclone's exclude syntax depends on the shell parsing. If the exec environment doesn't invoke a shell (direct exec), the single quotes are passed literally to rclone, which interprets `'*.lock'` as a literal filename pattern including quotes. The unit test mocks `sandbox.exec()` and never discovers this. An integration test running the actual rclone command in a real container would fail immediately. + +### Example 2: The WebSocket Token Injection Race + +Lines 296-300 of `src/index.ts` inject the gateway token into the WebSocket URL when CF Access strips query params. But `sandbox.wsConnect()` may not support URL mutation after the initial handshake in certain Sandbox versions. A unit test mocks `wsConnect` and returns a successful response regardless. An integration test would reveal that the mutated URL causes a 400 from the sandbox. + +### Example 3: The Gateway Process Kill-Restart Race + +When `ensureMoltbotGateway` kills a stuck process and starts a new one, the old process may hold the port briefly during teardown. The new process's `waitForPort` may succeed against the dying old process, then lose the connection. The unit test mocks `kill()` as instantly successful and `waitForPort` as deterministic. An integration test with a real container would expose the port-binding race. + +### Example 4: The R2 Sync Partial Failure Silencing + +`syncToR2` marks workspace and skills sync as non-fatal (`|| true`). If rclone silently fails on workspace sync (e.g., permission denied on a file), the function returns `{ success: true }` with only config actually synced. The user sees "sync successful" but their workspace isn't persisted. A unit test mocks all execs as successful. An integration test with a real file that has restricted permissions would reveal the silent data loss. + +### Example 5: The CDP Screenshot Encoding Mismatch + +The CDP shim implements `Page.captureScreenshot` by calling Puppeteer's `page.screenshot()` and returning base64-encoded data. If Puppeteer returns a `Buffer` and the CDP response expects a base64 string without the data URI prefix, the client receives corrupted data. No unit test can catch this because there's no Puppeteer to return a real `Buffer`. An integration test with a real browser connection would catch the encoding mismatch immediately. + +--- + +## Proposed Rules for PoeClaw + +### Rule 1: Every System Boundary Gets an Integration Test + +For each boundary crossing (Worker ↔ Container, Worker ↔ R2, Worker ↔ Client, Worker ↔ CF Access), write at least one integration test that exercises the real protocol. No mocks at the boundary. + +### Rule 2: Unit Tests for Pure Logic Only + +Unit tests are appropriate for: +- `buildEnvVars()` — pure data transformation +- `findExistingMoltbotProcess()` — pattern matching (though integration tests should also cover it) +- JWT extraction logic — string parsing +- `transformErrorMessage()` — pure string transformation +- Config validation — predicate logic + +Unit tests are **not** appropriate for: +- `syncToR2()` — shell command orchestration +- `ensureMoltbotGateway()` — process lifecycle with concurrency +- WebSocket proxy — runtime API interaction +- CDP shim — protocol translation +- Auth middleware end-to-end — multi-step verification + +### Rule 3: Use Miniflare for Local Integration Tests + +The integration test environment should be `miniflare` + a real sandbox container (or a lightweight container stub). This is faster than full deployment (E2E) but exercises real APIs. Target: individual integration tests complete in under 10 seconds. + +### Rule 4: Integration Tests Are First-Class CI Citizens + +Integration tests run in CI on every PR, not as a nightly job or manual step. If they're too slow for PR checks, that's a signal to invest in faster test infrastructure, not to demote them. + +### Rule 5: Flaky Tests Get Fixed, Not Deleted + +A flaky integration test is a test that's detecting a real timing-dependent bug. Investigate the flake. Add retries with backoff for infrastructure variance, but never silence a flaky test by deleting it or marking it as skipped without understanding the root cause. + +### Rule 6: The Test Pyramid Is Inverted for Proxy Architectures + +The traditional test pyramid (many unit tests, fewer integration tests, few E2E tests) assumes that most logic is internal computation. For a proxy/orchestrator like PoeClaw, most logic is boundary crossing. The appropriate shape is: + +``` + /‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾\ E2E (few, full deployment) + /‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾\ Integration (many, real boundaries) + \_______________________/ Unit (few, pure logic only) +``` + +--- + +## Acknowledged Weaknesses + +1. **Setup complexity.** Integration tests require a running sandbox container, R2 credentials (or emulator), and miniflare configuration. This is more infrastructure than `vitest` alone. + +2. **Slower feedback.** A full integration test suite will take 30-120 seconds vs. < 1 second for unit tests. This is a real cost for iterative development. + +3. **Environment parity.** Miniflare is not identical to the Cloudflare Workers runtime. There will be gaps. The CDP shim in particular requires real Browser Rendering, which is not available locally. + +4. **Debugging difficulty.** When an integration test fails, the failure may be in any layer of the stack. Unit test failures point directly at the broken function. Integration test failures require investigation. + +5. **Test data management.** Integration tests that touch R2 need bucket setup/teardown. Tests that start gateway processes need cleanup. This is ongoing maintenance burden. + +These are real costs. I argue they are worth paying because the alternative — a test suite that validates nothing about the system's actual boundary behavior — is more expensive in production incidents. + +--- + +## Summary + +PoeClaw is not a computation engine. It is a proxy, orchestrator, and boundary-crossing system. Its correctness properties are defined at the seams between components, not within them. A testing strategy that focuses on unit-testing isolated functions with mocked boundaries is testing the wrong thing with great precision. Integration tests that exercise real boundary crossings — real sandbox containers, real WebSocket connections, real rclone invocations — test the right thing with acceptable imprecision. + +The question is not "are integration tests harder to write?" They are. The question is "do integration tests catch the bugs that matter?" In a proxy architecture, they do. That is why they should be first. diff --git a/docs/philosophy/positions/04-pragmatic-minimalist.md b/docs/philosophy/positions/04-pragmatic-minimalist.md new file mode 100644 index 000000000..6138664d6 --- /dev/null +++ b/docs/philosophy/positions/04-pragmatic-minimalist.md @@ -0,0 +1,209 @@ +# Position 04: The Pragmatic Testing Minimalist + +**Author:** Pragmatist +**Position:** Test only what matters. Invest testing effort where the risk-reward ratio justifies it. + +--- + +## Core Thesis + +Testing is an economic activity, not a moral one. Every test written carries an ongoing maintenance cost that must be weighed against the bugs it will actually catch. In an experimental, rapidly-iterating project like PoeClaw — where entire subsystems may be rewritten or discarded within weeks — the correct testing strategy is surgical precision: test the invariants that protect users and data, skip the glue code that the type system already validates, and never confuse test count with test value. + +--- + +## Key Arguments + +### 1. This Project's Lifecycle Demands Testing Discipline, Not Testing Volume + +PoeClaw is explicitly labeled "experimental" and "proof of concept." The design document (`docs/plans/2026-02-15-poeclaw-design.md`) lists five implementation phases, and the current codebase sits at roughly Phase 0.5 — a single-tenant sandbox wrapper. Phases 1 through 4 will introduce session-based auth (replacing the current CF Access JWT flow), Poe provider integration, per-user R2 persistence, and an entirely new chat frontend. Each of these phases will rewrite or discard existing code. + +Writing exhaustive tests for code with a half-life of weeks is not engineering discipline — it is waste. The Lean Software Development principle of "decide as late as possible" applies to test investment as much as to architectural decisions. Testing the current `extractJWT` cookie-parsing logic (`src/auth/middleware.ts:32-41`) down to every edge case is futile when the design document explicitly calls for replacing it with a session-cookie system. + +**What to do instead:** Test the *invariants* that will survive the rewrites. The invariant "unauthenticated requests must be rejected" will persist regardless of whether auth uses JWTs or sessions. The invariant "R2 sync must not silently lose data" will persist regardless of whether it's single-tenant or multi-tenant. Test the contract, not the implementation. + +### 2. The Existing Test Suite Already Demonstrates Optimal ROI Testing + +The current 7 test suites (~900 lines) are a case study in pragmatic testing done well. Consider what they cover: + +- **`jwt.test.ts`**: Signature validation, expiry, audience/issuer checks. These are *security invariants* — a failure here means unauthorized access. High-severity, hard to catch by inspection. Correct to test. +- **`process.test.ts`**: Gateway vs. CLI command discrimination (`src/gateway/process.ts:19-31`). This is subtle string-matching logic with legacy compatibility requirements across four command patterns. A regression here means the worker kills the gateway or fails to find it. Correct to test. +- **`sync.test.ts`**: R2 persistence correctness — the difference between `rclone sync` (propagates deletions) and `rclone copy` (doesn't) is a data-loss bug. The exclusion patterns (`.git`, `.lock`, `.log`, `.tmp`) protect against syncing transient files. Correct to test. +- **`logging.test.ts`**: Sensitive parameter redaction. A failure means leaking tokens/passwords to logs. Correct to test. + +Now consider what they *don't* test: + +- `src/routes/public.ts` (70 lines): Health checks and asset serving. If these break, you'll know in seconds — the page won't load. +- `src/routes/admin-ui.ts` (19 lines): A one-line route delegation. The type system guarantees this compiles correctly. Testing it tests Hono's router, not your code. +- `src/assets/`: Static HTML templates. Testing markup structure is testing string literals. +- `src/index.ts` WebSocket relay logic (lines 282-429): Complex, but it's orchestration glue between the Sandbox SDK and the WebSocket API. The failure mode is visible immediately (the connection drops), and the logic is hard to unit-test without mocking away the parts that matter. + +This selective approach is not laziness. It is the recognition that testing effort follows a power law: a small number of well-chosen tests catch a disproportionate share of real bugs. + +### 3. Over-Testing Creates Concrete Costs That Under-Testing Advocates Ignore + +Every test has three costs beyond the initial writing: + +1. **Maintenance cost**: When the implementation changes, tests must be updated. The `sync.test.ts` suite has 7 sequential mock resolutions per test case (`execMock.mockResolvedValueOnce(...)` chained 7 times). If the sync implementation adds a step — say, a pre-sync integrity check — every test case breaks. This is not hypothetical; the codebase has already undergone a `clawdbot → openclaw` rename and an `s3fs/rsync → rclone` migration (git log: `95307a3`). +2. **False confidence cost**: A green test suite creates psychological permission to deploy. But if those tests are testing mocked interactions rather than real behaviors, the confidence is misplaced. The `sync.test.ts` tests verify that the correct rclone CLI strings are constructed — they do *not* verify that rclone actually syncs data correctly. That's an integration concern that unit tests fundamentally cannot address. +3. **Velocity cost**: In a project where the design document has 5 open questions ("Does `api.poe.com/v1/models` return a stable user/account ID?", "Can OpenClaw run in 1 GiB RAM?"), the scarce resource is iteration speed, not test coverage percentage. Every hour spent writing tests for code that may be deleted next week is an hour not spent answering those existential questions. + +### 4. The Risk Profile of This Codebase Is Highly Non-Uniform + +Not all code in PoeClaw carries equal risk. A testing strategy must respect this asymmetry: + +| Component | Severity if Broken | Detectability | Test Value | +|-----------|-------------------|---------------|------------| +| JWT verification | **Critical** — unauthorized access | Low (silent) | **High** | +| Process discrimination | **High** — kills gateway or starts duplicates | Medium (logs) | **High** | +| R2 sync correctness | **High** — data loss | Low (silent until next restore) | **High** | +| Credential redaction | **High** — secret leakage | Low (requires log review) | **High** | +| Env var building | Medium — wrong AI provider config | High (immediate error) | Medium | +| WebSocket relay | Medium — connection drops | High (user sees immediately) | Low | +| Route registration | Low — 404s | High (immediate) | **None** | +| HTML templates | Low — visual glitch | High (immediate) | **None** | +| Admin UI (React) | Low — cosmetic | High (immediate) | **None** | + +The correct strategy is to concentrate testing effort in the upper-left quadrant: high severity, low detectability. These are the bugs that slip through manual testing and cause real harm. Everything in the lower-right quadrant — low severity, high detectability — will be caught by the first person who opens the page. + +### 5. The 80/20 Rule Is Empirically Supported + +The Pareto principle in software defects is well-documented. Studies from Microsoft Research (Nagappan et al., 2005) and IBM (Fenton & Ohlsson, 2000) consistently show that a small fraction of modules contain most defects. In PoeClaw: + +- The `src/gateway/` directory (4 files, ~330 lines) handles process lifecycle, R2 persistence, and environment configuration — the three areas where silent failures cause real damage. +- The `src/auth/` directory (2 files, ~190 lines) handles the security boundary. +- Together, these ~520 lines represent about 8.5% of the production codebase but contain nearly all the high-severity bug surface. + +The existing ~900 lines of tests are almost entirely focused on these two directories. That's the 80/20 rule in action: 8.5% of the code gets nearly 100% of the testing attention. + +--- + +## Counterarguments and Rebuttals + +### Against the TDD Purist: "You're just being lazy" + +**Their claim:** Every function should be test-driven. Writing tests first ensures design quality and prevents regression. Skipping tests is a slippery slope to technical debt. + +**Rebuttal:** The TDD purist conflates the *discipline* of TDD (think before coding) with the *artifact* of TDD (a comprehensive test suite). I advocate for the discipline — careful design, consideration of edge cases, defensive coding — without insisting that every moment of thought be serialized into a `.test.ts` file. + +Consider `src/routes/admin-ui.ts` (19 lines). The TDD approach would have me write a test asserting that `GET /admin` returns an HTML response. But the entire file is: + +```typescript +app.route('/admin', adminUIRoutes); +``` + +What am I testing? That Hono's router works? That's Hono's test suite's job. The TDD purist would counter that this test catches regressions if someone changes the route. But a route change is a *deliberate* act, not an accidental regression — and the test would need to be updated to match, adding cost with zero bug-catching value. + +Laziness is writing no tests for JWT verification. Discipline is recognizing that some code doesn't warrant tests and investing that time where it matters. + +### Against the Type Advocate: "Types + tests = belt and suspenders" + +**Their claim:** TypeScript catches type errors at compile time, but runtime behavior can still diverge. You need both static types and dynamic tests for full confidence. + +**Rebuttal:** I agree with the premise but dispute the conclusion. Types and tests are not additive — they are *substitutive* in many cases. When TypeScript's strict mode guarantees that `buildEnvVars(env: MoltbotEnv)` receives a `MoltbotEnv` object, I don't need a test asserting that it throws on `null` input. The compiler already makes that impossible at the call site. + +Where types and tests are genuinely complementary is at *system boundaries*: the JWT that arrives as an opaque string, the rclone stdout that's parsed as text, the process list that comes from the Sandbox SDK. These boundaries are where runtime behavior diverges from type assumptions, and these are exactly where the current test suite focuses. + +The belt-and-suspenders metaphor proves my point. You wear a belt *or* suspenders — wearing both is redundancy, not safety. The question is which tool is better suited to each risk. For structural correctness (does this function accept the right types?), use the type system. For behavioral correctness (does this JWT verification reject expired tokens?), use tests. Don't use both for the same risk. + +### Against the Integration Tester: "You still need system-level verification" + +**Their claim:** Unit tests with mocks don't prove the system works end-to-end. You need integration tests that verify real Sandbox interactions, real R2 syncs, and real WebSocket connections. + +**Rebuttal:** I don't disagree — I agree that integration tests are valuable. My position is about *where the line is*, not whether integration tests exist. The project already has an `test/e2e/` directory with Terraform fixtures and browser automation for precisely this purpose. + +But the integration tester's argument actually *strengthens* my position against over-unit-testing. If the ultimate proof that `syncToR2` works is an integration test that actually syncs to R2, then the unit test that verifies the rclone command string is constructed correctly is a *lower-fidelity proxy* for the same assertion. It's not worthless — it catches regressions in the command construction cheaply and fast — but it's also not the kind of test you should be adding more and more of. The marginal unit test for sync has diminishing returns precisely because the integration test is where real confidence comes from. + +The pragmatic hierarchy is: a few high-fidelity integration tests for end-to-end confidence, plus targeted unit tests for complex logic that's hard to exercise through integration tests (JWT crypto, process matching heuristics). What you *don't* need is unit tests for the glue between them. + +### Against the CI Guardrails Advocate: "Automation covers what you skip" + +**Their claim:** If tests are automated in CI, the maintenance cost is near-zero. Just write them, add them to the pipeline, and let the machines do the work. + +**Rebuttal:** The CI advocate correctly identifies that *running* tests is cheap. But they undercount the cost of *maintaining* tests when code changes. CI doesn't write test updates for you — developers do. + +In PoeClaw's case, the `sync.test.ts` suite demonstrates the problem clearly. Each test case requires mocking 7 sequential `sandbox.exec()` calls in the exact right order with the exact right return values. When the sync implementation changed from s3fs/rsync to rclone (`95307a3`), every mock chain in this file had to be rewritten. CI caught the *failures* instantly, yes. But a developer still had to spend time rewriting the tests. If those had been 20 test cases instead of 7, that's 3x the developer time for a migration that the team had already decided to make. + +CI guardrails are most valuable for tests that are *stable* — tests for invariants that don't change when implementations change. Those are exactly the tests I advocate writing. The tests I advocate *not* writing are the ones that break on every refactor, turning CI from a guardrail into a speed bump. + +--- + +## Testing Framework for PoeClaw + +### ALWAYS Test (Non-Negotiable) + +| Category | Why | Example in Codebase | +|----------|-----|-------------------| +| **Security boundaries** | Failures are silent and critical | JWT verification (`auth/jwt.ts`), credential redaction (`utils/logging.ts`) | +| **Data integrity invariants** | Failures cause data loss | R2 sync direction (`rclone sync` vs `copy`), exclusion patterns | +| **Complex discrimination logic** | Multiple interacting conditions with non-obvious edge cases | Process matching (`gateway/process.ts:19-31`) — 4 gateway patterns, 5 CLI patterns, 2 status checks | +| **State machine transitions** | Wrong state = wrong behavior, hard to catch visually | Gateway lifecycle (starting → running → stuck → killed → restarted) | +| **Parsing and extraction** | Untrusted input from outside the type system | JWT extraction from cookies, JSON parsing from CLI stdout | + +### SOMETIMES Test (Judgment Call) + +| Category | When to test | When to skip | +|----------|-------------|-------------| +| **Configuration builders** | When the mapping is non-obvious or has conditional logic | When it's a 1:1 property mapping that TypeScript validates | +| **Error handling paths** | When the error response contains security-sensitive information or user-facing messages | When it's a generic 500 with a logged stack trace | +| **Orchestration logic** | When the ordering of operations matters (e.g., rclone config before sync) | When it's "call A, then call B" with no branching | + +### NEVER Test (Actively Resist) + +| Category | Why | Example in Codebase | +|----------|-----|-------------------| +| **Route registration** | Tests Hono, not your code. A 404 is immediately visible. | `routes/admin-ui.ts`, `routes/public.ts` | +| **Static HTML/templates** | String literal assertions test nothing. Visual bugs are caught visually. | `src/assets/*.html` | +| **One-line delegations** | The type system validates the call. The test is a tautology. | `app.route('/admin', adminUIRoutes)` | +| **Third-party SDK behavior** | Not your bug, not your test. Mock it at the boundary. | Testing that `sandbox.startProcess()` actually starts a process | +| **Console.log/debug output** | Format changes are not bugs. Log assertions are the most brittle tests. | `console.log('[Gateway] OpenClaw gateway is ready!')` | +| **Code slated for replacement** | The design document marks it for rewrite. Testing it is writing throwaway code to protect throwaway code. | Current cookie-parsing JWT extraction (being replaced by session auth) | + +--- + +## Proposed Rules for PoeClaw + +### Rule 1: The Severity × Detectability Test + +Before writing a test, answer two questions: + +1. **If this code breaks, how bad is it?** (Critical / High / Medium / Low) +2. **If this code breaks, how quickly will someone notice?** (Seconds / Minutes / Hours / Days+) + +Only write tests when severity is High+ AND detectability is Hours+. Everything else is caught by types, by the user, or by integration tests. + +### Rule 2: Test Contracts, Not Implementations + +Tests should assert *what* the code promises, not *how* it does it. Good: "unauthenticated requests return 401." Bad: "the middleware calls `verifyAccessJWT` with the token from the `CF-Access-JWT-Assertion` header." The first survives a refactor. The second breaks on any internal change. + +### Rule 3: One Invariant, One Test + +Each test should protect exactly one invariant. If you can't name the invariant in the `describe` block (e.g., "rejects expired tokens," "propagates deletions via rclone sync"), the test is probably testing implementation details rather than behavior. + +### Rule 4: Mocking Budget — Three Deep, No More + +If a test requires mocking more than 3 layers of dependencies, it's a sign that the code under test is orchestration glue, not testable logic. Extract the logic into a pure function and test that, or accept that it's integration-test territory and test it there. + +### Rule 5: Delete Tests That Cry Wolf + +If a test has broken more than twice due to non-bug refactors (implementation changes where the behavior was preserved), delete it. It's testing implementation details, and its maintenance cost has exceeded its bug-catching value. Track this informally — if updating a test feels routine rather than alarming, that's the signal. + +### Rule 6: New Tests Require a Threat Model + +When adding a new test, include a one-line comment stating the threat: what specific failure mode does this test catch? This forces the author to articulate the test's value and makes it easy to evaluate during refactors whether the threat still applies. + +```typescript +// Threat: CLI commands (e.g., "openclaw devices list") incorrectly matched as gateway process +it('returns null when only CLI commands are running', async () => { ... }); +``` + +### Rule 7: Coverage Targets Are Forbidden + +Do not set coverage thresholds. A coverage target incentivizes writing low-value tests to hit a number. Instead, review *which* code is covered and ask: "Is the uncovered code high-severity and low-detectability?" If not, leave it uncovered. + +--- + +## Conclusion + +The pragmatic minimalist position is not anti-testing — it is anti-waste. The current PoeClaw test suite is approximately right: it focuses on security boundaries, data integrity, and complex discrimination logic while leaving route registration, HTML templates, and orchestration glue untested. The correct response to "we should have more tests" is not "no," but "where, and why?" If the answer doesn't reference a specific failure mode with high severity and low detectability, the test isn't worth writing. + +In an experimental project with a 5-phase roadmap and multiple open design questions, the highest-leverage use of developer time is answering those questions — not writing tests for code that may not exist next month. Test the invariants that will survive the rewrites. Skip everything else. This is not laziness; it is the engineering judgment to allocate scarce resources where they create the most value. diff --git a/docs/philosophy/positions/05-property-based.md b/docs/philosophy/positions/05-property-based.md new file mode 100644 index 000000000..9edb0ce35 --- /dev/null +++ b/docs/philosophy/positions/05-property-based.md @@ -0,0 +1,523 @@ +# Position 05: Property-Based Testing + +**Author:** Property-Based Testing Advocate +**Date:** 2026-02-15 +**Status:** Position Paper for Development Philosophy Debate + +--- + +## Core Thesis + +Example-based tests verify that specific inputs produce specific outputs; property-based tests verify that *all* inputs satisfy *invariants*. For a codebase with cryptographic authentication, multi-tier configuration precedence, stateful sync operations, and process lifecycle management, the input space is combinatorially explosive and the consequences of missed edge cases range from data loss to security bypass. Property-based testing with frameworks like fast-check does not replace example-based tests but occupies a strictly superior position for the classes of bugs that matter most in PoeClaw: those that arise from unexpected input combinations, ordering dependencies, and state transitions that no developer would think to write by hand. + +--- + +## Key Arguments + +### 1. The `buildEnvVars` Precedence Logic Is a Textbook Case for Properties + +The current `buildEnvVars` function (`src/gateway/env.ts`) implements a multi-tier precedence system: Cloudflare AI Gateway keys, legacy AI Gateway overrides, direct provider keys, token remapping, and channel configuration. The existing 15 example-based tests (`src/gateway/env.test.ts`) cover individually chosen scenarios, but they cannot cover the combinatorial space. + +There are roughly 20 optional input fields. The interesting behavior lives in the *interactions* between them: when `AI_GATEWAY_API_KEY` and `AI_GATEWAY_BASE_URL` are both present, they override `ANTHROPIC_API_KEY` (line 34). When only one is present, the override doesn't fire. When `ANTHROPIC_BASE_URL` is also set, it's only used if the legacy gateway path isn't taken (line 35-37). + +**The property that must hold:** + +``` +For all env: MoltbotEnv, + let result = buildEnvVars(env) + // Precedence invariant + if env.AI_GATEWAY_API_KEY AND env.AI_GATEWAY_BASE_URL: + result.ANTHROPIC_API_KEY === env.AI_GATEWAY_API_KEY + else if env.ANTHROPIC_API_KEY: + result.ANTHROPIC_API_KEY === env.ANTHROPIC_API_KEY + + // No phantom keys: output keys are a subset of known keys + Object.keys(result) is a subset of KNOWN_OUTPUT_KEYS + + // Idempotency of URL normalization + result.AI_GATEWAY_BASE_URL does not end with '/' + + // No credential leakage: no input value appears under an unexpected key + for each value in Object.values(result): + value appears in Object.values(env) +``` + +The existing tests check 15 hand-picked points in a space of 2^20 combinations. A property test with fast-check generates thousands of random `MoltbotEnv` configurations and verifies the precedence invariant holds across *all of them*. This is not a theoretical advantage. The precedence logic at lines 29-37 has a subtle interaction: if you set `AI_GATEWAY_API_KEY` but *not* `AI_GATEWAY_BASE_URL`, and you also set `ANTHROPIC_API_KEY`, the direct key is used. But what if `AI_GATEWAY_BASE_URL` is an empty string? The current code treats empty string as falsy, which is correct in JavaScript, but a property test would have found and documented this boundary explicitly. + +### 2. JWT Verification Has Security Properties That Demand Formal Expression + +The JWT verification in `src/auth/jwt.ts` delegates to `jose`, but the *wrapper* has its own logic: team domain normalization (line 22). The current tests (`src/auth/jwt.test.ts`) mock `jose` entirely, testing that `verifyAccessJWT` calls `jwtVerify` with the right arguments. This is a test of wiring, not of security properties. + +The properties that matter: + +``` +// P1: Domain normalization is idempotent +For all domain: string, + normalize(normalize(domain)) === normalize(domain) + where normalize(d) = d.startsWith('https://') ? d : `https://${d}` + +// P2: No domain produces an invalid URL +For all domain: string (non-empty, valid hostname chars), + new URL(`${normalize(domain)}/cdn-cgi/access/certs`) does not throw + +// P3: Verification is a total function on the error path +For all token, domain, aud: string, + verifyAccessJWT(token, domain, aud) either resolves to JWTPayload or rejects + (never hangs, never returns undefined) +``` + +The CDP module's `timingSafeEqual` (`src/routes/cdp.ts:1907`) is another candidate. The current implementation has a known weakness: it returns `false` early when lengths differ (line 1908-1910), leaking length information via timing. A property test can express this: + +``` +// P4: Comparison time does not correlate with matching prefix length +For all a, b: string where a.length === b.length, + timingSafeEqual(a, b) completes in O(n) regardless of match position +``` + +This is not something an example-based test can express. You'd need to write an unbounded number of examples, or you need a property. + +### 3. R2 Sync Idempotency and Failure Isolation Are Compositional Properties + +`syncToR2` (`src/gateway/sync.ts`) performs three sequential sync operations with different failure semantics: config sync failure is fatal (line 59-65), workspace sync is non-fatal (line 68-71), skills sync is non-fatal (line 74-76). This is a compositional property: + +``` +// P1: Config failure is always fatal +For all env, sandbox where configSync fails, + syncToR2(sandbox, env).success === false + +// P2: Workspace/skills failures are never fatal +For all env, sandbox where configSync succeeds AND workspaceSync fails, + syncToR2(sandbox, env).success === true + +// P3: Successful sync always produces a timestamp +For all env, sandbox where syncToR2(sandbox, env).success === true, + syncToR2(sandbox, env).lastSync is a valid ISO 8601 string + +// P4: Config directory detection is deterministic +For all sandbox state, + detectConfigDir(sandbox) called twice yields same result +``` + +The existing `sync.test.ts` tests 5 scenarios. But the interaction between `ensureRcloneConfig` returning false, `detectConfigDir` returning null, config sync failing, workspace sync failing, and skills sync failing creates at least 2^5 = 32 paths. Properties test all of them implicitly. + +### 4. Process Discovery Is a Classification Problem Over Structured Strings + +`findExistingMoltbotProcess` (`src/gateway/process.ts:13-42`) implements a classifier: given a process command string and status, determine whether it's a running gateway. The classification logic uses substring matching with an exclusion list. This is exactly the kind of logic where property-based testing excels: + +``` +// P1: CLI commands are never classified as gateway processes +For all cmd containing "openclaw devices" | "openclaw --version" | "openclaw onboard", + classify(cmd, "running") === null + +// P2: Gateway commands are classified when running/starting +For all cmd containing "openclaw gateway" | "start-openclaw.sh", + AND NOT containing any CLI command substring, + classify(cmd, "running") !== null + classify(cmd, "starting") !== null + classify(cmd, "completed") === null + +// P3: Classification is prefix-independent +For all prefix: string, cmd: gateway_command, + classify(prefix + cmd, status) === classify(cmd, status) + // This property might FAIL, revealing that substring matching + // is sensitive to command prefixes (e.g., "/usr/bin/env openclaw gateway") +``` + +The existing 9 tests in `process.test.ts` cover the known cases. But property P3 would immediately reveal whether wrapping the command in a shell prefix (e.g., `bash -c 'openclaw gateway'`) breaks detection. This is the kind of bug that only manifests when the container runtime changes how it spawns processes. No human would think to write that example; fast-check generates it naturally. + +### 5. CDP Session State Has Type-State Invariants + +The CDP WebSocket handler (`src/routes/cdp.ts`) maintains a `CDPSession` with monotonically increasing counters (`nodeIdCounter`, `objectIdCounter`), maps that must stay consistent (`nodeMap`, `objectMap`, `pages`), and a protocol contract (response IDs must match request IDs). + +``` +// P1: Node IDs are unique within a session +For all sequences of querySelector/querySelectorAll calls, + all returned nodeIds are distinct + +// P2: Response ID matches request ID +For all request: CDPRequest, + handle(session, request).id === request.id + +// P3: Target lifecycle consistency +For all targetId: string, + after closeTarget(targetId), + session.pages.has(targetId) === false + AND any subsequent command targeting targetId returns an error + +// P4: Object release prevents use-after-free +For all objectId: string, + after releaseObject(objectId), + callFunctionOn({objectId}) returns an error +``` + +The CDP module has *zero* tests currently. It's 1,920 lines of untested protocol translation. Property-based tests are the most efficient way to bootstrap coverage here: define the protocol invariants, generate random sequences of CDP commands, and verify the invariants hold. This is more effective than writing 200 individual example tests because the generator explores ordering effects and state combinations that humans systematically miss. + +--- + +## Counterarguments and Rebuttals + +### Against TDD Purists: "Properties are harder to write than examples" + +**Acknowledged:** Property-based tests have a higher initial cognitive cost. Writing `fc.property(fc.string(), fc.string(), (a, b) => ...)` requires thinking about invariants rather than input/output pairs. The learning curve is real: expect 2-4 hours for a developer's first property test to be productive. + +**Rebuttal:** The difficulty of writing the property *is the point*. If you cannot articulate the invariant, you do not understand the specification. The `buildEnvVars` precedence logic is a perfect example: the existing example tests encode the developer's *current understanding* of the precedence rules. A property test forces you to write the precedence rules as executable specification, which either confirms or contradicts the implementation. This is strictly more valuable than examples that confirm individual cases. + +Furthermore, the cost amortizes. Once you have a `MoltbotEnv` generator (an `fc.record` with optional string fields), you reuse it across every test that touches environment configuration. The generator becomes a project asset. + +**Concession:** For pure CRUD endpoints and simple data transformations, example-based tests are sufficient and more readable. I am not arguing for properties everywhere. I am arguing for properties where the input space is combinatorial and the invariants are non-trivial. + +### Against Pragmatists: "This is over-engineering for a PoC" + +**Acknowledged:** PoeClaw is evolving rapidly. Spending time on property tests that may be invalidated by architecture changes is a real cost. + +**Rebuttal:** The components I've identified are *not* PoC-level throwaway code. JWT authentication is security infrastructure. Environment variable precedence is configuration correctness. R2 sync is data durability. These components will survive any pivot because they're foundational plumbing. A bug in `buildEnvVars` precedence means credentials route to the wrong provider. A bug in `timingSafeEqual` means authentication can be bypassed via timing attack. These are not "over-engineering" concerns; they are "the system works correctly" concerns. + +Moreover, property tests for `buildEnvVars` would take approximately 30 minutes to write given the existing test infrastructure. The ROI is immediate: you get coverage of 2^20 input combinations for the cost of articulating 4-5 invariants. The pragmatic choice is the one that gives more coverage per hour of engineering time, and for combinatorial logic, that's properties. + +**Concession:** I would not argue for property tests on the Hono route handlers, the WebSocket upgrade negotiation, or the HTML templates. Those are integration-level concerns better served by integration tests or manual verification. Property tests target the pure logic beneath. + +### Against Integration Testers: "Properties test in isolation, not in context" + +**Acknowledged:** A property test on `buildEnvVars` doesn't tell you whether the container actually receives the environment variables. A property test on `findExistingMoltbotProcess` doesn't tell you whether Cloudflare's sandbox API actually returns processes in the expected format. + +**Rebuttal:** This is a misunderstanding of the testing pyramid, not an argument against properties. Property tests and integration tests answer different questions: + +- **Property test:** "Does `buildEnvVars` correctly implement the precedence specification for all possible inputs?" +- **Integration test:** "Does the container receive the environment variables that `buildEnvVars` produces?" + +Both are necessary. Neither subsumes the other. But the property test is *cheaper to run* (milliseconds vs. seconds for container startup), *more thorough* (thousands of inputs vs. a handful), and *more maintainable* (invariants change less frequently than wiring). + +The process classification logic in `findExistingMoltbotProcess` is a pure function from `(command: string, status: string) => boolean`. Testing it with property-based methods requires no mocking, no sandbox, and no integration. The integration test confirms that `sandbox.listProcesses()` returns the expected shape. The property test confirms that the classifier handles all strings correctly. These are complementary, not competing. + +### Against Anti-Mock Realists: "Property tests still use synthetic inputs" + +**Acknowledged:** fast-check generates random strings, not real JWT tokens. It generates random `MoltbotEnv` objects, not real Cloudflare Worker bindings. The inputs are synthetic by definition. + +**Rebuttal:** This objection conflates "synthetic" with "unrealistic." A fast-check string generator produces strings that are *more adversarial* than real-world inputs: empty strings, strings with null bytes, strings with Unicode edge cases, extremely long strings. If your `timingSafeEqual` breaks on a 0-length string, fast-check will find it. If your URL normalization chokes on a string with embedded newlines, fast-check will find it. Real-world inputs are a *subset* of what fast-check generates. + +For `buildEnvVars`, the generator would be: + +```typescript +const envArb = fc.record({ + ANTHROPIC_API_KEY: fc.option(fc.string()), + AI_GATEWAY_API_KEY: fc.option(fc.string()), + AI_GATEWAY_BASE_URL: fc.option(fc.string()), + ANTHROPIC_BASE_URL: fc.option(fc.string()), + // ... remaining fields +}, { requiredKeys: [] }); +``` + +This generates realistic-shaped configurations with random presence/absence of each field. It's not testing with "fake data" - it's testing with "all possible configurations." The synthetic inputs are the *strength*, not the weakness. + +**Concession:** For testing behavior that depends on specific external formats (e.g., the exact structure of Cloudflare Access JWKS responses), property tests need custom generators that reflect the real format. Naive string generation won't help. But this is a generator design problem, not a fundamental limitation. + +--- + +## Specific Property Tests for This Codebase + +### Test 1: `buildEnvVars` Precedence Properties + +```typescript +import * as fc from 'fast-check'; +import { buildEnvVars } from './env'; + +const KNOWN_OUTPUT_KEYS = new Set([ + 'CLOUDFLARE_AI_GATEWAY_API_KEY', 'CF_AI_GATEWAY_ACCOUNT_ID', + 'CF_AI_GATEWAY_GATEWAY_ID', 'ANTHROPIC_API_KEY', 'OPENAI_API_KEY', + 'AI_GATEWAY_BASE_URL', 'ANTHROPIC_BASE_URL', 'OPENCLAW_GATEWAY_TOKEN', + 'OPENCLAW_DEV_MODE', 'TELEGRAM_BOT_TOKEN', 'TELEGRAM_DM_POLICY', + 'DISCORD_BOT_TOKEN', 'DISCORD_DM_POLICY', 'SLACK_BOT_TOKEN', + 'SLACK_APP_TOKEN', 'CF_AI_GATEWAY_MODEL', 'CF_ACCOUNT_ID', + 'CDP_SECRET', 'WORKER_URL', 'R2_ACCESS_KEY_ID', + 'R2_SECRET_ACCESS_KEY', 'R2_BUCKET_NAME', +]); + +const moltbotEnvArb = fc.record({ + ANTHROPIC_API_KEY: fc.option(fc.string({ minLength: 1 }), { nil: undefined }), + OPENAI_API_KEY: fc.option(fc.string({ minLength: 1 }), { nil: undefined }), + AI_GATEWAY_API_KEY: fc.option(fc.string({ minLength: 1 }), { nil: undefined }), + AI_GATEWAY_BASE_URL: fc.option(fc.string({ minLength: 1 }), { nil: undefined }), + ANTHROPIC_BASE_URL: fc.option(fc.string({ minLength: 1 }), { nil: undefined }), + CLOUDFLARE_AI_GATEWAY_API_KEY: fc.option(fc.string({ minLength: 1 }), { nil: undefined }), + MOLTBOT_GATEWAY_TOKEN: fc.option(fc.string({ minLength: 1 }), { nil: undefined }), + TELEGRAM_BOT_TOKEN: fc.option(fc.string({ minLength: 1 }), { nil: undefined }), + // ... remaining fields +}, { requiredKeys: [] }); + +describe('buildEnvVars properties', () => { + it('output keys are always a subset of known keys', () => { + fc.assert(fc.property(moltbotEnvArb, (env) => { + const result = buildEnvVars(env as any); + for (const key of Object.keys(result)) { + expect(KNOWN_OUTPUT_KEYS.has(key)).toBe(true); + } + })); + }); + + it('legacy gateway always overrides direct ANTHROPIC_API_KEY', () => { + fc.assert(fc.property(moltbotEnvArb, (env) => { + const result = buildEnvVars(env as any); + if (env.AI_GATEWAY_API_KEY && env.AI_GATEWAY_BASE_URL) { + expect(result.ANTHROPIC_API_KEY).toBe(env.AI_GATEWAY_API_KEY); + } + })); + }); + + it('URL normalization is idempotent', () => { + fc.assert(fc.property(moltbotEnvArb, (env) => { + const result = buildEnvVars(env as any); + if (result.AI_GATEWAY_BASE_URL) { + expect(result.AI_GATEWAY_BASE_URL).not.toMatch(/\/+$/); + } + if (result.ANTHROPIC_BASE_URL && env.AI_GATEWAY_API_KEY && env.AI_GATEWAY_BASE_URL) { + expect(result.ANTHROPIC_BASE_URL).not.toMatch(/\/+$/); + } + })); + }); + + it('output values are always sourced from input values', () => { + fc.assert(fc.property(moltbotEnvArb, (env) => { + const result = buildEnvVars(env as any); + const inputValues = new Set(Object.values(env).filter(Boolean)); + for (const value of Object.values(result)) { + // Value is either directly from input or a normalized version + const isFromInput = inputValues.has(value); + const isNormalizedUrl = [...inputValues].some( + (iv) => typeof iv === 'string' && iv.replace(/\/+$/, '') === value + ); + expect(isFromInput || isNormalizedUrl).toBe(true); + } + })); + }); +}); +``` + +### Test 2: Process Classification Properties + +```typescript +const gatewayCommands = fc.oneof( + fc.constant('openclaw gateway'), + fc.constant('start-openclaw.sh'), + fc.constant('clawdbot gateway'), + fc.constant('start-moltbot.sh'), +); + +const cliCommands = fc.oneof( + fc.constant('openclaw devices'), + fc.constant('openclaw --version'), + fc.constant('openclaw onboard'), + fc.constant('clawdbot devices'), + fc.constant('clawdbot --version'), +); + +const activeStatuses = fc.oneof( + fc.constant('running' as const), + fc.constant('starting' as const), +); + +const inactiveStatuses = fc.oneof( + fc.constant('completed' as const), + fc.constant('failed' as const), +); + +describe('findExistingMoltbotProcess properties', () => { + it('active gateway commands are always detected', () => { + fc.assert(fc.asyncProperty(gatewayCommands, activeStatuses, async (cmd, status) => { + const proc = createFullMockProcess({ command: cmd, status }); + const { sandbox, listProcessesMock } = createMockSandbox(); + listProcessesMock.mockResolvedValue([proc]); + const result = await findExistingMoltbotProcess(sandbox); + expect(result).not.toBeNull(); + })); + }); + + it('CLI commands are never detected as gateway', () => { + fc.assert(fc.asyncProperty(cliCommands, activeStatuses, async (cmd, status) => { + const proc = createFullMockProcess({ command: cmd, status }); + const { sandbox, listProcessesMock } = createMockSandbox(); + listProcessesMock.mockResolvedValue([proc]); + const result = await findExistingMoltbotProcess(sandbox); + expect(result).toBeNull(); + })); + }); + + it('inactive gateway processes are never returned', () => { + fc.assert(fc.asyncProperty(gatewayCommands, inactiveStatuses, async (cmd, status) => { + const proc = createFullMockProcess({ command: cmd, status }); + const { sandbox, listProcessesMock } = createMockSandbox(); + listProcessesMock.mockResolvedValue([proc]); + const result = await findExistingMoltbotProcess(sandbox); + expect(result).toBeNull(); + })); + }); +}); +``` + +### Test 3: `timingSafeEqual` Security Properties + +```typescript +describe('timingSafeEqual properties', () => { + it('is reflexive: every string equals itself', () => { + fc.assert(fc.property(fc.string(), (s) => { + expect(timingSafeEqual(s, s)).toBe(true); + })); + }); + + it('is symmetric: equal(a, b) === equal(b, a)', () => { + fc.assert(fc.property(fc.string(), fc.string(), (a, b) => { + expect(timingSafeEqual(a, b)).toBe(timingSafeEqual(b, a)); + })); + }); + + it('agrees with strict equality for same-length strings', () => { + fc.assert(fc.property(fc.string(), fc.string(), (a, b) => { + if (a.length === b.length) { + expect(timingSafeEqual(a, b)).toBe(a === b); + } + })); + }); + + it('rejects all strings that differ from the secret', () => { + fc.assert(fc.property(fc.string({ minLength: 1 }), fc.string({ minLength: 1 }), (a, b) => { + fc.pre(a !== b); + expect(timingSafeEqual(a, b)).toBe(false); + })); + }); +}); +``` + +### Test 4: JWT Domain Normalization Properties + +```typescript +describe('JWT domain normalization properties', () => { + const validDomain = fc.stringMatching(/^[a-z0-9][a-z0-9.-]*[a-z0-9]$/); + + it('normalization is idempotent', () => { + fc.assert(fc.property(validDomain, (domain) => { + const once = domain.startsWith('https://') ? domain : `https://${domain}`; + const twice = once.startsWith('https://') ? once : `https://${once}`; + expect(once).toBe(twice); + })); + }); + + it('normalized domain always starts with https://', () => { + fc.assert(fc.property(validDomain, (domain) => { + const normalized = domain.startsWith('https://') ? domain : `https://${domain}`; + expect(normalized.startsWith('https://')).toBe(true); + })); + }); + + it('normalized domain never has double https://', () => { + fc.assert(fc.property( + fc.oneof(validDomain, fc.constant('https://').chain(prefix => validDomain.map(d => prefix + d))), + (domain) => { + const normalized = domain.startsWith('https://') ? domain : `https://${domain}`; + expect(normalized).not.toContain('https://https://'); + }, + )); + }); +}); +``` + +### Test 5: R2 Sync Failure Isolation Properties + +```typescript +describe('syncToR2 failure isolation properties', () => { + const syncOutcome = fc.oneof( + fc.constant('success'), + fc.constant('config-fail'), + fc.constant('workspace-fail'), + fc.constant('skills-fail'), + ); + + it('config failure implies overall failure', () => { + fc.assert(fc.asyncProperty(syncOutcome, async (outcome) => { + const sandbox = createMockSandbox(/* configured for outcome */); + if (outcome === 'config-fail') { + const result = await syncToR2(sandbox, env); + expect(result.success).toBe(false); + } + })); + }); + + it('workspace/skills failure does not imply overall failure', () => { + fc.assert(fc.asyncProperty( + fc.oneof(fc.constant('workspace-fail'), fc.constant('skills-fail')), + async (outcome) => { + // Configure sandbox where config succeeds but workspace/skills fail + const result = await syncToR2(sandbox, env); + expect(result.success).toBe(true); + }, + )); + }); + + it('successful sync always includes a timestamp', () => { + fc.assert(fc.asyncProperty(envArb, async (env) => { + // Configure sandbox for full success + const result = await syncToR2(sandbox, env); + if (result.success) { + expect(result.lastSync).toBeDefined(); + expect(() => new Date(result.lastSync!).toISOString()).not.toThrow(); + } + })); + }); +}); +``` + +--- + +## Proposed Rules for This Project + +### Rule 1: Property Tests Are Required for Functions with Combinatorial Input Spaces + +Any function that accepts a record/object with 5+ optional fields, or that implements conditional precedence logic, must have at least one property test asserting its core invariants. Currently applies to: `buildEnvVars`, `findExistingMoltbotProcess`, and future configuration builders. + +**Rationale:** The number of meaningful input combinations grows exponentially with optional fields. Example-based tests cover O(n) cases; property tests cover O(2^n) implicitly. + +### Rule 2: Security-Critical Code Gets Property Tests for Algebraic Properties + +Any function involved in authentication, authorization, or secret comparison must have property tests for reflexivity, symmetry, transitivity (where applicable), and totality (it always returns, never hangs). Currently applies to: `timingSafeEqual`, `verifyAccessJWT` domain normalization, JWT middleware `extractJWT`. + +**Rationale:** Security functions must be correct for *all* inputs, not just the ones an attacker hasn't tried yet. Properties encode this universality. + +### Rule 3: Failure Isolation Properties for Multi-Step Operations + +Any operation that performs multiple sub-operations with different failure semantics (fatal vs. non-fatal) must have a property test that generates all 2^n failure combinations and verifies the overall success/failure matches the specification. Currently applies to: `syncToR2`. + +**Rationale:** Failure isolation bugs are the hardest to find by example because they only manifest in specific failure combinations. They are the easiest to find by property because the generator naturally explores the failure matrix. + +### Rule 4: Invest in Generators as Reusable Project Assets + +Create and maintain a `src/test-generators.ts` file containing fast-check `Arbitrary` definitions for the project's core types: `MoltbotEnv`, `Process` (with realistic command strings), `CDPRequest` (with valid method/params combinations), and `SyncResult`. These generators are shared across all property tests. + +**Rationale:** The upfront cost of building good generators pays dividends across every test that uses them. A `MoltbotEnv` generator is written once and used by every test touching configuration logic. + +### Rule 5: Properties Complement, Never Replace, Examples + +Every property test file should include at least one traditional example test as documentation. The example shows what the function does; the property proves it does it correctly for all inputs. Property tests without examples are hard to read. Examples without properties are incomplete. + +**Rationale:** Properties are specifications, not documentation. A developer reading `fc.property(envArb, (env) => { ... })` needs a concrete example to build intuition before understanding the abstract invariant. + +### Rule 6: Start with Three, Expand with Bugs + +Begin by adding property tests to the three highest-value targets: `buildEnvVars`, `timingSafeEqual`, and `findExistingMoltbotProcess`. For all other modules, add property tests reactively: whenever a bug is found that *could have been caught by a property*, write the property. This bounds the upfront investment while building the property test suite organically around actual failure modes. + +**Rationale:** The pragmatists are right that unbounded investment is wasteful. But the solution isn't to avoid properties; it's to invest them where the cost-benefit ratio is highest and expand based on evidence. + +--- + +## Honest Assessment of Costs + +1. **Dependency:** Adds `fast-check` (~150KB, no transitive deps, well-maintained). Minimal risk. +2. **Learning curve:** 2-4 hours for first productive property test. Team-wide fluency in ~1 week with pair programming on the initial generators. +3. **Test runtime:** Property tests run 100 iterations by default (configurable). For pure functions like `buildEnvVars`, this adds <100ms. For async functions with mock setup, ~1-2 seconds. Negligible compared to container-based integration tests. +4. **Maintenance:** Properties change when *specifications* change, not when implementations change. This is more stable than example tests, which break on any refactor that changes intermediate behavior. +5. **Shrinking:** fast-check automatically shrinks failing inputs to minimal counterexamples. When a property fails, you get the *simplest* input that triggers the bug, not a 200-character random string. This makes debugging faster than with hand-written edge case tests. + +--- + +## Conclusion + +The PoeClaw codebase already has well-structured, testable pure functions (`buildEnvVars`, `findExistingMoltbotProcess`, `timingSafeEqual`, `verifyAccessJWT` normalization logic) surrounded by integration boundaries (sandbox API, R2 storage, WebSocket connections). Property-based testing is maximally effective precisely at this architecture: test the pure logic exhaustively with properties, test the integration boundaries with a smaller number of targeted integration tests. The result is a test suite that is both more thorough and more maintainable than either approach alone. + +The question is not "property tests vs. example tests." The question is "which invariants does your codebase enforce, and can you prove it?" If you can write the invariant as a sentence, you can write it as a property. And a property that runs against 10,000 generated inputs is worth more than 10 hand-written examples that confirm what you already believe. diff --git a/docs/philosophy/positions/06-mutation-testing.md b/docs/philosophy/positions/06-mutation-testing.md new file mode 100644 index 000000000..d2928802f --- /dev/null +++ b/docs/philosophy/positions/06-mutation-testing.md @@ -0,0 +1,249 @@ +# Position 06: Mutation Testing as the Measure of Test Effectiveness + +**Author:** Mutation Testing Advocate +**Position:** Test quality, measured by mutation score, is the only honest metric of test effectiveness. Code coverage is a necessary but grossly insufficient proxy. Mutation testing — systematically injecting faults and verifying that tests detect them — is the empirical method that separates tests which *prove correctness* from tests which merely *exercise code*. + +--- + +## Core Thesis + +A test suite's value is not how much code it runs, but how many bugs it would catch. Code coverage answers the question "was this line executed?" — mutation testing answers "would my tests notice if this line were wrong?" These are fundamentally different questions, and only the second one matters. In a project like PoeClaw, where authentication, credential handling, and process lifecycle management are safety-critical, the gap between these two questions is where production incidents live. + +--- + +## Key Arguments + +### 1. This Codebase Has Tests That Would Miss Real Bugs — Provably + +This is not a theoretical concern. I have identified concrete mutations in the current PoeClaw test suite that would survive — meaning the tests would continue to pass even with the bug injected. Consider `src/gateway/r2.ts:15`: + +```typescript +if (!env.R2_ACCESS_KEY_ID || !env.R2_SECRET_ACCESS_KEY || !env.CF_ACCOUNT_ID) { + return false; +} +``` + +**Mutation: change `||` to `&&`** — require *all three* to be missing before returning false. The existing tests at `r2.test.ts:17-33` each test *one* missing credential in isolation. Every individual test would still pass. But the production behavior changes catastrophically: a partially-configured R2 setup (e.g., key present, secret missing) would now proceed to write an invalid rclone config, causing silent data loss on sync. + +This is not a contrived example. This is a real bug that the current test suite cannot detect. Mutation testing would catch it automatically. + +### 2. Mock-Heavy Tests Create an Illusion of Safety + +PoeClaw's test suite relies extensively on mocking — `jose` is mocked in JWT tests, `sandbox.exec()` is mocked in R2/sync tests, `sandbox.listProcesses()` is mocked in process tests. Mocks are a legitimate tool, but they introduce a dangerous failure mode: **tests verify that mocks were called correctly, not that the system behaves correctly**. + +In `jwt.test.ts`, the tests verify that `jwtVerify` is called with the right parameters. But consider this mutation to `jwt.ts:22`: + +```typescript +// Original +const issuer = teamDomain.startsWith('https://') ? teamDomain : `https://${teamDomain}`; + +// Mutation: swap the ternary branches +const issuer = teamDomain.startsWith('https://') ? `https://${teamDomain}` : teamDomain; +``` + +This mutation produces `https://https://myteam.cloudflareaccess.com` when given a domain with the prefix, and bare `myteam.cloudflareaccess.com` without it. **The test at line 50 ("handles team domain with https:// prefix") would still pass** because the test verifies `jwtVerify` was called with `issuer: 'https://myteam.cloudflareaccess.com'` — and since `jwtVerify` is mocked, it doesn't actually validate the issuer. The mock returns success regardless. + +Wait — actually, the test *does* check the exact value passed to the mock. So *this particular* mutation would be caught by the assertion at line 73. But consider a subtler mutation: change `'https://'` to `'https:/'` in the `startsWith` check. The test with prefix `'https://myteam...'` still starts with `'https:/'`, so the ternary takes the same branch, and all tests pass. The real-world consequence: domains without `https://` prefix would be double-prefixed only when they happen to start with `'https:/'` — a bizarre edge case that would slip through. + +Mutation testing doesn't require you to reason about which mutations might survive. It mechanically checks *all of them*. + +### 3. String-Based Pattern Matching Is a Mutation Testing Gold Mine + +The process detection logic in `process.ts:19-30` uses `String.includes()` for critical security-relevant filtering — determining whether a sandbox process is a gateway process or a CLI command. This is exactly the kind of code where mutation testing excels: + +```typescript +const isGatewayProcess = + proc.command.includes('start-openclaw.sh') || + proc.command.includes('openclaw gateway') || + proc.command.includes('start-moltbot.sh') || + proc.command.includes('clawdbot gateway'); +``` + +**Mutation: remove the third `||` branch** (`start-moltbot.sh`). The test at `process.test.ts:83` catches this — good. But **remove the `isCliCommand` negation** at line 32 (change `&& !isCliCommand` to just `&&`), and only the single test at line 135 (`openclaw onboard`) fails. The other 8 tests pass. This means one mutation survivor reveals that the CLI-exclusion logic has only a single test guarding it — a fact invisible to coverage metrics, since every line is already "covered." + +More critically: what about the command `"openclaw gateway devices"`? It matches *both* `isGatewayProcess` (contains "openclaw gateway") and `isCliCommand` (contains "openclaw devices" — wait, no it doesn't, it contains "gateway devices"). This kind of boundary analysis is precisely what emerges from examining surviving mutants. + +### 4. Coverage Metrics Actively Mislead in This Codebase + +The `env.test.ts` file has 17 test cases covering `buildEnvVars`. Line coverage is likely near 100%. But consider the mutation: **delete line 30** entirely (the `replace(/\/+$/, '')` normalization): + +```typescript +if (env.AI_GATEWAY_API_KEY && env.AI_GATEWAY_BASE_URL) { + // const normalizedBaseUrl = env.AI_GATEWAY_BASE_URL.replace(/\/+$/, ''); + envVars.AI_GATEWAY_BASE_URL = env.AI_GATEWAY_BASE_URL; // Use raw value + envVars.ANTHROPIC_BASE_URL = env.AI_GATEWAY_BASE_URL; + envVars.ANTHROPIC_API_KEY = env.AI_GATEWAY_API_KEY; +} +``` + +Test at line 76 ("strips trailing slashes") catches this. Good. But now try a subtler mutation: change the regex from `/\/+$/` to `/\/$/` (match only a single trailing slash). The test input is `'https://...anthropic///'`. With `/\/$/`, `replace` still strips the last `/`, yielding `https://...anthropic//`. The test expects `https://...anthropic` — so this mutation *is* caught. But change the test input to `https://...anthropic/` (single trailing slash) and both regexes produce the same output. The test *happens* to use `///` but this was likely coincidental, not a deliberate boundary test. + +Mutation testing systematically surfaces these "coincidental catches" vs. "deliberate assertions," giving you signal about test *intent* rather than test *luck*. + +### 5. Security-Critical Code Demands Proof, Not Probability + +The `extractJWT` function in `middleware.ts:32-41` has a real bug that no existing test catches: + +```typescript +const jwtCookie = c.req.raw.headers + .get('Cookie') + ?.split(';') + .find((cookie) => cookie.trim().startsWith('CF_Authorization=')) + ?.split('=')[1]; +``` + +If the JWT value itself contains `=` (which base64-encoded JWTs can, since base64 uses `=` for padding), `split('=')[1]` truncates the token. The test at line 84 uses `'cookie.payload.signature'` — no `=` signs. This is not a mutation testing finding per se (it's a real bug), but mutation testing *would* surface it indirectly: mutating `split('=')[1]` to `split('=')[0]` would produce different output for the test inputs, but mutating it to `split('=').slice(1).join('=')` wouldn't change test behavior — revealing that the tests don't cover the `=`-in-value case. + +For authentication code, "tests pass" is not the same as "authentication works." Mutation testing is the closest thing we have to a proof obligation on our test suite. + +--- + +## Counterarguments and Rebuttals + +### Against TDD Purists: "TDD naturally produces good tests" + +**Their claim:** If you write tests first and follow red-green-refactor, you naturally get tests that encode behavior. Each test was red before the code was written, so by construction it tests something real. + +**Rebuttal:** TDD produces tests that were *once* red. It does not guarantee they *remain* discriminating as the code evolves. Consider: a TDD practitioner writes `isDevMode` with the test `expect(isDevMode({DEV_MODE: 'true'})).toBe(true)`. The code is `return env.DEV_MODE === 'true'`. Later, during a refactor, someone changes it to `return !!env.DEV_MODE`. The test still passes (for the input `'true'`). The TDD workflow provided no protection because the test was written for the *original* implementation, and the refactored code happens to satisfy the same test while changing semantics. + +TDD is a *design* methodology. Mutation testing is a *verification* methodology. They are complementary, not substitutes. TDD tells you "write a test for each behavior you intend." Mutation testing tells you "your tests would miss these behaviors they claim to cover." The latter is strictly more information. + +Furthermore: TDD in practice often leads to heavy mocking, which — as demonstrated above — creates tests that verify *interaction protocols* rather than *observable behavior*. Mutation testing is agnostic to whether mocks are used; it simply asks whether the test suite, mocks and all, can detect faults. + +### Against Pragmatists: "Mutation testing is too slow and expensive for CI" + +**Their claim:** Stryker or similar tools can take 10-100x longer than the test suite itself. For a project like PoeClaw with fast Vitest tests, adding 20+ minutes of mutation testing to every PR is impractical. + +**Rebuttal:** This is a real concern, and I will not dismiss it. But the framing is wrong. The question is not "should mutation testing run on every push?" It's "should we ever know whether our tests are effective?" + +Practical mitigations: + +1. **Incremental mutation testing.** Stryker supports `--since` to only mutate files changed in the current PR. For a typical PoeClaw PR touching 2-3 files, this reduces runtime to seconds, not minutes. + +2. **Scheduled runs.** Run full mutation testing nightly or weekly. The mutation score becomes a tracked metric, like technical debt. Regressions are caught within 24 hours. + +3. **Targeted runs on critical paths.** Run mutation testing only on `src/auth/` and `src/gateway/` — the modules where bugs have production consequences. You don't need to mutation-test the entire codebase to get value. Authentication and credential handling code is where the ROI is highest. + +4. **Local developer workflow.** Developers run `stryker run --mutate src/auth/jwt.ts` before submitting PRs that touch auth code. This takes seconds for a single file and catches the most dangerous class of test gaps. + +The cost of a surviving mutant in production — an auth bypass, a silent data loss in R2 sync, a credential leak through insufficient redaction — dwarfs the CI cost. A 5-minute mutation testing step on auth code is cheaper than one incident response. + +### Against Integration Testers: "Mutation testing only works on unit tests" + +**Their claim:** PoeClaw is fundamentally about orchestrating external systems — Cloudflare Sandbox, R2, Access JWTs. The real bugs are at integration boundaries. Mutation testing on unit tests with mocked dependencies is testing the mocks, not the system. + +**Rebuttal:** This critique has merit but misidentifies the target. Mutation testing does not replace integration testing. It measures the quality of *whatever tests you have*. If your integration tests are your primary defense, mutation testing tells you whether those integration tests actually catch faults. + +But more importantly: integration tests have the *worst* mutation detection ratio of any test type, precisely because they test through so many layers. An integration test for the full sync flow might execute 200 lines of code but only assert `expect(result.success).toBe(true)` — a single assertion covering a massive mutation surface. Mutation testing reveals this: "your integration test exercises all this code but would not detect faults in 80% of it." + +The practical response is not "don't integration test" — it's "use mutation testing to identify *which specific behaviors* your integration tests fail to verify, then add targeted unit tests or more precise integration assertions to close the gaps." + +In PoeClaw specifically: the `syncToR2` integration-style tests (with mocked `exec`) execute the full sync orchestration logic but only assert on `result.success` and `result.lastSync`. Mutation testing would immediately reveal that rclone flags, exclude patterns, timeout values, and error messages could all be mutated without test failure. + +### Against Property Testers: "Properties already encode invariants" + +**Their claim:** Property-based tests encode *invariants* — statements that must hold for all inputs. A property like "redactSensitiveParams never leaks a param matching the sensitive pattern" is stronger than any point-example test and implicitly covers all mutations. + +**Rebuttal:** I agree that property-based testing is powerful, and in theory, a perfectly-specified property test is mutation-proof. In practice: + +1. **Properties are only as good as the invariants you think to encode.** If you write `forAll(url => redactSensitiveParams(url) does not contain sensitive values)`, you've covered redaction. But did you write a property for "non-sensitive params are preserved unchanged"? For "output is valid URL query syntax"? For "empty input produces empty output"? Each unwritten property is a class of surviving mutants. + +2. **Properties and mutations are complementary.** Mutation testing identifies *which properties are missing*. Running Stryker after writing property tests tells you "your properties cover redaction but not preservation, not ordering, not encoding." This is actionable feedback that makes the property tests better. + +3. **PoeClaw doesn't use property testing today.** This is a debate about this project's current state. The existing tests are exclusively example-based with Vitest `it()` blocks. Mutation testing provides immediate value for the test suite *as it exists*, not as it might exist after a hypothetical property testing adoption. + +4. **Even with property tests, boundary mutation matters.** A property test for `isGatewayProcess` might state "any command containing 'openclaw gateway' should match." But the implementation uses `includes()`, which also matches `'run openclaw gateway devices list'`. The property as stated is correct but incomplete — it doesn't specify what should *not* match. Mutation testing on the negation logic would surface this gap. + +--- + +## Specific Codebase Areas Where Mutation Testing Would Add Value + +### Critical Priority (security/data integrity) + +| File | Mutation Target | Why It Matters | +|------|----------------|----------------| +| `src/auth/middleware.ts:37-38` | `split('=')[1]` in cookie parsing | JWT truncation on base64-padded tokens | +| `src/auth/middleware.ts:54` | `\|\|` in dev/e2e mode check | Auth bypass if logic inverted | +| `src/gateway/r2.ts:15` | `\|\|` to `&&` in credential validation | Silent write of invalid rclone config | +| `src/utils/logging.ts:11` | Remove key/value check alternation | Credential leak in logs | +| `src/auth/jwt.ts:22` | `startsWith` string check | JWKS URL construction failure | + +### High Priority (operational correctness) + +| File | Mutation Target | Why It Matters | +|------|----------------|----------------| +| `src/gateway/process.ts:32` | Remove `!isCliCommand` negation | Gateway process misidentification | +| `src/gateway/process.ts:33` | Remove `'starting'` status check | Race condition in process reuse | +| `src/gateway/sync.ts:56` | `rclone sync` to `rclone copy` | Deletions not propagated to R2 | +| `src/gateway/sync.ts:69` | `\|\| true` error suppression | Silent workspace sync failure | +| `src/gateway/env.ts:29` | `&&` to `\|\|` in legacy gateway check | Partial config used as complete | + +### Medium Priority (correctness) + +| File | Mutation Target | Why It Matters | +|------|----------------|----------------| +| `src/gateway/env.ts:30` | Regex `/\/+$/` to `/\/$/` | Multi-slash URLs not fully normalized | +| `src/gateway/sync.ts:29-30` | Swap openclaw/clawdbot precedence | Wrong config dir selected | +| `src/gateway/r2.ts:22-23` | Flag file check result | Rclone reconfigured on every request | +| `src/utils/logging.ts:6` | Remove `/i` flag from regex | Case-sensitive redaction misses uppercase | + +--- + +## Proposed Rules for This Project + +### Rule 1: Mutation Score Gate on Auth and Credential Code + +Files in `src/auth/` and any file handling credentials (`r2.ts`, `env.ts`, `logging.ts`) must maintain a mutation score of **80% or higher**. This is checked in CI via `stryker run --since` on PRs touching these files. + +**Rationale:** These are the modules where a surviving mutant maps directly to a security vulnerability or data loss. 80% is achievable and meaningful — it allows pragmatic exclusions (equivalent mutants, unreachable code) while requiring that the vast majority of injectable faults are detected. + +### Rule 2: New Test Files Must Kill Their Mutants + +When a PR adds a new `*.test.ts` file, the mutation score for the corresponding source file must be reported in the PR description. No hard gate initially, but visibility creates accountability. + +### Rule 3: Surviving Mutants on Critical Code Require Justification + +If Stryker reports a surviving mutant in `src/auth/` or `src/gateway/`, the PR author must either: +- Add a test that kills the mutant, or +- Document why the mutant is equivalent (semantically identical to the original) or unreachable + +This prevents the "acknowledge and ignore" antipattern that degrades mutation score over time. + +### Rule 4: Stryker Configuration + +```json +{ + "mutate": [ + "src/auth/**/*.ts", + "src/gateway/**/*.ts", + "src/utils/**/*.ts", + "!src/**/*.test.ts", + "!src/**/test-utils.ts" + ], + "testRunner": "vitest", + "reporters": ["clear-text", "html"], + "thresholds": { + "high": 80, + "low": 60, + "break": 50 + }, + "incremental": true, + "incrementalFile": ".stryker-incremental.json" +} +``` + +Focus on the core logic modules. Exclude test files and test utilities. Use incremental mode to keep CI fast. Break the build only below 50% — a floor, not a ceiling. + +### Rule 5: Weekly Full Mutation Report + +A scheduled CI job runs full mutation testing weekly and posts the report as a GitHub Actions artifact. This tracks trends and catches slow degradation that incremental runs miss. + +--- + +## Conclusion + +The PoeClaw codebase is well-structured and has a meaningful test suite. But "meaningful" and "effective" are different claims, and only mutation testing can distinguish them empirically. The examples above are not hypothetical — they are specific, verifiable predictions about which faults the current tests would miss. Running Stryker on this codebase would either confirm these predictions (validating the approach) or refute them (proving the tests are stronger than analysis suggests). Either outcome is valuable. + +The cost is modest: minutes of CI time for incremental runs, a weekly full report. The benefit is the only empirical answer to the question every test suite claims but none can prove without mutation testing: *"Would these tests catch a real bug?"* + +Test quality is not a feeling. It's a mutation score. diff --git a/docs/philosophy/positions/07-contract-first.md b/docs/philosophy/positions/07-contract-first.md new file mode 100644 index 000000000..529290328 --- /dev/null +++ b/docs/philosophy/positions/07-contract-first.md @@ -0,0 +1,347 @@ +# Position 07: Contract/API-First Testing + +**Author:** Contract Testing Advocate +**Date:** 2026-02-15 +**Position:** Define the API contract first, then test against it + +--- + +## Core Thesis + +PoeClaw is a *gateway* — its entire value proposition is mediating between browser clients, the Poe API, Cloudflare's sandbox infrastructure, and the OpenClaw runtime. The correctness of a gateway is determined exclusively by whether it honors the promises it makes at its boundaries. Therefore, the primary testing discipline for this project should be defining explicit, versioned API contracts at each boundary and writing tests that verify adherence to those contracts — not testing internal implementation details that can (and should) change freely. + +--- + +## Key Arguments + +### 1. Gateway Architecture Demands Interface-First Thinking + +PoeClaw has at least five distinct integration boundaries: + +| Boundary | Consumer | Provider | +|----------|----------|----------| +| Browser → Worker HTTP API | Chat UI, Login page | Hono routes (`/api/auth/*`, `/api/admin/*`, `/api/status`) | +| Browser → Worker SSE stream | `useChat` hook | `/v1/chat/completions` proxy | +| Worker → Poe API | Session auth module | `api.poe.com/v1/models`, `/v1/chat/completions` | +| Worker → Sandbox API | Process manager, sync | `sandbox.startProcess()`, `sandbox.containerFetch()`, `sandbox.wsConnect()` | +| Worker → R2 Storage | Sync module | Rclone via `sandbox.exec()`, R2 bucket bindings | + +Each boundary is a *contract*. When the chat UI sends `POST /api/auth/login` with `{ poe_api_key: "..." }`, it expects a specific response shape, specific status codes, specific cookie behavior. When the worker calls `api.poe.com/v1/models`, it expects a specific JSON schema. These expectations exist whether we write them down or not. Contract-first testing makes them explicit and testable. + +A unit test on the `validatePoeKey()` function tells you the function works. A contract test on `POST /api/auth/login` tells you the *system* works from the consumer's perspective. In a gateway, only the latter matters to users. + +### 2. Consumer-Driven Contracts Protect Against Multi-Tenant Regression + +The PoeClaw transformation introduces multi-tenancy: per-user Durable Objects, session-scoped sandbox resolution, namespaced R2 paths. This multiplies the contract surface. A request from User A must never leak data from User B's sandbox. A session cookie from User A must never resolve to User B's Durable Object. + +These are *contract violations*, not implementation bugs. They cannot be reliably caught by unit-testing individual functions because the invariant spans the full request lifecycle: cookie → session verification → user hash → DO resolution → sandbox isolation → R2 namespacing. Contract tests that exercise the full boundary — "given this session cookie, I get back only this user's data" — are the natural expression of this requirement. + +Consider what happens when someone refactors the session module. Unit tests on the old module pass (because they're deleted or rewritten). Unit tests on the new module pass (because they test the new implementation). But if the cookie format changed subtly, the browser client breaks silently. A contract test catches this immediately because it tests the *promise*, not the *mechanism*. + +### 3. The WebSocket and SSE Protocols Are Contract-Dense + +The existing WebSocket proxy (`src/index.ts:283-429`) performs bidirectional message relay with error transformation. Messages from the container have their error strings rewritten before reaching the client. Close codes are propagated. The planned SSE streaming for chat completions will have its own protocol: chunked `data:` lines, `[DONE]` sentinel, specific JSON shapes per chunk. + +These are wire protocols — they *are* contracts. Testing them as contracts (expected input → expected output at the boundary) is more natural and more valuable than testing the relay implementation internals. If someone replaces the WebSocket relay with a different approach, every contract test should still pass, because the *client's experience* hasn't changed. + +Specific protocol contracts that need definition: + +- **SSE stream shape**: Each `data:` line must contain `{"choices":[{"delta":{"content":"..."}}]}` matching OpenAI's streaming format +- **Error transformation**: `gateway token missing/mismatch` must become a user-friendly redirect message +- **Close code propagation**: WebSocket close code 1000 from container must yield 1000 to client +- **Cold start behavior**: `/api/status` must return `{"ok":false,"status":"not_running"}` before container boot, and `{"ok":true,"status":"running"}` after + +### 4. Poe API Integration Has Known Sharp Edges That Need Contract Pinning + +The design document explicitly identifies Poe API incompatibilities: non-standard model names (`Claude-Sonnet-4.5` vs `claude-sonnet-4-5`), silent failures on tool calling + streaming, silently ignored `response_format`. These are places where the *contract we expect* diverges from *what the API actually does*. + +Contract tests against the Poe API boundary serve dual purposes: + +1. **Provider contract tests** verify our assumptions about Poe's behavior. When Poe changes (fixes streaming, adds models, changes rate limiting), these tests tell us immediately. +2. **Consumer contract tests** verify that our worker correctly translates Poe's responses into the OpenAI-compatible format our UI expects. + +Without explicit contracts here, we're relying on manual discovery of breakage. With contracts, the CI pipeline surfaces it automatically. + +### 5. Session Cookie Contract Is a Security Boundary + +The authentication flow (`POST /api/auth/login` → HMAC session cookie → encrypted key in DO storage) is a security-critical contract. The design specifies: + +- Cookie attributes: `HttpOnly; Secure; SameSite=Lax; Max-Age=86400` +- Rate limiting: 10 attempts/IP/minute +- Key display: `***...last4` only +- Timing-safe comparison + +These are testable contract properties. A contract test can verify that a valid login returns a cookie with the correct attributes, that an invalid key returns 401 without setting a cookie, that 11 rapid requests from the same IP yield 429. These tests are more valuable than unit tests on the HMAC function because they test the *security promise* as experienced by an attacker, not just the correctness of a cryptographic primitive. + +--- + +## Counterarguments and Rebuttals + +### Against TDD Purists: "Contracts are just a subset of TDD" + +**Their argument:** Contract tests are just a specific kind of test. TDD already covers this — write the test first, then implement. Red-green-refactor naturally produces correct interfaces. + +**Rebuttal:** TDD says *write a test first*. It does not say *which* test, or *at what level*. In practice, TDD practitioners start with the smallest unit — a function, a class method — and work outward. This produces excellent function-level coverage but can miss integration-level contract violations. + +The critical distinction is *who defines the test*. In TDD, the implementer writes tests that express their understanding of requirements. In consumer-driven contract testing, the *consumer* defines what they need, and the provider must satisfy it. For a gateway, these perspectives diverge. The implementer might TDD a `parsePoeModels()` function that correctly parses the current Poe response format. But the consumer contract says "the `/api/models` endpoint must return `[{id: string, name: string}]`" — which survives any refactoring of the parse logic. + +TDD and contracts are complementary, not competing. Use TDD for implementation correctness. Use contracts for interface stability. But if you must choose one discipline for a gateway, choose contracts — because the interfaces *are* the product. + +### Against Type Advocates: "Hono already gives type-safe routes" + +**Their argument:** Hono's typed routes already define the contract. `c.json(data)` ensures the response matches the type. TypeScript catches contract violations at compile time. Adding contract tests is redundant. + +**Rebuttal:** Types are necessary but insufficient. They address three specific gaps: + +1. **Types don't cross the wire.** TypeScript types are erased at runtime. The browser client doesn't run TypeScript — it sends HTTP requests and parses JSON. A type says `DeviceListResponse` has a `pending` field. A contract test verifies that the actual HTTP response body, as bytes on the wire, contains a `pending` field with the correct shape. Types prevent the *author* from making mistakes; contracts prevent the *system* from making mistakes. + +2. **Types can't express behavioral contracts.** "When the session cookie is expired, return 401" is a contract. "When the Poe API returns 429, retry with backoff and surface a user-friendly error" is a contract. "The SSE stream must end with `data: [DONE]`" is a contract. None of these can be expressed in TypeScript's type system. They require runtime verification. + +3. **Types don't prevent regression across deployments.** If someone changes the `DeviceListResponse` type and updates all the callsites, TypeScript is happy. But every deployed client that cached the old response shape is now broken. A contract test against the *published* contract (not the current type) catches this because it tests the *promise made to existing consumers*. + +Types and contracts are complementary. Use types for compile-time safety within the codebase. Use contracts for runtime safety across the wire and across time. + +### Against Integration Testers: "Contracts test interfaces, not behavior" + +**Their argument:** A contract test says "this endpoint returns 200 with this shape." An integration test says "when the user logs in, a sandbox starts, the config is patched, and chat works end-to-end." Contracts test the surface; integration tests test the substance. + +**Rebuttal:** This is a real tension, and I partly concede the point. Contract tests alone are insufficient for a system with complex internal orchestration. The gateway startup sequence (find existing process → start if needed → wait for port → inject env vars) involves real behavioral complexity that pure contract tests won't cover. + +However, I argue that for PoeClaw, the *majority of the valuable test surface* is at the contract level: + +- **80% of bugs users experience are contract violations**: wrong status code, missing field, malformed SSE chunk, broken cookie. These are the bugs that break the chat UI, cause the login to fail silently, or leak data between users. +- **20% of bugs are behavioral**: wrong startup sequence, race conditions in sandbox resolution, R2 sync ordering. These warrant targeted integration tests. + +The 80/20 split means contracts should be the *primary* discipline, supplemented by integration tests for complex orchestration. Not the other way around. + +Furthermore, well-designed contracts *do* test behavior indirectly. A contract that says "after `POST /api/auth/login` with a valid key, subsequent requests with the returned cookie to `/v1/chat/completions` must return a streaming response" — this implicitly tests that login, session creation, sandbox startup, config patching, and chat routing all work. The contract doesn't care *how* they work, only that the end-to-end promise is kept. + +### Against Pragmatists: "Too much ceremony for a small project" + +**Their argument:** PoeClaw is a small team, maybe one developer. Writing formal API contracts, maintaining schema files, running contract test suites — this is enterprise overhead that slows down a project targeting 10-50 users. + +**Rebuttal:** This is the strongest counterargument, and I take it seriously. Here is my honest assessment: + +**What costs more than it's worth:** +- Full Pact-style consumer-driven contract infrastructure with a broker +- OpenAPI specification maintained as a separate YAML file that must be kept in sync +- Contract tests for internal-only interfaces (e.g., the sync module's internal functions) + +**What is lightweight and high-value:** +- A single test file per boundary (5 files total) with request-response pairs +- Response shape assertions using the TypeScript types already defined in the codebase +- Snapshot-style contract tests: "this request produces this response shape" stored as fixtures +- Running these in CI on every push (~30 seconds of additional test time) + +The ceremony scales with the tool choice, not the philosophy. Contract-first testing for PoeClaw means: + +```typescript +// src/__tests__/contracts/auth.contract.test.ts +describe('POST /api/auth/login', () => { + it('returns session cookie on valid key', async () => { + const res = await app.request('/api/auth/login', { + method: 'POST', + body: JSON.stringify({ poe_api_key: 'valid-test-key' }), + }) + expect(res.status).toBe(200) + expect(res.headers.get('set-cookie')).toMatch(/HttpOnly.*Secure.*SameSite=Lax/) + const body = await res.json() + expect(body).toMatchObject({ ok: true, models: expect.any(Array) }) + }) + + it('returns 401 on invalid key', async () => { + const res = await app.request('/api/auth/login', { + method: 'POST', + body: JSON.stringify({ poe_api_key: 'invalid' }), + }) + expect(res.status).toBe(401) + expect(res.headers.has('set-cookie')).toBe(false) + }) +}) +``` + +This is *not* enterprise ceremony. It is straightforward Vitest. The philosophical commitment is simply: write *these* tests *first*, before writing the implementation. That's the entire ceremony. + +--- + +## Specific Contracts for This Project + +### Contract 1: Authentication Boundary + +``` +POST /api/auth/login + Request: { poe_api_key: string } + Success: 200, Set-Cookie: session=; HttpOnly; Secure; SameSite=Lax; Max-Age=86400 + Body: { ok: true, user: { display: "***...last4" }, models: [{id, name}] } + Invalid: 401, No Set-Cookie + Body: { ok: false, error: "invalid_key" } + Rate: 429 after 10 attempts/IP/min + Body: { ok: false, error: "rate_limited", retry_after: number } + +POST /api/auth/logout + Request: Cookie: session= + Success: 200, Set-Cookie: session=; Max-Age=0 + Body: { ok: true } +``` + +### Contract 2: Gateway Status + +``` +GET /api/status + No Auth Required + Running: 200 { ok: true, status: "running", processId: string } + Starting: 200 { ok: false, status: "not_running" } + Error: 200 { ok: false, status: "not_responding", error: string } +``` + +### Contract 3: Chat Completions Proxy + +``` +POST /v1/chat/completions + Request: Cookie: session= + Body: { model: string, messages: [{role, content}], stream?: boolean } + Auth: 401 if no/invalid session + No Container: 503 { error: { message: "Container starting", type: "service_unavailable" } } + Non-streaming: 200 { id, object: "chat.completion", choices: [{message: {role, content}}] } + Streaming: 200, Content-Type: text/event-stream + data: {"id":"...","choices":[{"delta":{"role":"assistant"}}]} + data: {"id":"...","choices":[{"delta":{"content":"Hello"}}]} + ... + data: [DONE] +``` + +### Contract 4: Device Management (Admin) + +``` +GET /api/admin/devices + Auth: Session cookie required + Success: 200 { pending: PendingDevice[], paired: PairedDevice[] } + PendingDevice: { requestId: string, deviceId: string, ts: number, displayName?: string, platform?: string } + PairedDevice: { deviceId: string, createdAtMs: number, approvedAtMs: number, displayName?: string } + +POST /api/admin/devices/:requestId/approve + Success: 200 { success: true, requestId: string, message: string } + NotFound: 404 { success: false, error: "device_not_found" } + +POST /api/admin/devices/approve-all + Success: 200 { approved: string[], failed: [{requestId, success: false, error?}] } +``` + +### Contract 5: Storage/Sync + +``` +GET /api/admin/storage + Success: 200 { configured: boolean, lastSync?: string, missing?: string[] } + +POST /api/admin/storage/sync + Success: 200 { success: true, lastSync: string } + Failed: 200 { success: false, error: string, details?: string } + Not Configured: 200 { success: false, error: "not_configured", missing: string[] } +``` + +### Contract 6: Multi-Tenant Isolation (Cross-Cutting) + +``` +INVARIANT: Request with User A's session cookie MUST NEVER return data from User B's sandbox. + - GET /api/admin/devices with session_A returns devices from sandbox_A only + - POST /v1/chat/completions with session_A routes to container_A only + - R2 sync for session_A writes to users/{hash_A}/ only + +INVARIANT: Expired session cookies MUST return 401, never a stale response. + - Any authenticated endpoint with expired cookie → 401 + - No partial data, no cached responses from previous session +``` + +### Contract 7: WebSocket Proxy (Legacy/CDP) + +``` +WS /ws + Upgrade: 101 Switching Protocols + Relay: Messages from client forwarded verbatim to container + Error Transform: Container error "gateway token missing" → user-friendly message + Close: Container close code propagated to client + +WS /cdp?secret= + Auth: 403 if secret missing/invalid + Upgrade: 101 Switching Protocols + Protocol: Chrome DevTools Protocol JSON-RPC + Request: { id: number, method: "Domain.command", params?: {} } + Response: { id: number, result?: {}, error?: { code: number, message: string } } + Events: { method: "Domain.event", params?: {} } +``` + +--- + +## Proposed Rules + +### Rule 1: Define Before Implement + +Before writing any new endpoint or modifying an existing one, write the contract test first. The contract specifies: HTTP method, path, request shape, response shape per status code, required headers (including cookies), and error cases. The implementation is done when the contract tests pass. + +### Rule 2: Contracts Are Versioned Promises + +Once a contract test is green in CI, changing the response shape requires updating the contract *explicitly*. This forces a conscious decision: "I am changing the promise I made to consumers." The diff is reviewable. The breakage is visible. + +### Rule 3: One Contract File Per Boundary + +Organize contract tests by boundary, not by feature: + +``` +src/__tests__/contracts/ + auth.contract.test.ts # Authentication boundary + status.contract.test.ts # Gateway status + chat.contract.test.ts # Chat completions proxy + devices.contract.test.ts # Device management + storage.contract.test.ts # R2 storage/sync + ws.contract.test.ts # WebSocket proxy + isolation.contract.test.ts # Multi-tenant isolation invariants +``` + +### Rule 4: Contracts Use Hono's Test Client + +Use Hono's built-in `app.request()` for contract tests. No need for external HTTP servers or Pact brokers. Mock the sandbox and external APIs at the boundary, not inside the implementation: + +```typescript +// Mock the EXTERNAL boundary (Poe API, Sandbox), test the INTERNAL contract +const app = createApp({ sandbox: mockSandbox, env: testEnv }) +const res = await app.request('/api/auth/login', { ... }) +// Assert on the response — the CONTRACT +``` + +### Rule 5: Contract Tests Run First in CI + +Contract tests should execute before unit tests in CI. If a contract is broken, nothing else matters — the system is failing its consumers. Unit tests can pass perfectly while a contract is violated (e.g., a type change that ripples correctly through the implementation but breaks the wire format). + +### Rule 6: No Contract Tests for Internal Interfaces + +Do not write contract tests for module-to-module interactions within the worker (e.g., how `sync.ts` calls `r2.ts`). Contracts are for *external* boundaries — where a different system (browser, Poe API, sandbox container) is on the other side. Internal module boundaries are better served by types and, where necessary, unit tests. + +### Rule 7: Maintain Honest Overhead Assessment + +The contract test suite for this project should be: +- **7 contract files** (one per boundary above) +- **~30-50 test cases** total across all boundaries +- **< 60 seconds** CI execution time +- **Updated only when the API changes** — not on every internal refactor + +If the contract suite grows beyond 100 tests or takes > 2 minutes, something has gone wrong — either contracts are being written for internal interfaces (violating Rule 6) or tests are too granular. Refactor the suite, don't add more. + +--- + +## Honest Limitations + +1. **Contract tests won't catch race conditions.** Two users logging in simultaneously, sandbox resolution contention, R2 sync conflicts — these require different testing approaches (load testing, chaos testing, or carefully designed integration tests). + +2. **Contract tests require good mocks.** The contract test for `POST /api/auth/login` needs a mock Poe API that returns realistic responses. If the mock diverges from reality, the contract test gives false confidence. Provider contract tests against the *real* Poe API (run periodically, not on every push) mitigate this. + +3. **The maintenance burden is real but bounded.** Every API change requires updating both the implementation and the contract test. This is deliberate friction — it forces you to think about backwards compatibility. But for a solo developer moving fast, it can feel like paperwork. The mitigation is Rule 6: only contract-test external boundaries, and keep the suite small. + +4. **Contracts test shape, not semantics.** A contract test can verify that `/v1/chat/completions` returns a streaming response with the right JSON shape. It cannot verify that the *content* of the response is a sensible AI completion. For that, you need end-to-end tests with real (or realistic) LLM responses. + +5. **Not all boundaries are equal.** The CDP protocol has 50+ commands. Writing a contract test for each is not worth the investment — the CDP shim is a compatibility layer, not a core product surface. Prioritize contracts for the authentication, chat, and multi-tenant isolation boundaries. + +--- + +## Summary + +PoeClaw is a gateway. Its product is its interfaces. Contract-first testing is not an enterprise indulgence — it is the natural testing discipline for a system whose value is entirely in the promises it makes to its consumers. Define the contracts. Test them first. Let the implementation change freely behind them. diff --git a/docs/philosophy/positions/08-deploy-safety.md b/docs/philosophy/positions/08-deploy-safety.md new file mode 100644 index 000000000..c9a2c2c86 --- /dev/null +++ b/docs/philosophy/positions/08-deploy-safety.md @@ -0,0 +1,194 @@ +# Position 08: Continuous Deployment Safety + +**Advocate:** Deploy Safety Agent +**Position:** Testing exists to enable safe deployment, not the reverse + +--- + +## Core Thesis + +The purpose of every test, type check, and lint rule in this project is to answer one question: *can we ship this right now and sleep well?* A test suite that passes but leaves you afraid to deploy has failed at its actual job. Deployment confidence is the metric; tests are one input to that metric, not the metric itself. + +--- + +## Key Arguments + +### 1. `npm run deploy` Is a One-Way Door With No Guardrails + +The deploy script in `package.json` is `npm run build && wrangler deploy`. That's it. One command, and every connected client — Telegram bots, Discord bots, Slack integrations, the admin UI, every active Sandbox Durable Object — is running new code. There is no staging environment. There is no canary. There is no traffic splitting. There is no automated rollback. + +This is the central fact of the project's deployment posture: the blast radius of `wrangler deploy` is 100% of production traffic, instantly. Every testing philosophy debate is academic if the deployment mechanism itself is a loaded gun. The CI pipeline (`.github/workflows/test.yml`) runs unit tests, e2e tests across four matrix configurations (base, telegram, discord, workers-ai), lint, format check, and typecheck — but the pipeline does not deploy. Deployment is manual. There is no gate between "CI passed" and "someone ran `npm run deploy`." + +The gap between "tests pass in CI" and "production is healthy after deploy" is where incidents live. + +### 2. The Multi-Channel Architecture Multiplies Failure Modes + +This worker serves as the control plane for a container-based sandbox (Durable Object + Cloudflare Container), with multiple ingress channels: + +- **HTTP/WebSocket** via Hono routes (`src/routes/`) +- **Telegram** via bot token and webhook +- **Discord** via bot token +- **Slack** via bot + app tokens +- **Admin UI** via static assets and API routes +- **CDP endpoint** for browser automation +- **R2** for persistent storage and backup + +A regression in the auth middleware (`src/auth/middleware.ts`) doesn't just break one client — it breaks all of them simultaneously. A bug in `src/gateway/sync.ts` could corrupt R2 backups silently. A change to `src/gateway/env.ts` could misconfigure the sandbox environment for every user. + +The e2e test matrix tests telegram, discord, and base configurations in isolation. But in production, all channels are active concurrently on the same worker instance. The interaction effects between channels — shared Durable Object state, concurrent R2 access, auth middleware running for heterogeneous request patterns — are not tested by any existing mechanism. + +Post-deploy verification is the only way to confirm that the actual production deployment, with real secrets, real R2 buckets, real bot tokens, and real concurrent channel traffic, is functioning correctly. + +### 3. Container Boot Is a 1-2 Minute Cold Start With External Dependencies + +The design document acknowledges a 1-2 minute cold start: image pull, R2 restore, OpenClaw onboard, config patch, gateway start. This is not a simple HTTP endpoint where you can verify health in milliseconds. The Sandbox Durable Object must: + +1. Pull the container image +2. Restore state from R2 (`src/gateway/r2.ts`) +3. Start the Moltbot gateway process (`src/gateway/process.ts`) +4. Wait for the gateway to become ready + +Any step can fail silently. R2 credentials could be misconfigured. The container image could have changed. The gateway process could crash on startup due to an environment variable change. None of these are caught by unit tests (which mock the runtime) or by e2e tests (which use test credentials and isolated infrastructure). Only a post-deploy smoke test against the real production environment can verify this chain. + +### 4. Observability Is Enabled But Not Leveraged + +The `wrangler.jsonc` has `"observability": { "enabled": true }`, which means Cloudflare is collecting telemetry. But there is no evidence of: + +- Alerting on error rates post-deploy +- Dashboard monitoring for deploy events +- Automated comparison of pre/post-deploy metrics +- Health check endpoints that verify end-to-end functionality + +Observability without deployment correlation is just data collection. The infrastructure for safe deployment exists in Cloudflare's platform (Workers Analytics, Logpush, Tail Workers) — this project simply doesn't use it. + +### 5. The E2E Tests Are Expensive But Don't Gate Deployment + +The CI pipeline's e2e job is thorough: it deploys real infrastructure via Terraform, runs Playwright tests, records video, and posts results to PRs. This is excellent — but it runs on `push` and `pull_request` to `main`. It does not run as a pre-deploy check or post-deploy verification. Someone can merge a PR, see green e2e results, then run `npm run deploy` an hour later after other changes have been merged. + +The e2e tests prove the code *can* work in an environment similar to production. They do not prove the code *is* working in production after a specific deploy. + +--- + +## Counterarguments and Rebuttals + +### Against TDD Purists: "You're conflating testing with operations" + +**Their argument:** Testing is about design feedback and correctness. Operations is about monitoring and deployment. These are separate concerns. You're trying to smuggle ops practices into a testing philosophy discussion. + +**Rebuttal:** The separation is artificial. When a TDD practitioner writes a test, they're making a claim: "if this test passes, this behavior is correct." But correctness in what context? The test runs in Vitest with mocked Cloudflare bindings. Production runs on Cloudflare's edge with real Durable Objects, real R2 buckets, and real container orchestration. The gap between those contexts is a *testing gap*, not an ops gap. + +The TDD red-green-refactor cycle implicitly assumes that the test environment is a faithful proxy for production. When it isn't — and for a Cloudflare Worker with containers, Durable Objects, and external bot integrations, it demonstrably isn't — you need additional verification that closes the gap. Post-deploy smoke tests are tests. Health check endpoints are testable assertions. Canary analysis is automated comparison of expected vs. actual behavior. These are testing practices applied at a different layer. + +I'm not conflating testing with operations. I'm pointing out that your testing stops too early. + +### Against Type Advocates: "Deploy safety is orthogonal to type safety" + +**Their argument:** Types prevent entire categories of bugs at compile time. If the types are right, the code is right. Deploy safety is a separate concern that doesn't invalidate the value of static analysis. + +**Rebuttal:** I agree that types are valuable — and I agree they're insufficient. Consider: `wrangler.jsonc` defines secrets like `ANTHROPIC_API_KEY`, `MOLTBOT_GATEWAY_TOKEN`, `R2_ACCESS_KEY_ID`. TypeScript can ensure these are declared in the `Env` interface. It cannot ensure they are set correctly in production. A type-correct deploy with a missing secret is still a broken deploy. + +More precisely: the `MoltbotEnv` type in `src/types.ts` declares what the environment *should* look like. The runtime environment is what it *actually* looks like. Types enforce the former. Deploy safety verifies the latter. They are complementary, not orthogonal — because a type-safe program that crashes on a missing secret at runtime has not delivered on the promise that type safety implies. + +The `validateRequiredEnv` function in `src/index.ts` is already an admission that types alone aren't enough for environment correctness. Deploy safety extends this principle to the full production stack. + +### Against Integration Testers: "We already test the system" + +**Their argument:** The e2e test suite deploys real infrastructure, runs real Terraform, uses real Cloudflare accounts, and tests real browser interactions across multiple configurations. This *is* system-level verification. + +**Rebuttal:** The e2e suite is impressive — and it tests a *different system* than production. The e2e tests use: + +- `E2E_CLOUDFLARE_API_TOKEN` (not the production API token) +- `E2E_CF_ACCOUNT_ID` (possibly the same, possibly different) +- `E2E_R2_ACCESS_KEY_ID` / `E2E_R2_SECRET_ACCESS_KEY` (separate R2 credentials) +- `fake-telegram-bot-token-for-e2e` / `fake-discord-bot-token-for-e2e` +- An isolated worker name (via `E2E_TEST_RUN_ID`) + +This is a parallel universe that resembles production. It is not production. The e2e tests prove that the *code* works on Cloudflare's platform. They do not prove that *this specific deploy*, with *these specific secrets*, to *this specific worker name*, is healthy. + +Furthermore, the e2e `continue-on-error: true` on the test step means the pipeline collects results even on failure — the failure is only enforced by a separate "Fail if E2E tests failed" step. This is good for video recording, but it means the e2e signal flows through a more complex path than a simple pass/fail gate. + +I'm not saying integration tests are worthless. I'm saying they answer a different question than "is production healthy right now?" + +### Against Pragmatists: "Monitoring isn't testing" + +**Their argument:** You're dressing up monitoring and ops practices as a testing philosophy. Monitoring is reactive. Testing is proactive. They serve different purposes. + +**Rebuttal:** The distinction between proactive and reactive breaks down at the deployment boundary. A post-deploy smoke test that runs 30 seconds after `wrangler deploy` and hits the production health endpoint is proactive — it catches problems before users do. A canary release that routes 5% of traffic to the new version and compares error rates is proactive — it limits blast radius before full rollout. + +Monitoring becomes reactive when it's the *only* safety net. I'm not proposing monitoring instead of testing. I'm proposing a deployment pipeline where: + +1. Tests (unit, integration, e2e) gate the merge +2. Deploy is automated, not manual +3. Post-deploy smoke tests verify production health immediately +4. Monitoring compares pre/post-deploy error rates +5. Automated rollback triggers if health checks fail + +This is a *testing pipeline that extends through deployment*, not monitoring replacing testing. + +--- + +## Specific Deployment Safety Measures for This Project + +### Immediate (No Architecture Changes) + +1. **Health check endpoint:** Add `GET /healthz` that verifies: + - Worker is responding + - Sandbox Durable Object can be instantiated + - R2 bucket is accessible (list with limit 1) + - Auth middleware loads correctly + +2. **Post-deploy smoke script:** A shell script that runs after `wrangler deploy`: + ```bash + # deploy-and-verify.sh + npm run build && wrangler deploy + sleep 5 + curl -sf https://YOUR_WORKER.workers.dev/healthz || { echo "DEPLOY FAILED"; wrangler rollback; exit 1; } + ``` + +3. **Deploy script upgrade:** Replace `"deploy": "npm run build && wrangler deploy"` with a script that includes post-deploy verification. + +4. **Wrangler rollback awareness:** Document and test `wrangler rollback` so the team knows exactly how to revert a bad deploy. + +### Short-Term (CI Pipeline Changes) + +5. **Deploy-from-CI only:** Add a deploy workflow that runs *only* after the test workflow succeeds, eliminating the manual deploy gap. + +6. **Post-deploy e2e in CI:** After deploy, re-run a minimal smoke subset of the e2e suite against production (not the test infrastructure). + +7. **Deploy notifications:** Post to the team's Slack/Discord/Telegram channels when a deploy happens, what commit it includes, and whether smoke tests passed. + +### Medium-Term (Architecture) + +8. **Gradual rollout via Workers versions:** Use Cloudflare's [Gradual Rollouts](https://developers.cloudflare.com/workers/configuration/versions-and-deployments/) to deploy to a percentage of traffic first. + +9. **Tail Worker for deploy monitoring:** A Tail Worker that captures error events from the main worker and alerts on spikes correlated with deploys. + +10. **Structured health reporting:** The `/healthz` endpoint returns structured JSON with component-level status, enabling automated comparison over time. + +--- + +## Proposed Rules + +1. **No manual deploys.** All production deploys go through CI. The `npm run deploy` script is for local development only. Production deploys require the full test suite to pass first. + +2. **Every deploy is verified.** Post-deploy smoke tests are not optional. If the smoke test fails, the deploy is considered failed and rollback is initiated — automatically if possible, manually with documentation if not. + +3. **Health checks are tests.** The `/healthz` endpoint is a first-class test artifact. It is maintained with the same rigor as unit tests. If it returns 200, the system is healthy. If it doesn't, the deploy is bad. + +4. **Blast radius is bounded.** Use gradual rollouts when available. Never deploy 100% of traffic to untested code. The current `wrangler deploy` behavior (instant 100% rollout) is a known risk to be mitigated. + +5. **Deploy events are observable.** Every deploy is logged, timestamped, and correlated with the git commit. Post-deploy metrics (error rate, latency, R2 operation success rate) are compared against pre-deploy baselines. + +6. **Rollback is practiced.** The team runs `wrangler rollback` at least once per quarter on a non-critical change to ensure the rollback path works and everyone knows how to execute it. + +7. **Channel health is verified independently.** Post-deploy checks verify each channel (HTTP, Telegram, Discord, Slack) independently, because a deploy can break one channel while leaving others functional. + +--- + +## Summary + +The testing philosophy debate in this project is incomplete without addressing the deployment boundary. We have unit tests, type checks, linting, and an impressive e2e suite — but none of them answer the question that matters most: "is production healthy after this deploy?" + +The answer to that question requires mechanisms that extend beyond the test suite: health endpoints, post-deploy verification, gradual rollouts, automated rollback, and deploy-correlated monitoring. These are not operations concerns smuggled into a testing debate. They are the completion of the testing story — the final link in the chain from "the code is correct" to "the users are served." + +A test suite that gives you confidence to merge is good. A deployment pipeline that gives you confidence to ship is better. This project needs both. diff --git a/docs/philosophy/positions/09-anti-mock.md b/docs/philosophy/positions/09-anti-mock.md new file mode 100644 index 000000000..60c62273a --- /dev/null +++ b/docs/philosophy/positions/09-anti-mock.md @@ -0,0 +1,230 @@ +# Position 09: Anti-Mock Realism + +**Agent:** Anti-Mock Realist +**Date:** 2026-02-15 +**Project:** PoeClaw/OpenClaw (Cloudflare Workers + Sandbox Containers) + +--- + +## Core Thesis + +Mocks are executable lies. Every mock encodes an assumption about how an external system behaves, and that assumption ossifies into the test suite while the real system continues to evolve, drift, and surprise. In a project like PoeClaw — where the dominant source of production bugs is the behavioral gap between the Cloudflare Sandbox API and what developers expect it to do — a mock-heavy test suite doesn't test your code against reality; it tests your code against your beliefs about reality. When those beliefs are wrong, the tests still pass, the deploy goes green, and the bug ships. + +--- + +## Key Arguments + +### 1. The Sandbox API's Most Important Behaviors Are Precisely What Mocks Cannot Capture + +The AGENTS.md file documents four critical behavioral quirks of the sandbox API, and every single one of them is invisible to our mock-based tests: + +- **`proc.status` timing**: The sandbox API's `proc.status` may not update immediately after a process completes. AGENTS.md explicitly warns: "Instead of checking `proc.status === 'completed'`, verify success by checking for expected output." Yet `createMockProcess()` in `test-utils.ts:26-36` returns an instantly-settled status. The mock process is born `completed` with `exitCode: 0` — it has never been in a liminal state. The real bug class — code that reads status too early and acts on stale data — is structurally undetectable. + +- **s3fs timestamp incompatibility**: s3fs doesn't support setting timestamps, causing `rsync -a` to fail with "Input/output error." This was a real production bug that required changing to `rsync -r --no-times`. The old s3fs-based mocks would have passed with `rsync -a` just fine. Now we use rclone, but `createMockExecResult()` returns `{ success: true, exitCode: 0 }` regardless of what command string was passed. A test could assert `rclone sync` with completely fabricated flags and the mock would nod approvingly. + +- **Mount state detection**: AGENTS.md says "Don't rely on `sandbox.mountBucket()` error messages to detect 'already mounted' state. Instead check `mount | grep s3fs`." The mock sandbox doesn't even include `mountBucket` — it was presumably dropped when rclone replaced s3fs. But the lesson stands: the mock had no opinion about mount semantics, so it couldn't have caught the bug. + +- **`waitForPort` race conditions**: The real code comment at `process.ts:71-73` is damning: "a process can be 'running' but not ready yet (e.g., just started by another concurrent request). Using a shorter timeout causes race conditions where we kill processes that are still initializing." The mock's `waitForPort: vi.fn()` resolves instantly. It cannot reproduce the race condition that motivated the 3-minute timeout. + +### 2. Sequential Mock Chains Encode Fragile Ordering Assumptions + +Consider `sync.test.ts:47-54`, which tests `syncToR2`: + +```typescript +execMock + .mockResolvedValueOnce(createMockExecResult('yes')) // rclone configured + .mockResolvedValueOnce(createMockExecResult('openclaw')) // config detect + .mockResolvedValueOnce(createMockExecResult()) // rclone sync config + .mockResolvedValueOnce(createMockExecResult()) // rclone sync workspace + .mockResolvedValueOnce(createMockExecResult()) // rclone sync skills + .mockResolvedValueOnce(createMockExecResult()) // date > last-sync + .mockResolvedValueOnce(createMockExecResult(timestamp)); // cat last-sync +``` + +This is seven sequentially-ordered mock responses for a single function. The test is a script that says "if you call exec exactly seven times, in exactly this order, I will return these values." This has several fatal properties: + +- **It's a change detector, not a behavior test.** If the implementation reorders workspace and skills sync (which are logically independent), the mock chain breaks. If someone adds a health check exec call between steps 2 and 3, every mock index shifts and the test fails — not because the code is wrong, but because the script deviated from the mock's choreography. + +- **It cannot represent partial failure.** Real rclone sync can succeed for config but fail for workspace due to network flake, then succeed for skills. The mock chain either works perfectly or injects a single failure at a specific index. The combinatorial space of real partial-failure modes is vast; the mock explores exactly one path per test. + +- **It cannot represent timing.** In reality, `sandbox.exec()` has variable latency. The 120-second timeout on sync operations (`sync.ts:58`) exists because rclone can legitimately take minutes on large workspaces. The mock resolves in microseconds. Any timeout-sensitive logic is untestable. + +### 3. The Auth Mocks Replace a Security Boundary With a Rubber Stamp + +The JWT verification tests mock the entire `jose` library: + +```typescript +vi.mock('jose', () => ({ + createRemoteJWKSet: vi.fn(() => 'mock-jwks'), + jwtVerify: vi.fn(), +})); +``` + +This replaces the JWKS-fetching, cryptographic-verification, and claim-validation pipeline with `vi.fn()`. The test then programs `jwtVerify` to return a payload and asserts that the code reads the email from it. But: + +- The mock cannot catch a real JWKS endpoint returning unexpected key formats. +- The mock cannot catch clock skew issues with `exp`/`iat` claims (the test uses `Math.floor(Date.now() / 1000) + 3600`, which will always be valid). +- The mock cannot catch the Cloudflare Access team domain URL construction edge cases (with/without `https://` prefix) in the actual HTTP call — only in the string-construction unit logic. +- `createRemoteJWKSet` returns the string `'mock-jwks'`. In production, it returns a function. If any code path ever inspects or calls the JWKS set directly rather than passing it through `jwtVerify`, the mock would not catch the type mismatch. + +For a security-critical path, this level of mocking is not testing authentication — it's testing that your code can read properties from an object. + +### 4. `createMockSandbox` Encodes a Frozen API Surface That Silently Diverges + +The mock sandbox in `test-utils.ts:62-90` implements six methods: `listProcesses`, `startProcess`, `containerFetch`, `exec`, `writeFile`, and `wsConnect`. The cast `as unknown as Sandbox` explicitly discards type safety: + +```typescript +const sandbox = { + listProcesses: listProcessesMock, + // ... 5 more methods +} as unknown as Sandbox; +``` + +When the real `@cloudflare/sandbox` SDK adds a new method (say `readFile`, or `getMetrics`), the mock won't break — it simply won't have the method, and any test using it will get `undefined` instead of a function. The `as unknown as Sandbox` double-cast means TypeScript won't catch this either. The mock is a snapshot of the API as someone understood it at one point in time, with no mechanism to detect staleness. + +### 5. Mocks Make the Test Suite a Mirror of the Implementation, Not a Specification of Behavior + +Look at `r2.test.ts:52-71`. The test for "writes rclone config when not configured" is: + +```typescript +execMock + .mockResolvedValueOnce(createMockExecResult('no')) // flag check + .mockResolvedValueOnce(createMockExecResult()) // mkdir + .mockResolvedValueOnce(createMockExecResult()); // touch flag +``` + +This doesn't test "can the container access R2?" It tests "does `ensureRcloneConfig` call `exec` three times and `writeFile` once in the expected order?" The test is a line-by-line transliteration of the implementation into mock-language. If someone refactored `ensureRcloneConfig` to use a single `exec` call with `&&`-chained commands (functionally equivalent), the test would fail. If someone introduced a subtle bug in the rclone config format that R2 rejects (wrong endpoint format, missing `no_check_bucket`), the test would pass — because `writeFile` is mocked and never validates the content against R2's actual requirements. + +--- + +## Counterarguments and Rebuttals + +### Against TDD Purists: "Mocks Enable Fast Feedback" + +**Their argument:** Mocks let you run the test suite in milliseconds. Fast feedback loops accelerate development. Real dependencies make tests slow and flaky. + +**Rebuttal:** Fast feedback on the wrong thing is not feedback — it's false confidence. The PoeClaw test suite runs in ~2 seconds. It "covers" rclone sync, JWT verification, process management, and R2 configuration. But when the team discovered that `proc.status` doesn't update immediately, that s3fs can't handle timestamps, and that `waitForPort` needs a 3-minute timeout for race conditions — none of those discoveries came from the test suite. They came from production. The fast feedback loop told the team "everything works" while the slow feedback loop (production) told them "nothing works the way you assumed." + +I do not argue for eliminating all fast tests. Pure functions like `buildEnvVars` and `detectConfigDir` (with real exec output) are fine. The problem is mocking the *boundaries* — the sandbox API, the auth pipeline, the storage layer — where the real bugs live. + +### Against Type Advocates: "Typed Mocks at Least Match the Interface" + +**Their argument:** If the mock implements the same TypeScript interface as the real dependency, you get compile-time guarantees that the contract is respected. + +**Rebuttal:** PoeClaw's own codebase refutes this. `createMockSandbox` uses `as unknown as Sandbox` — a double-cast that explicitly bypasses the type system. The mock doesn't implement the `Sandbox` interface; it impersonates it via type erasure. Even if the cast were removed, TypeScript interfaces describe *shape*, not *behavior*. The `Sandbox` interface says `exec` returns `Promise`. It doesn't say "exec can take 120 seconds," "exec might fail silently if the container is under memory pressure," or "exec output may include unexpected ANSI escape codes." The behavioral contract — the part that causes bugs — is not in the types. + +### Against Integration Testers: Distinguishing Contract Testing from Reality Testing + +**Their argument (my natural allies):** "We agree — test against real dependencies at the integration layer." + +**My distinction:** Integration testing often still uses controlled, sanitized environments. A staging Cloudflare Sandbox is not a production Cloudflare Sandbox. The quirks documented in AGENTS.md — timing, mount semantics, process state — may differ between environments. My position is stronger: where feasible, use the *actual* Cloudflare Sandbox API in tests, running against the same infrastructure that production uses. The e2e tests in `test/e2e/` already do this with `cctr` and real containers. That's the right direction. The gap is that the unit tests mock everything, and the e2e tests are run rarely and manually. + +The ideal is a layered approach: (1) no-mock unit tests for pure logic, (2) real-sandbox contract tests for API interactions, (3) e2e smoke tests for full flows. Layer 2 is currently missing entirely. + +### Against Pragmatists: "Real Dependencies Are Slow and Unreliable" + +**Their argument:** Running tests against real Cloudflare Sandbox containers takes 1-2 minutes for cold start alone. It requires real credentials, real network access, and real money. Tests become flaky due to network issues, rate limits, and shared state. + +**Rebuttal:** I acknowledge this is the strongest counterargument, and I do not dismiss it. The cost is real. But consider the alternative cost: the team has already documented at least four classes of bugs (in AGENTS.md) that were discovered in production because tests couldn't catch them. Each of these bugs required emergency debugging in a live system. The 20-second `CLI_TIMEOUT_MS` and 180-second `STARTUP_TIMEOUT_MS` constants in the codebase are scar tissue from production incidents that mocked tests couldn't predict. + +My practical proposal is not "run the full sandbox in CI on every commit." It is: + +1. **Record-replay for sandbox interactions.** Capture real `exec` output, real timing, real error messages from actual sandbox sessions. Replay them in tests. This is still faster than mocks to write (no manual mock construction) and captures real behavioral quirks. +2. **Contract tests on nightly or merge.** Run a focused set of tests against a real sandbox in CI on merge to main. This catches API drift. +3. **Reserve mocks for pure logic.** `buildEnvVars`, string parsing, config validation — these don't need a sandbox. Test them without mocks, with real data structures. + +### Against Deploy Safety Advocates: "Production Testing Validates What Mocks Can't" + +**Their argument (partial ally):** Production is the ultimate test environment. Canary deploys, feature flags, and observability catch what pre-production tests miss. + +**My distinction:** I agree that production testing is necessary, but it is not sufficient as the *only* validation of external behavior. Production testing tells you "it's broken" after deployment. Real-dependency testing tells you "it's broken" before deployment. The ideal is both: pre-merge contract tests against real sandbox APIs to catch behavioral drift early, plus production observability to catch the long tail of issues that only manifest under real load and real data. + +The deploy safety position is about how to recover from failures. My position is about how to prevent failures from reaching production in the first place. + +--- + +## Specific Examples: Where Mocks Hide Real Bugs + +### Example 1: The `proc.status` Stale Read + +**Bug class:** Code checks `proc.status === 'running'` immediately after `startProcess()`. In production, the status may still be `'starting'` or even undefined for a brief window. The mock at `test-utils.ts:31` returns `status: 'completed'` by default, and `createFullMockProcess` at `process.test.ts:10` returns `status: 'running'`. Neither reproduces the transient states. + +**What would catch it:** A real sandbox test where `startProcess()` is called and status is polled over time, revealing the actual state transition sequence. + +### Example 2: The rclone Config Format + +**Bug class:** `ensureRcloneConfig` writes an rclone config to the container via `sandbox.writeFile()`. The config includes `no_check_bucket = true`. If this flag were misspelled (`no_check_buket = true`) or if a future rclone version changed the option name, the mock test would still pass — `writeFileMock` accepts any string. The bug would surface only when `rclone sync` fails in production with a cryptic error. + +**What would catch it:** A test that writes the config and then runs `rclone lsd r2:` to verify the config is parseable and the connection works. + +### Example 3: The Exec Output Parsing + +**Bug class:** `detectConfigDir` in `sync.ts:24-32` parses the stdout of a shell command that chains `test -f ... && echo ... || ...`. The mock returns clean strings like `'openclaw'` or `'clawdbot'`. Real exec output might include trailing newlines, ANSI escape codes, or stderr leaking into stdout. The `.trim()` on line 28 handles newlines, but other output contamination would break the `=== 'openclaw'` comparison silently. + +**What would catch it:** Running the actual shell command in a real container and parsing the actual output. + +### Example 4: The 120-Second Timeout + +**Bug class:** `syncToR2` passes `{ timeout: 120000 }` to `sandbox.exec()` for rclone operations (`sync.ts:58`). The mock ignores this parameter entirely. If the timeout were accidentally removed, or if rclone sync legitimately exceeds 120 seconds for a large workspace, the mock test passes. The real behavior — does the sandbox API actually respect the timeout parameter? Does it kill the process or let it run? — is unknown to the test suite. + +**What would catch it:** A real sandbox test that syncs a non-trivial amount of data and verifies the timeout behavior. + +### Example 5: The `waitForPort` Mode + +**Bug class:** `process.ts:76` calls `waitForPort(MOLTBOT_PORT, { mode: 'tcp', timeout: STARTUP_TIMEOUT_MS })`. The mock's `waitForPort: vi.fn()` ignores both the mode and the timeout. If the sandbox API doesn't support `mode: 'tcp'` (or if it defaults to HTTP and the gateway isn't HTTP-ready at that point), the mock wouldn't know. Real discovery of this would require a real container where the port is open at TCP level but not yet HTTP-ready. + +--- + +## Proposed Rules for This Project + +### Rule 1: No Mocking of `@cloudflare/sandbox` API Methods + +The sandbox is the core external dependency. Its behavioral quirks are the primary source of production bugs. Mocking it creates a parallel universe where those quirks don't exist. + +**Instead:** For unit tests, extract pure logic into functions that take plain data (not `Sandbox` objects). For integration tests, use a real sandbox or recorded real outputs. + +### Rule 2: No Mocking of Security Boundaries + +JWT verification, JWKS fetching, and Cloudflare Access validation must not be mocked. These are the authentication perimeter. + +**Instead:** Use real (test) JWTs signed with known keys. Test the full `jose` verification pipeline with test keys, not `vi.fn()`. + +### Rule 3: Record-Replay Over Hand-Written Mocks + +When real dependencies are too slow for CI, capture real interaction transcripts and replay them. A recorded `ExecResult` from a real `rclone sync` command carries the actual stdout format, actual timing characteristics, and actual error messages. + +**Instead of:** +```typescript +execMock.mockResolvedValueOnce(createMockExecResult('yes')) +``` + +**Use:** +```typescript +execMock.mockResolvedValueOnce(recordedExecResult('rclone-config-check-positive')) +``` + +Where the recorded result was captured from an actual sandbox session, warts and all. + +### Rule 4: Pure Logic Needs No Mocks + +Functions like `buildEnvVars`, `getR2BucketName`, and `rcloneRemote` are pure transformations. Test them with real data structures, not mocked environments. `createMockEnv` is acceptable for constructing test data, but the function under test should take the data directly, not a mocked service. + +### Rule 5: Contract Tests Run on Every Merge to Main + +A focused set of tests that call the real Cloudflare Sandbox API must run before code reaches production. These tests verify: +- `sandbox.exec()` returns the expected `ExecResult` shape with real stdout/stderr +- `sandbox.startProcess()` status transitions behave as documented +- `sandbox.writeFile()` content is readable back via `sandbox.exec('cat ...')` +- `waitForPort` actually waits and times out as specified + +### Rule 6: Delete Mock Infrastructure That Encodes Behavioral Assumptions + +`createMockProcess` with a default `status: 'completed'` is a lie about process lifecycle. `createMockExecResult` with a default `exitCode: 0` is a lie about command success rates. If these utilities must exist during transition, they should be marked `@deprecated` with comments explaining what real behavior they fail to capture. + +--- + +## Conclusion + +The PoeClaw test suite is a monument to good intentions: high coverage, fast execution, clear assertions. But it tests the developer's mental model of the Cloudflare Sandbox, not the Cloudflare Sandbox itself. The four documented quirks in AGENTS.md are four bugs that passed through a mock-heavy test suite and were discovered in production. The mock didn't fail because the code was correct — the mock succeeded because the mock was wrong. + +Real systems are strange. They have timing windows, partial failures, format surprises, and behavioral evolution. A mock is a bet that none of that matters. In a project where the entire value proposition depends on orchestrating a container runtime with known quirks, that bet is irresponsible. Test with real dependencies, or accept that your green test suite is a green lie. diff --git a/docs/philosophy/positions/10-guardrails.md b/docs/philosophy/positions/10-guardrails.md new file mode 100644 index 000000000..5b8f843fe --- /dev/null +++ b/docs/philosophy/positions/10-guardrails.md @@ -0,0 +1,274 @@ +# Position 10: CI Guardrails & Static Analysis + +**Author:** Guardrails Advocate +**Position:** Linting, formatting, type-checking, and CI guardrails prevent more bugs than tests do, at lower cost. + +--- + +## Core Thesis + +Static analysis and CI guardrails eliminate entire *categories* of defects — unused bindings, type mismatches, unreachable code, import errors, formatting drift — before a single test executes, and they do so with zero ongoing authoring cost after initial configuration. A well-configured lint/format/typecheck pipeline is the highest-ROI quality investment because it operates at the speed of the compiler, scales to every line of code without per-feature effort, and catches the most *frequent* classes of error in a TypeScript codebase. Tests remain essential for verifying business logic, but they are the *second* layer of defense, not the first. + +--- + +## Key Arguments + +### 1. The Existing Pipeline Already Proves the Point + +Examine the CI job ordering in `.github/workflows/test.yml`: + +```yaml +- name: Lint + run: npm run lint +- name: Format check + run: npm run format:check +- name: Type check + run: npm run typecheck +- name: Run tests + run: npm test +``` + +Lint, format, and typecheck run *before* tests — sequentially, as gates. This is not accidental. The project's own architecture acknowledges that these checks are prerequisites: there is no value running a test suite against code that doesn't type-check or contains lint violations. The pipeline already embodies the guardrails-first philosophy. The question is whether we invest further in strengthening these gates or divert effort toward writing more tests for defects the gates already prevent. + +### 2. Oxlint's Plugin Architecture Covers Vast Surface Area at Near-Zero Marginal Cost + +The `.oxlintrc.json` enables six plugin families: `react`, `typescript`, `unicorn`, `oxc`, `import`, `vitest`. Together these enforce: + +- **Correctness** (error): Guaranteed failures for provably wrong code — unreachable branches, invalid regex, incorrect API usage. +- **Suspicious** (warn): Likely-buggy patterns — unnecessary type assertions, confusing operator precedence, assignments in conditions. +- **Perf** (warn): Performance anti-patterns — unnecessary spreads, inefficient iterations. +- **Import** hygiene: Missing or circular imports detected before runtime. +- **Vitest** rules: Test-file-specific correctness (e.g., no focused tests leaking to CI). + +Each plugin adds hundreds of rules. Enabling a new plugin is a one-line config change that retroactively covers the entire `src/` tree. Contrast this with tests: every new behavior requires a new test function, new assertions, and ongoing maintenance. The cost curves are fundamentally different — guardrails are O(1) to add, tests are O(n) where n is the number of behaviors. + +### 3. TypeScript Strict Mode Is Already the Project's Most Powerful Bug Preventer + +`tsconfig.json` sets `"strict": true`, which activates: + +- `strictNullChecks` — eliminates an entire class of null/undefined runtime errors +- `strictFunctionTypes` — prevents contravariant function assignment bugs +- `noImplicitAny` — forces explicit typing, which the AGENTS.md mandates ("Explicit types preferred over inference") +- `strictPropertyInitialization` — catches uninitialized class properties + +Tony Hoare called null references a "billion-dollar mistake." `strictNullChecks` alone prevents more production crashes than most test suites catch. The compiler has *perfect* coverage of every code path — no test suite achieves this. Combined with `noEmit: true` and `isolatedModules: true`, TypeScript serves purely as a static verifier, exactly the role this position advocates: the compiler as the first and most complete line of defense. + +### 4. Formatting Enforcement Eliminates an Entire Category of Review Friction + +`oxfmt --check src/` in CI means formatting debates never happen. No PR comment will ever say "add a newline here" or "use consistent indentation." This is not a trivial benefit. Studies on code review consistently find that *stylistic* comments are the most frequent and least valuable category of feedback. By automating formatting, reviewer attention is freed for logic, architecture, and correctness — the areas where human judgment is irreplaceable. + +Moreover, consistent formatting makes `git blame` meaningful. Without format enforcement, a single reformatting commit pollutes the blame history for entire files. With `oxfmt` gating CI, every line's blame points to the commit that changed its *semantics*, not its whitespace. + +### 5. Guardrails Scale With Zero Marginal Author Effort + +When a new contributor adds a route handler in `src/routes/`, the guardrails automatically apply: + +- Oxlint checks for unused variables, suspicious patterns, and incorrect imports. +- Oxfmt enforces the project's formatting conventions. +- `tsc --noEmit` verifies type correctness against the strict configuration. + +No test needs to be written for these checks to apply. No reviewer needs to remember to verify these properties. The guardrails are *passive* — they protect the codebase by default. Tests are *active* — they only protect what someone explicitly chose to test. In a project with AI-assisted contributions (which CONTRIBUTING.md explicitly addresses), passive guardrails are especially critical because they catch the classes of errors that AI code generators most frequently produce: type mismatches, unused imports, inconsistent formatting. + +--- + +## Counterarguments and Rebuttals + +### Against TDD Purists: "Guardrails don't verify behavior" + +**The objection:** Static analysis can tell you the code compiles and follows rules, but it cannot tell you the code *does the right thing*. Only tests verify behavior. + +**The rebuttal:** Correct, but this overstates the gap. Consider what "behavior" means in practice: + +1. **Type-level behavior** is verified by the compiler. If a function's signature says it returns `Response`, it returns `Response`. If a variable is `string | undefined`, every consumer must handle both cases. This *is* behavioral verification — it's just expressed in the type system rather than in assertions. + +2. **The majority of bugs in TypeScript codebases are not logical errors** — they are wiring errors: wrong imports, mismatched types, unused variables that indicate incomplete refactors, null dereferences. Guardrails catch these systematically. TDD catches them case-by-case if someone happens to write a test that exercises the specific path. + +3. **Guardrails and tests are not competing for the same budget.** Guardrails are a one-time configuration cost. Tests are an ongoing authoring cost. Investing in guardrails does not diminish test coverage — it *frees* testing effort to focus on the genuinely behavioral questions that static analysis cannot answer. + +The TDD position is correct that business logic requires tests. But TDD is an *authoring methodology*, not a *quality assurance strategy*. Guardrails are a quality assurance strategy that works regardless of authoring methodology. + +### Against Type System Advocates: Natural Allies, But We Go Further + +**The objection:** Types already do what you're describing. Why add linting and formatting on top? + +**The rebuttal:** Types are necessary but not sufficient. The type system cannot detect: + +- **Dead code and unused exports** — oxlint's `no-unused-vars` catches variables the type checker silently accepts. +- **Suspicious patterns** — `if (x = 5)` type-checks perfectly. Oxlint's `suspicious` category flags it. +- **Performance anti-patterns** — The type system has no performance model. Oxlint's `perf` rules catch patterns like unnecessary object spreads in hot paths. +- **Import hygiene** — Circular imports type-check but cause runtime initialization failures in ESM. The `import` plugin detects these. +- **Test-specific correctness** — The `vitest` plugin catches `.only` tests that would silently skip the rest of the suite in CI. +- **Formatting consistency** — Types say nothing about readability. Oxfmt enforces it. + +The type system is the *foundation* of the guardrails stack. Linting and formatting are the *superstructure*. They complement rather than compete. This position advocates for the full stack, not types alone. + +### Against Integration Testers: "Static analysis can't test runtime behavior" + +**The objection:** This is a Cloudflare Worker that proxies WebSocket connections, manages container lifecycles, and interacts with R2 storage. Static analysis can't verify any of that works. You need integration tests against real infrastructure. + +**The rebuttal:** This is the strongest counterargument, and it deserves an honest answer: **integration tests are irreplaceable for verifying infrastructure interactions.** The E2E test matrix in `test.yml` — testing base, telegram, discord, and workers-ai configurations against real Cloudflare infrastructure — is exactly the right approach for verifying runtime behavior. + +However, the integration testing argument actually *strengthens* the guardrails position: + +1. **Integration tests are expensive.** The E2E job has a 20-minute timeout, requires Terraform, Playwright, cloud credentials, and video recording infrastructure. Each run consumes real cloud resources. You want to run these tests *only* on code that has already passed every cheap check. Guardrails are the filter that prevents wasting expensive integration test cycles on code that won't even type-check. + +2. **Integration test failures are hard to diagnose.** When an E2E test fails, the failure could be in the code, the infrastructure, the test itself, or a timing issue. When a lint check fails, the error message points to the exact line and explains the problem. Guardrails produce *actionable* failures; integration tests often produce *ambiguous* ones. + +3. **Integration tests have low coverage-per-test.** Each E2E scenario tests one path through a complex system. Guardrails check every path the compiler can see. The correct strategy is: maximize guardrail coverage (cheap, complete) to reduce the surface area that integration tests (expensive, partial) must verify. + +### Against Pragmatists: "Guardrails ARE the pragmatic choice" + +**The objection:** We agree, mostly. But don't over-invest in tooling configuration when you could be shipping features. + +**The rebuttal:** This is less a counterargument than a calibration question, and it's fair. The pragmatist and guardrails positions are natural allies. The key distinction: + +1. **Guardrails have front-loaded cost and near-zero ongoing cost.** Configuring oxlint plugins, setting up CI gates, adding pre-commit hooks — these are one-time investments. Once configured, they protect every subsequent commit forever. The pragmatist should recognize this as the *most* pragmatic investment: highest long-term ROI with minimal maintenance burden. + +2. **The risk of under-investment in guardrails is invisible.** If you skip a test, you might get a bug report. If you skip a guardrail, you get a slow accumulation of tech debt — unused imports, inconsistent formatting, type assertions that hide real errors — that makes the codebase progressively harder to work with. The pragmatist optimizes for shipping speed; guardrails *preserve* shipping speed over time. + +3. **The project already has the guardrails infrastructure.** Oxlint, oxfmt, and `tsc --noEmit` are configured and running in CI. The marginal cost of strengthening these (adding rules, adding pre-commit hooks) is tiny compared to the initial setup cost that's already been paid. + +### Against Property Testers: "Linters can't find logical errors" + +**The objection:** Property-based testing can discover invariant violations, edge cases, and logical errors that no amount of static analysis will find. Guardrails operate on syntax and types; property tests operate on semantics. + +**The rebuttal:** This is correct and important. Property testing and guardrails address orthogonal concerns: + +- **Guardrails** verify *structural* correctness: the code is well-formed, consistently styled, type-safe, and free of known anti-patterns. +- **Property tests** verify *semantic* correctness: the code's behavior satisfies stated invariants across a wide input space. + +These are complementary, not competing. The guardrails position does not claim to replace property testing for semantic verification. Rather, it claims that: + +1. **Structural defects are more common than logical defects** in a well-typed TypeScript codebase. The majority of PR-blocking CI failures in projects with strong guardrails are type errors and lint violations, not test failures — because the guardrails catch the most frequent error classes before tests run. + +2. **Guardrails improve property test quality.** When property tests aren't cluttered with failures from type mismatches or import errors, their signal-to-noise ratio improves. Clean static analysis output means every test failure represents a genuine semantic issue worth investigating. + +3. **The marginal cost of adding a lint rule is lower than the marginal cost of adding a property test.** Both are valuable; guardrails are cheaper. + +--- + +## Specific Guardrails to Add to This Project + +### 1. Pre-commit Hooks via Lefthook (or Husky + lint-staged) + +The project currently has **no local pre-commit enforcement** — all checks run only in CI. This means developers push commits, wait for CI, discover lint/format failures, fix them, and push again. This wastes CI cycles and developer time. + +**Recommendation:** Add `lefthook` (Rust-based, zero-dependency) with staged-file-only checks: + +```yaml +# .lefthook.yml +pre-commit: + parallel: true + commands: + lint: + glob: "*.{ts,tsx}" + run: oxlint {staged_files} + format: + glob: "*.{ts,tsx}" + run: oxfmt --check {staged_files} + typecheck: + run: tsc --noEmit +``` + +This catches failures in seconds locally rather than minutes in CI. The fast feedback loop is the single highest-value guardrail improvement available. + +### 2. Promote Suspicious Warnings to Errors + +The current `.oxlintrc.json` sets `"suspicious": "warn"`. Warnings are ignored. Promote to error: + +```json +{ + "categories": { + "correctness": "error", + "suspicious": "error", + "perf": "warn" + } +} +``` + +Suspicious patterns — assignments in conditions, confusing operator precedence, unnecessary type assertions — are bugs-in-waiting. A warning that doesn't block CI is a suggestion, not a guardrail. + +### 3. Promote `no-unused-vars` to Error + +Currently `"no-unused-vars": "warn"`. Unused variables are the #1 indicator of incomplete refactors and dead code paths. This should be an error: + +```json +{ + "rules": { + "no-unused-vars": "error" + } +} +``` + +### 4. Add `oxlint` Security Plugin + +Oxlint supports a `security` category. Enable it: + +```json +{ + "categories": { + "correctness": "error", + "suspicious": "error", + "security": "error", + "perf": "warn" + } +} +``` + +For a Worker that handles authentication (JWT, JWKS in `src/auth/`), proxies WebSocket connections, and manages cloud infrastructure credentials, security-focused static analysis is not optional. + +### 5. CI Caching for Faster Guardrail Feedback + +The current CI uses `cache: npm` for Node.js dependencies but doesn't cache oxlint or oxfmt binaries. Since these are Rust binaries installed via npm, they're re-downloaded on every run. Explicit caching or using a pre-built action would reduce CI feedback time. + +### 6. Branch Protection Rules + +Require the `unit` job (which includes lint, format, typecheck, and test gates) to pass before merging any PR. If not already configured, this turns the CI pipeline from advisory to mandatory. + +### 7. Commit Message Linting + +Add `commitlint` or a lightweight equivalent to enforce conventional commits. This improves `git log` readability, enables automated changelog generation, and catches low-effort commit messages from drive-by contributions — a concern explicitly raised in CONTRIBUTING.md regarding AI-generated PRs. + +--- + +## Proposed Rules for This Project + +### Rule 1: No Code Merges Without Green Guardrails +Every PR must pass lint, format, and typecheck gates before merge. No exceptions, no `--no-verify` bypasses. Tests may be temporarily skipped with documented justification; guardrails may not. + +### Rule 2: Warnings Are Technical Debt — Promote or Suppress Explicitly +Every oxlint warning should be either promoted to error (if it represents a real defect class) or explicitly suppressed with an inline `// oxlint-ignore` comment that explains *why*. The warn level should be used only during evaluation of new rules, not as a permanent state. + +### Rule 3: Local Guardrails Match CI Guardrails +Pre-commit hooks must run the same checks as CI. A developer should never be surprised by a CI guardrail failure. The feedback loop must be local and fast. + +### Rule 4: New Lint Rules Require Migration, Not Grandfathering +When a new lint rule is enabled, fix all existing violations in a dedicated PR before enabling the rule as an error. Do not use baseline files or violation counts to grandfather existing code. The guardrail must protect the *entire* codebase uniformly. + +### Rule 5: Guardrails Before Tests in CI Pipeline Order +The current pipeline ordering (lint → format → typecheck → test) is correct and must be preserved. Cheap checks gate expensive checks. If a future CI restructuring is proposed, this ordering constraint must be maintained. + +### Rule 6: Security Rules Are Non-Negotiable Errors +Any rule in the `security` category must be set to error severity. Security-related lint suppressions require code review approval from a maintainer. + +--- + +## Acknowledged Limitations + +This position does not claim that guardrails are *sufficient* for software quality. They are necessary but not sufficient. Specifically: + +1. **Business logic correctness** requires tests. No guardrail can verify that the WebSocket proxy correctly routes messages to the right container, or that R2 backup restoration preserves data integrity. These are behavioral properties that require behavioral verification. + +2. **Runtime environment interactions** require integration tests. The Worker's interaction with Cloudflare's container sandbox, R2, Access authentication, and AI Gateway cannot be statically analyzed. + +3. **User experience** requires manual testing and observation. The admin UI's usability, the gateway's responsiveness, and the onboarding flow's clarity are human-judgment questions. + +4. **Concurrency and timing** issues largely escape static analysis in JavaScript's event-loop model. Race conditions in WebSocket message ordering or container lifecycle management require targeted tests. + +What guardrails *do* provide is a floor — a guaranteed minimum quality level that holds across every commit, every file, every contributor, without anyone needing to remember to check. Tests raise the ceiling; guardrails raise the floor. Both matter. But the floor is more important, because when the floor drops, everything built on top collapses. + +--- + +## Summary + +The cost structure of software quality is asymmetric: preventing defects is cheaper than detecting them, and detecting them is cheaper than debugging them. Guardrails operate at the prevention layer. Tests operate at the detection layer. Investing in the cheapest, broadest layer first — and this project already does — is not just pragmatic; it is the mathematically optimal quality strategy. The recommendation is to deepen the existing investment: promote warnings to errors, add pre-commit hooks for local enforcement, enable security rules, and maintain the principle that no code merges without green guardrails. diff --git a/docs/plans/2026-02-15-poeclaw-design.md b/docs/plans/2026-02-15-poeclaw-design.md new file mode 100644 index 000000000..4bbea349e --- /dev/null +++ b/docs/plans/2026-02-15-poeclaw-design.md @@ -0,0 +1,244 @@ +# PoeClaw Design Document + +> Multi-tenant OpenClaw platform powered by Poe API keys on Cloudflare Workers + +## 1. Overview + +PoeClaw transforms the single-tenant moltworker/OpenClaw project into a multi-tenant platform where users authenticate with their Poe API key and get their own sandboxed AI agent instance. + +**User flow:** +1. Visit landing page +2. Paste POE_API_KEY +3. Key is validated against `api.poe.com/v1/models` +4. Container spins up with their key as the LLM provider +5. Chat via a Poe-style dark-themed UI + +## 2. Architecture + +``` +User Browser + | ++-------------------------------------------+ +| Cloudflare Worker (Hono) | +| | +| GET / -> Landing/Login Page | +| POST /api/auth/login -> Validate key | +| | | +| v | +| Session Middleware | +| - Verify session cookie | +| - Resolve user's Sandbox DO | +| - Decrypt POE_API_KEY from DO storage | +| | | +| v | +| Per-User Sandbox (Durable Object) | +| - Container with OpenClaw | +| - Poe provider config patched in | +| - Proxy HTTP/SSE to container | ++-------------------------------------------+ +``` + +**Key architectural decisions:** +- One Sandbox Durable Object per user, keyed by stable user identifier (hashed) +- Poe API accessed via OpenAI-compatible endpoint (`api.poe.com/v1/chat/completions`) +- Custom Poe provider config patched into OpenClaw at container boot +- HTTP API + SSE for chat (not WebSocket) to avoid protocol complexity +- Session cookies with HMAC-SHA256 signing + +## 3. Design Decisions (Informed by Adversarial Debate) + +Five parallel research agents investigated risks. Key findings and mitigations: + +### 3.1 Multi-Tenancy (H1: Conditionally Feasible) + +**How it works:** `getSandbox(env.Sandbox, userHash, options)` — each unique hash creates a separate Durable Object with its own container. + +**Constraints:** +- `max_instances` must be raised from 1 (current) to expected concurrent users +- Account-level hard cap: ~400 GiB memory = 100 concurrent `standard-1` or 400 `basic` containers +- `keepAlive: true` is catastrophic for multi-tenant — must use `sleepAfter` + +**Decision:** Use `basic` instance type (1 GiB RAM) with `sleepAfter: "1h"`. Target 10-50 concurrent users for v1. + +### 3.2 Poe API Compatibility (H2: Sharp Edges) + +**What works:** Basic chat completions via `api.poe.com/v1/chat/completions` with bearer token auth. + +**What breaks:** +- Model names: Poe uses `Claude-Sonnet-4.5`, `GPT-5.2` — not standard OpenAI IDs +- Tool calling + streaming: Known Poe bug causes silent failures +- `response_format`, `strict` mode: Silently ignored +- 500 RPM rate limit could be hit during heavy agentic use + +**Decision:** Create a custom `poe` provider entry in OpenClaw config (same pattern as KimiClaw). Start with non-streaming for reliability, add streaming as a follow-up. Hardcode 2-3 model names initially. + +### 3.3 Authentication Security (H3: Approved with Mitigations) + +**Model:** Paste API key -> validate -> HMAC session cookie -> encrypted key in DO storage. + +**Required mitigations:** +1. Rate limit login: 10 attempts/IP/minute +2. Cookie: `HttpOnly; Secure; SameSite=Lax; Max-Age=86400` +3. Separate secrets: one for HMAC signing, one for key encryption +4. Never display full API key in UI (show `***...last4` only) +5. Timing-safe comparison for session validation +6. Input validation on key format before sending to Poe + +**Key rotation risk:** If user rotates their Poe key, the old key's hash (DO ID) orphans their data. Mitigation: check if Poe's `/v1/models` response includes a stable user ID — use that for DO ID instead of key hash. + +**Decision:** Store encrypted key in DO storage (not KV) — colocated, strongly consistent, no extra cost. Use DO storage for session metadata too. + +### 3.4 Cost Model (H4: Viable) + +| Instance | Per-user/mo (30min/day, 1h sleep) | 100 users/mo | +|----------|-----------------------------------|--------------| +| standard-1 (4 GiB) | ~$3.24 | ~$329 | +| basic (1 GiB) | ~$0.92 | ~$97 | + +**Cold start: 1-2 minutes** (heavy Dockerfile: image pull + R2 restore + OpenClaw onboard + config patch + gateway start). This is the #1 UX risk. + +**Decision:** Use `basic` instances. Accept cold starts with a good loading UI. Explore Dockerfile optimization to reduce boot time as a follow-up. + +### 3.5 Chat UI Architecture (H5: Feasible via HTTP API) + +**Decision:** Use OpenClaw's HTTP API (`/v1/chat/completions` + SSE) instead of the WebSocket JSON-RPC protocol. + +**Why:** The WebSocket protocol requires a complex challenge-response handshake, device identity, and custom event parsing. The HTTP API is standard OpenAI-compatible — just `fetch` + SSE. + +**Requirements:** +- Enable `gateway.http.endpoints.chatCompletions.enabled: true` in config patch +- Model switching via `model` field or `x-openclaw-agent-id` header +- Add Worker endpoints: `GET /api/sessions` (proxy to CLI), `GET /api/models` +- Handle cold starts: check `/api/status` before attempting chat + +## 4. File Changes + +### Modified Files + +| File | Change | +|------|--------| +| `src/index.ts` | Per-user sandbox resolution, session middleware, remove CF Access requirement, add auth routes | +| `src/types.ts` | Add `POE_API_KEY`, session types | +| `src/gateway/env.ts` | Add `POE_API_KEY` env var passthrough to container | +| `src/gateway/process.ts` | Per-user env var injection at container startup | +| `src/gateway/sync.ts` | Namespace R2 paths: `users/{userHash}/` | +| `src/gateway/r2.ts` | Same R2 namespacing | +| `start-openclaw.sh` | Detect `POE_API_KEY`, create Poe provider config, enable HTTP API, accept `OPENAI_BASE_URL` | +| `wrangler.jsonc` | Rename to `poeclaw`, instance_type `basic`, raise `max_instances`, add rate limiting | +| `package.json` | Rename to `poeclaw` | + +### New Files + +| File | Purpose | +|------|---------| +| `src/auth/session.ts` | Session creation, cookie management, key encryption, HMAC signing | +| `src/auth/poe.ts` | POE_API_KEY validation via `/v1/models`, model list extraction | +| `src/routes/auth.ts` | `POST /api/auth/login`, `POST /api/auth/logout` | +| `src/client/pages/LoginPage.tsx` | Dark-themed landing page with key input | +| `src/client/pages/LoginPage.css` | Landing page styles | +| `src/client/pages/ChatPage.tsx` | Poe-style chat: sidebar + messages + input | +| `src/client/pages/ChatPage.css` | Dark theme chat styles | +| `src/client/hooks/useChat.ts` | SSE streaming hook for chat completions | +| `src/client/hooks/useGatewayStatus.ts` | Poll `/api/status` for cold start handling | + +### Removed/Replaced + +| File | Reason | +|------|--------| +| `src/auth/jwt.ts` | No CF Access — replaced by session auth | +| `src/auth/middleware.ts` | Replaced by session middleware | +| `src/client/pages/AdminPage.tsx` | Replaced by ChatPage | +| `src/routes/admin-ui.ts` | Replaced by chat UI serving | + +## 5. Implementation Phases + +### Phase 1: Auth & Multi-Tenant Core +*Foundation — must be first. Estimated: largest phase.* + +- `src/auth/poe.ts` — validate POE_API_KEY against `api.poe.com/v1/models` +- `src/auth/session.ts` — HMAC session tokens, cookie management, key encryption in DO storage +- `src/routes/auth.ts` — login/logout endpoints with rate limiting +- `src/index.ts` — per-user Sandbox DO resolution (`getSandbox(env.Sandbox, userHash, options)`), session middleware +- `src/types.ts` — new types for `POE_API_KEY`, sessions, Poe models +- `wrangler.jsonc` — rename to `poeclaw`, `instance_type: "basic"`, `max_instances: 50`, `sleepAfter: "1h"` +- Minimal unstyled login page to test end-to-end + +**Testable outcome:** User pastes key, gets session cookie, Worker resolves per-user Sandbox DO. + +### Phase 2: Poe Provider Integration +*Make containers work with Poe API. Estimated: medium.* + +- `src/gateway/env.ts` — add `POE_API_KEY` to container env vars +- `start-openclaw.sh` — detect `POE_API_KEY`, create custom Poe provider config in `openclaw.json`: + ```json + { + "poe": { + "baseUrl": "https://api.poe.com/v1", + "apiKey": "${POE_API_KEY}", + "api": "openai-completions", + "models": [ + { "id": "Claude-Sonnet-4.5", "name": "Claude Sonnet 4.5" }, + { "id": "GPT-5.2", "name": "GPT 5.2" }, + { "id": "Gemini-3-Pro", "name": "Gemini 3 Pro" } + ] + } + } + ``` +- Enable HTTP chat completions endpoint in config patch +- Set `OPENCLAW_DEV_MODE=true` to skip device pairing (Worker handles auth) +- Auto-generate per-user gateway token from `userHash` + +**Testable outcome:** Login -> container starts -> can chat via OpenClaw's built-in Control UI through the Worker proxy. + +### Phase 3: Per-User R2 Persistence +*Data isolation. Estimated: small.* + +- `src/gateway/sync.ts` — namespace R2 paths with `users/{userHash}/` +- `src/gateway/r2.ts` — pass user prefix to rclone config +- `start-openclaw.sh` — restore/sync from user-prefixed R2 paths + +**Testable outcome:** User's conversations persist across container restarts. Different users have isolated data. + +### Phase 4: Poe-Style Chat Frontend +*The UI. Estimated: large.* + +- `src/client/pages/LoginPage.tsx` — dark-themed landing with key input, link to `poe.com/api_key` +- `src/client/pages/ChatPage.tsx` — left sidebar (models, conversations), main chat area, input bar +- `src/client/hooks/useChat.ts` — `fetch` to `/v1/chat/completions` with SSE streaming +- `src/client/hooks/useGatewayStatus.ts` — poll `/api/status`, show loading during cold start +- `src/client/App.tsx` — router between login/chat based on session state +- Model selector populated from cached `/v1/models` response (stored during login) +- Markdown rendering for code blocks, lists, etc. +- Remove old AdminPage + +**Testable outcome:** Full Poe-like chat experience with model switching and conversation history. + +### Phase 5: Polish & Harden +*Production readiness. Estimated: medium.* + +- Loading page redesign with PoeClaw branding +- Error pages (invalid key, expired session, container errors, Poe credit exhaustion) +- CSP headers for XSS protection +- Model icons/metadata in sidebar +- README and deployment docs +- Dockerfile optimization to reduce cold start time +- Investigate streaming + tool calling workaround +- Session expiry and renewal flow + +## 6. Open Questions + +1. **Does `api.poe.com/v1/models` return a stable user/account ID?** If yes, use it for DO ID instead of key hash (solves key rotation problem). +2. **Can OpenClaw run in 1 GiB RAM?** Need to validate `basic` instance type works. Fallback: `standard-1` at higher cost. +3. **Streaming + tool calling on Poe:** Is this fixed? If not, non-streaming MVP is the safe path. +4. **Poe model name discovery:** Should we hardcode models or dynamically populate from the `/v1/models` response at login? + +## 7. Risk Matrix + +| Risk | Severity | Likelihood | Mitigation | +|------|----------|------------|------------| +| Cold start UX (1-2 min) | HIGH | CERTAIN | Good loading UI, Dockerfile optimization | +| Tool calling + streaming breaks | HIGH | HIGH | Non-streaming MVP, follow up with Poe | +| OpenClaw won't run on 1 GiB | MEDIUM | MEDIUM | Fall back to standard-1 | +| Key rotation orphans data | MEDIUM | LOW | Use stable Poe user ID for DO ID | +| 100-user account cap | LOW | LOW | basic instances push to 400; sufficient for v1 | +| Poe API rate limit (500 RPM) | LOW | LOW | Backoff logic, mostly chat-only use | diff --git a/docs/plans/2026-02-15-poeclaw-implementation.md b/docs/plans/2026-02-15-poeclaw-implementation.md new file mode 100644 index 000000000..909d43115 --- /dev/null +++ b/docs/plans/2026-02-15-poeclaw-implementation.md @@ -0,0 +1,2800 @@ +# PoeClaw Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use executing-plans to implement this plan task-by-task. + +**Goal:** Transform the single-tenant moltworker/OpenClaw sandbox into a multi-tenant platform where users authenticate with their Poe API key and get their own sandboxed AI agent instance. + +**Architecture:** Each user pastes their POE_API_KEY, which is validated against `api.poe.com/v1/models`. A session cookie (HMAC-SHA256 signed) is issued. The Worker resolves a per-user Durable Object sandbox via `getSandbox(env.Sandbox, userHash)`. The container boots with the user's Poe key mapped as a custom OpenAI-compatible provider. Chat happens via HTTP API + SSE (not WebSocket). + +**Tech Stack:** Cloudflare Workers (Hono), Durable Objects + Sandbox containers, React 19, Vite, Vitest, Poe OpenAI-compatible API + +**Design doc:** `docs/plans/2026-02-15-poeclaw-design.md` + +--- + +## Phase 1: Auth & Multi-Tenant Core + +### Task 1: Rename project to PoeClaw + +**Files:** +- Modify: `package.json:2` (name field) +- Modify: `wrangler.jsonc:3` (name field) + +**Step 1: Update package.json name** + +In `package.json`, change: +```json +"name": "moltbot-sandbox", +``` +to: +```json +"name": "poeclaw", +``` + +**Step 2: Update wrangler.jsonc name** + +In `wrangler.jsonc`, change: +```jsonc +"name": "moltbot-sandbox", +``` +to: +```jsonc +"name": "poeclaw", +``` + +**Step 3: Commit** + +```bash +git add package.json wrangler.jsonc +git commit -m "chore: rename project from moltbot-sandbox to poeclaw" +``` + +--- + +### Task 2: Update wrangler.jsonc for multi-tenancy + +**Files:** +- Modify: `wrangler.jsonc:36-43` (containers block) + +**Step 1: Change container config** + +In `wrangler.jsonc`, replace the containers block: +```jsonc + "containers": [ + { + "class_name": "Sandbox", + "image": "./Dockerfile", + "instance_type": "standard-1", + "max_instances": 1, + }, + ], +``` +with: +```jsonc + "containers": [ + { + "class_name": "Sandbox", + "image": "./Dockerfile", + "instance_type": "basic", + "max_instances": 50, + }, + ], +``` + +**Step 2: Verify config is valid JSON** + +Run: `node -e "const fs = require('fs'); JSON.parse(fs.readFileSync('wrangler.jsonc','utf8').replace(/\/\/.*/g,'').replace(/,(\s*[}\]])/g,'$1'));" && echo "valid"` + +Expected: `valid` (or use `npx wrangler deploy --dry-run` if available) + +**Step 3: Commit** + +```bash +git add wrangler.jsonc +git commit -m "feat: configure multi-tenant containers (basic, max 50 instances)" +``` + +--- + +### Task 3: Add PoeClaw types + +**Files:** +- Modify: `src/types.ts` + +**Step 1: Add new types** + +In `src/types.ts`, add `POE_API_KEY` to `MoltbotEnv`, add session types, and add `SESSION_SECRET` and `ENCRYPTION_SECRET`: + +Replace the entire `MoltbotEnv` interface — add these new fields alongside the existing ones: + +After line 44 (`WORKER_URL?: string;`), add: +```typescript + // PoeClaw session auth + SESSION_SECRET?: string; // HMAC-SHA256 key for session cookies + ENCRYPTION_SECRET?: string; // AES-GCM key for encrypting stored API keys +``` + +Add after the `AccessUser` interface (after line 53): + +```typescript +/** + * Poe session user (from session cookie) + */ +export interface PoeSessionUser { + userHash: string; // SHA-256 hash used as DO ID + keyLast4: string; // Last 4 chars of API key for display + models: PoeModel[]; // Available models from /v1/models + createdAt: number; // Session creation timestamp (epoch ms) +} + +/** + * Poe model from /v1/models response + */ +export interface PoeModel { + id: string; // e.g., "Claude-Sonnet-4.5" + name: string; // Display name +} +``` + +Update `AppEnv.Variables` to add `poeUser`: +```typescript +export type AppEnv = { + Bindings: MoltbotEnv; + Variables: { + sandbox: Sandbox; + accessUser?: AccessUser; + poeUser?: PoeSessionUser; + }; +}; +``` + +**Step 2: Verify types compile** + +Run: `cd /Volumes/dev/poeclaw && npx tsc --noEmit` +Expected: No errors (or only pre-existing errors) + +**Step 3: Commit** + +```bash +git add src/types.ts +git commit -m "feat: add PoeClaw session and Poe model types" +``` + +--- + +### Task 4: Create Poe API key validation module + +**Files:** +- Create: `src/auth/poe.ts` +- Create: `src/auth/poe.test.ts` + +**Step 1: Write the failing test** + +Create `src/auth/poe.test.ts`: +```typescript +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { validatePoeApiKey } from './poe'; + +describe('validatePoeApiKey', () => { + beforeEach(() => { + vi.restoreAllMocks(); + }); + + it('rejects empty key', async () => { + const result = await validatePoeApiKey(''); + expect(result.valid).toBe(false); + expect(result.error).toMatch(/empty/i); + }); + + it('rejects key without proper prefix', async () => { + const result = await validatePoeApiKey('not-a-poe-key'); + expect(result.valid).toBe(false); + expect(result.error).toMatch(/format/i); + }); + + it('returns models on successful validation', async () => { + const mockResponse = { + object: 'list', + data: [ + { id: 'Claude-Sonnet-4.5', object: 'model' }, + { id: 'GPT-5.2', object: 'model' }, + ], + }; + + vi.stubGlobal( + 'fetch', + vi.fn().mockResolvedValue({ + ok: true, + json: () => Promise.resolve(mockResponse), + }), + ); + + const result = await validatePoeApiKey('pb-test-key-12345'); + expect(result.valid).toBe(true); + expect(result.models).toHaveLength(2); + expect(result.models![0].id).toBe('Claude-Sonnet-4.5'); + }); + + it('returns invalid on 401 response', async () => { + vi.stubGlobal( + 'fetch', + vi.fn().mockResolvedValue({ + ok: false, + status: 401, + statusText: 'Unauthorized', + }), + ); + + const result = await validatePoeApiKey('pb-invalid-key-999'); + expect(result.valid).toBe(false); + expect(result.error).toMatch(/invalid|unauthorized/i); + }); + + it('handles network errors gracefully', async () => { + vi.stubGlobal( + 'fetch', + vi.fn().mockRejectedValue(new Error('Network error')), + ); + + const result = await validatePoeApiKey('pb-test-key-12345'); + expect(result.valid).toBe(false); + expect(result.error).toMatch(/network|failed/i); + }); +}); +``` + +**Step 2: Run tests to verify they fail** + +Run: `cd /Volumes/dev/poeclaw && npx vitest run src/auth/poe.test.ts` +Expected: FAIL — `validatePoeApiKey` not found + +**Step 3: Implement the module** + +Create `src/auth/poe.ts`: +```typescript +import type { PoeModel } from '../types'; + +export interface PoeValidationResult { + valid: boolean; + models?: PoeModel[]; + error?: string; +} + +const POE_MODELS_URL = 'https://api.poe.com/v1/models'; + +/** + * Validate a Poe API key by calling /v1/models. + * Returns the list of available models on success. + */ +export async function validatePoeApiKey(apiKey: string): Promise { + if (!apiKey || apiKey.trim() === '') { + return { valid: false, error: 'API key is empty' }; + } + + // Poe keys typically start with "pb-" but we'll be lenient + // and just check it's a reasonable string + if (apiKey.length < 10 || /\s/.test(apiKey)) { + return { valid: false, error: 'Invalid key format' }; + } + + try { + const response = await fetch(POE_MODELS_URL, { + headers: { + Authorization: `Bearer ${apiKey}`, + }, + }); + + if (!response.ok) { + if (response.status === 401 || response.status === 403) { + return { valid: false, error: 'Invalid or unauthorized API key' }; + } + return { valid: false, error: `Poe API returned ${response.status} ${response.statusText}` }; + } + + const data = (await response.json()) as { data?: Array<{ id: string; object?: string }> }; + const models: PoeModel[] = (data.data || []).map((m) => ({ + id: m.id, + name: m.id, // Poe uses the ID as the display name + })); + + return { valid: true, models }; + } catch (err) { + return { + valid: false, + error: `Failed to reach Poe API: ${err instanceof Error ? err.message : 'Unknown error'}`, + }; + } +} +``` + +**Step 4: Run tests to verify they pass** + +Run: `cd /Volumes/dev/poeclaw && npx vitest run src/auth/poe.test.ts` +Expected: All 5 tests PASS + +**Step 5: Commit** + +```bash +git add src/auth/poe.ts src/auth/poe.test.ts +git commit -m "feat: add Poe API key validation module" +``` + +--- + +### Task 5: Create session management module + +**Files:** +- Create: `src/auth/session.ts` +- Create: `src/auth/session.test.ts` + +**Step 1: Write the failing tests** + +Create `src/auth/session.test.ts`: +```typescript +import { describe, it, expect } from 'vitest'; +import { + createSessionToken, + verifySessionToken, + hashApiKey, + encryptApiKey, + decryptApiKey, +} from './session'; + +const TEST_SESSION_SECRET = 'test-session-secret-32-chars-ok!'; +const TEST_ENCRYPTION_SECRET = 'test-encrypt-secret-32-chars-ok!'; + +describe('hashApiKey', () => { + it('produces a consistent hex hash', async () => { + const hash1 = await hashApiKey('pb-test-key-12345'); + const hash2 = await hashApiKey('pb-test-key-12345'); + expect(hash1).toBe(hash2); + expect(hash1).toMatch(/^[a-f0-9]{64}$/); // SHA-256 hex + }); + + it('produces different hashes for different keys', async () => { + const hash1 = await hashApiKey('pb-key-aaa'); + const hash2 = await hashApiKey('pb-key-bbb'); + expect(hash1).not.toBe(hash2); + }); +}); + +describe('createSessionToken / verifySessionToken', () => { + it('creates and verifies a valid session token', async () => { + const payload = { userHash: 'abc123', keyLast4: '5678', models: [], createdAt: Date.now() }; + const token = await createSessionToken(payload, TEST_SESSION_SECRET); + expect(typeof token).toBe('string'); + expect(token.length).toBeGreaterThan(0); + + const verified = await verifySessionToken(token, TEST_SESSION_SECRET); + expect(verified).not.toBeNull(); + expect(verified!.userHash).toBe('abc123'); + expect(verified!.keyLast4).toBe('5678'); + }); + + it('rejects a tampered token', async () => { + const payload = { userHash: 'abc123', keyLast4: '5678', models: [], createdAt: Date.now() }; + const token = await createSessionToken(payload, TEST_SESSION_SECRET); + const tampered = token.slice(0, -4) + 'xxxx'; + + const verified = await verifySessionToken(tampered, TEST_SESSION_SECRET); + expect(verified).toBeNull(); + }); + + it('rejects token signed with wrong secret', async () => { + const payload = { userHash: 'abc123', keyLast4: '5678', models: [], createdAt: Date.now() }; + const token = await createSessionToken(payload, TEST_SESSION_SECRET); + + const verified = await verifySessionToken(token, 'wrong-secret-wrong-secret-12345'); + expect(verified).toBeNull(); + }); + + it('rejects expired token (older than 24h)', async () => { + const payload = { + userHash: 'abc123', + keyLast4: '5678', + models: [], + createdAt: Date.now() - 25 * 60 * 60 * 1000, // 25 hours ago + }; + const token = await createSessionToken(payload, TEST_SESSION_SECRET); + + const verified = await verifySessionToken(token, TEST_SESSION_SECRET); + expect(verified).toBeNull(); + }); +}); + +describe('encryptApiKey / decryptApiKey', () => { + it('encrypts and decrypts a key round-trip', async () => { + const original = 'pb-test-key-12345-secret'; + const encrypted = await encryptApiKey(original, TEST_ENCRYPTION_SECRET); + expect(encrypted).not.toBe(original); + + const decrypted = await decryptApiKey(encrypted, TEST_ENCRYPTION_SECRET); + expect(decrypted).toBe(original); + }); + + it('produces different ciphertext each time (random IV)', async () => { + const original = 'pb-test-key-12345-secret'; + const encrypted1 = await encryptApiKey(original, TEST_ENCRYPTION_SECRET); + const encrypted2 = await encryptApiKey(original, TEST_ENCRYPTION_SECRET); + expect(encrypted1).not.toBe(encrypted2); + }); + + it('fails to decrypt with wrong secret', async () => { + const encrypted = await encryptApiKey('pb-test-key', TEST_ENCRYPTION_SECRET); + await expect(decryptApiKey(encrypted, 'wrong-secret-wrong-secret-12345')).rejects.toThrow(); + }); +}); +``` + +**Step 2: Run tests to verify they fail** + +Run: `cd /Volumes/dev/poeclaw && npx vitest run src/auth/session.test.ts` +Expected: FAIL — modules not found + +**Step 3: Implement session module** + +Create `src/auth/session.ts`: +```typescript +import type { PoeSessionUser } from '../types'; + +const SESSION_MAX_AGE_MS = 24 * 60 * 60 * 1000; // 24 hours + +/** + * SHA-256 hash of an API key, used as stable user/DO identifier. + */ +export async function hashApiKey(apiKey: string): Promise { + const data = new TextEncoder().encode(apiKey); + const hashBuffer = await crypto.subtle.digest('SHA-256', data); + return Array.from(new Uint8Array(hashBuffer)) + .map((b) => b.toString(16).padStart(2, '0')) + .join(''); +} + +/** + * Derive a CryptoKey from a string secret for HMAC-SHA256. + */ +async function deriveHmacKey(secret: string): Promise { + const keyData = new TextEncoder().encode(secret); + return crypto.subtle.importKey('raw', keyData, { name: 'HMAC', hash: 'SHA-256' }, false, [ + 'sign', + 'verify', + ]); +} + +/** + * Create a signed session token (base64url of payload + HMAC signature). + */ +export async function createSessionToken( + payload: PoeSessionUser, + secret: string, +): Promise { + const payloadStr = JSON.stringify(payload); + const payloadB64 = btoa(payloadStr); + const key = await deriveHmacKey(secret); + const sig = await crypto.subtle.sign('HMAC', key, new TextEncoder().encode(payloadB64)); + const sigB64 = btoa(String.fromCharCode(...new Uint8Array(sig))); + return `${payloadB64}.${sigB64}`; +} + +/** + * Verify and decode a session token. Returns null if invalid or expired. + */ +export async function verifySessionToken( + token: string, + secret: string, +): Promise { + try { + const [payloadB64, sigB64] = token.split('.'); + if (!payloadB64 || !sigB64) return null; + + const key = await deriveHmacKey(secret); + const sig = Uint8Array.from(atob(sigB64), (c) => c.charCodeAt(0)); + const valid = await crypto.subtle.verify( + 'HMAC', + key, + sig, + new TextEncoder().encode(payloadB64), + ); + if (!valid) return null; + + const payload: PoeSessionUser = JSON.parse(atob(payloadB64)); + + // Check expiry + if (Date.now() - payload.createdAt > SESSION_MAX_AGE_MS) { + return null; + } + + return payload; + } catch { + return null; + } +} + +/** + * Encrypt an API key with AES-GCM for storage in DO. + * Returns base64 string of IV (12 bytes) + ciphertext. + */ +export async function encryptApiKey(apiKey: string, secret: string): Promise { + const keyData = new TextEncoder().encode(secret.padEnd(32, '0').slice(0, 32)); + const cryptoKey = await crypto.subtle.importKey('raw', keyData, 'AES-GCM', false, ['encrypt']); + + const iv = crypto.getRandomValues(new Uint8Array(12)); + const ciphertext = await crypto.subtle.encrypt( + { name: 'AES-GCM', iv }, + cryptoKey, + new TextEncoder().encode(apiKey), + ); + + // Concatenate IV + ciphertext + const combined = new Uint8Array(iv.length + new Uint8Array(ciphertext).length); + combined.set(iv); + combined.set(new Uint8Array(ciphertext), iv.length); + return btoa(String.fromCharCode(...combined)); +} + +/** + * Decrypt an API key from AES-GCM encrypted base64 string. + */ +export async function decryptApiKey(encrypted: string, secret: string): Promise { + const keyData = new TextEncoder().encode(secret.padEnd(32, '0').slice(0, 32)); + const cryptoKey = await crypto.subtle.importKey('raw', keyData, 'AES-GCM', false, ['decrypt']); + + const combined = Uint8Array.from(atob(encrypted), (c) => c.charCodeAt(0)); + const iv = combined.slice(0, 12); + const ciphertext = combined.slice(12); + + const plaintext = await crypto.subtle.decrypt({ name: 'AES-GCM', iv }, cryptoKey, ciphertext); + return new TextDecoder().decode(plaintext); +} + +/** + * Build the Set-Cookie header value for a session cookie. + */ +export function buildSessionCookie(token: string): string { + return `poeclaw_session=${token}; HttpOnly; Secure; SameSite=Lax; Max-Age=86400; Path=/`; +} + +/** + * Build a Set-Cookie header that clears the session cookie. + */ +export function clearSessionCookie(): string { + return 'poeclaw_session=; HttpOnly; Secure; SameSite=Lax; Max-Age=0; Path=/'; +} + +/** + * Extract the session token from a Cookie header. + */ +export function extractSessionToken(cookieHeader: string | undefined): string | null { + if (!cookieHeader) return null; + const match = cookieHeader.match(/poeclaw_session=([^;]+)/); + return match ? match[1] : null; +} +``` + +**Step 4: Run tests to verify they pass** + +Run: `cd /Volumes/dev/poeclaw && npx vitest run src/auth/session.test.ts` +Expected: All tests PASS + +**Step 5: Commit** + +```bash +git add src/auth/session.ts src/auth/session.test.ts +git commit -m "feat: add HMAC session tokens and AES-GCM key encryption" +``` + +--- + +### Task 6: Create auth routes (login/logout) + +**Files:** +- Create: `src/routes/auth.ts` +- Modify: `src/routes/index.ts` (add export) + +**Step 1: Create auth routes** + +Create `src/routes/auth.ts`: +```typescript +import { Hono } from 'hono'; +import type { AppEnv } from '../types'; +import { validatePoeApiKey } from '../auth/poe'; +import { + hashApiKey, + createSessionToken, + encryptApiKey, + buildSessionCookie, + clearSessionCookie, +} from '../auth/session'; + +const auth = new Hono(); + +// POST /api/auth/login - Validate Poe API key and create session +auth.post('/login', async (c) => { + const body = await c.req.json<{ apiKey?: string }>().catch(() => ({})); + const apiKey = body.apiKey?.trim(); + + if (!apiKey) { + return c.json({ error: 'API key is required' }, 400); + } + + // Validate key against Poe API + const validation = await validatePoeApiKey(apiKey); + if (!validation.valid) { + return c.json({ error: validation.error || 'Invalid API key' }, 401); + } + + const sessionSecret = c.env.SESSION_SECRET; + const encryptionSecret = c.env.ENCRYPTION_SECRET; + if (!sessionSecret || !encryptionSecret) { + console.error('[AUTH] Missing SESSION_SECRET or ENCRYPTION_SECRET'); + return c.json({ error: 'Server configuration error' }, 500); + } + + // Create user identity + const userHash = await hashApiKey(apiKey); + const keyLast4 = apiKey.slice(-4); + + // Create session token + const sessionPayload = { + userHash, + keyLast4, + models: validation.models || [], + createdAt: Date.now(), + }; + const token = await createSessionToken(sessionPayload, sessionSecret); + + // Encrypt API key for DO storage + const encryptedKey = await encryptApiKey(apiKey, encryptionSecret); + + // Store encrypted key in the user's sandbox DO storage + const sandbox = c.get('sandbox'); + await sandbox.exec( + `mkdir -p /tmp/poeclaw && echo '${encryptedKey}' > /tmp/poeclaw/encrypted-key`, + ); + + // Set session cookie + return c.json( + { + ok: true, + userHash, + keyLast4, + models: validation.models, + }, + 200, + { + 'Set-Cookie': buildSessionCookie(token), + }, + ); +}); + +// POST /api/auth/logout - Clear session +auth.post('/logout', (c) => { + return c.json( + { ok: true }, + 200, + { + 'Set-Cookie': clearSessionCookie(), + }, + ); +}); + +// GET /api/auth/me - Return current session info (if valid) +auth.get('/me', (c) => { + const poeUser = c.get('poeUser'); + if (!poeUser) { + return c.json({ authenticated: false }, 401); + } + return c.json({ + authenticated: true, + userHash: poeUser.userHash, + keyLast4: poeUser.keyLast4, + models: poeUser.models, + }); +}); + +export { auth }; +``` + +**Step 2: Add auth export to routes/index.ts** + +In `src/routes/index.ts`, add: +```typescript +export { auth } from './auth'; +``` + +**Step 3: Verify types compile** + +Run: `cd /Volumes/dev/poeclaw && npx tsc --noEmit` +Expected: No new errors + +**Step 4: Commit** + +```bash +git add src/routes/auth.ts src/routes/index.ts +git commit -m "feat: add login/logout/me auth routes" +``` + +--- + +### Task 7: Update auth/index.ts exports + +**Files:** +- Modify: `src/auth/index.ts` + +**Step 1: Add new exports** + +Replace `src/auth/index.ts` with: +```typescript +export { verifyAccessJWT } from './jwt'; +export { createAccessMiddleware, isDevMode, extractJWT } from './middleware'; +export { validatePoeApiKey } from './poe'; +export { + hashApiKey, + createSessionToken, + verifySessionToken, + encryptApiKey, + decryptApiKey, + buildSessionCookie, + clearSessionCookie, + extractSessionToken, +} from './session'; +``` + +**Step 2: Commit** + +```bash +git add src/auth/index.ts +git commit -m "feat: export poe and session modules from auth index" +``` + +--- + +### Task 8: Rewrite src/index.ts for per-user sandbox resolution + +**Files:** +- Modify: `src/index.ts` + +This is the largest single change. The key modifications: +1. Replace single-tenant `getSandbox(env.Sandbox, 'moltbot', options)` with per-user resolution +2. Replace CF Access middleware with session middleware +3. Remove `validateRequiredEnv` checks for CF Access and AI provider keys +4. Add auth routes +5. Login route resolves sandbox *after* auth (needs userHash) +6. Keep the catch-all proxy logic largely intact + +**Step 1: Rewrite index.ts** + +Replace `src/index.ts` entirely. The key changes are annotated inline: + +```typescript +/** + * PoeClaw - Multi-tenant OpenClaw platform powered by Poe API keys + * + * User flow: + * 1. Visit landing page + * 2. Paste POE_API_KEY + * 3. Key validated against Poe API + * 4. Per-user sandbox resolves via getSandbox(env.Sandbox, userHash) + * 5. Chat via Poe-style UI using HTTP API + SSE + */ + +import { Hono } from 'hono'; +import { getSandbox, Sandbox, type SandboxOptions } from '@cloudflare/sandbox'; + +import type { AppEnv, MoltbotEnv, PoeSessionUser } from './types'; +import { MOLTBOT_PORT } from './config'; +import { verifySessionToken, extractSessionToken, decryptApiKey, hashApiKey } from './auth/session'; +import { ensureMoltbotGateway, findExistingMoltbotProcess } from './gateway'; +import { publicRoutes, api, debug, cdp, auth } from './routes'; +import { redactSensitiveParams } from './utils/logging'; +import loadingPageHtml from './assets/loading.html'; + +export { Sandbox }; + +/** + * Build sandbox options for multi-tenant PoeClaw. + * Always uses sleepAfter (never keepAlive) to bound memory usage. + */ +function buildSandboxOptions(env: MoltbotEnv): SandboxOptions { + const sleepAfter = env.SANDBOX_SLEEP_AFTER?.toLowerCase() || '1h'; + if (sleepAfter === 'never') { + return { keepAlive: true }; + } + return { sleepAfter }; +} + +// Main app +const app = new Hono(); + +// ============================================================================= +// MIDDLEWARE: Logging +// ============================================================================= + +app.use('*', async (c, next) => { + const url = new URL(c.req.url); + const redactedSearch = redactSensitiveParams(url); + console.log(`[REQ] ${c.req.method} ${url.pathname}${redactedSearch}`); + await next(); +}); + +// ============================================================================= +// PUBLIC ROUTES: No auth required +// ============================================================================= + +// Health checks, logos, status +app.route('/', publicRoutes); + +// CDP routes (shared secret auth) +app.route('/cdp', cdp); + +// Auth routes (login/logout/me) — mounted before session middleware +// Login doesn't need a session (it creates one) +// Logout/me are handled inside the route +app.route('/api/auth', auth); + +// Serve the SPA for unauthenticated users (login page) +// The SPA handles client-side routing between login and chat +app.get('/', async (c) => { + const url = new URL(c.req.url); + return c.env.ASSETS.fetch(new Request(new URL('/index.html', url.origin).toString())); +}); + +// ============================================================================= +// SESSION MIDDLEWARE: Verify session cookie, resolve per-user sandbox +// ============================================================================= + +app.use('*', async (c, next) => { + const sessionSecret = c.env.SESSION_SECRET; + if (!sessionSecret) { + // Dev mode: skip session auth + if (c.env.DEV_MODE === 'true') { + // In dev mode, use a single sandbox + const options = buildSandboxOptions(c.env); + const sandbox = getSandbox(c.env.Sandbox, 'dev-user', options); + c.set('sandbox', sandbox); + return next(); + } + return c.json({ error: 'Server not configured (missing SESSION_SECRET)' }, 500); + } + + // Extract session token from cookie + const cookieHeader = c.req.header('Cookie'); + const token = extractSessionToken(cookieHeader); + + if (!token) { + // No session — redirect to login for HTML requests, 401 for API + const acceptsHtml = c.req.header('Accept')?.includes('text/html'); + if (acceptsHtml) { + return c.redirect('/'); + } + return c.json({ error: 'Authentication required', hint: 'POST /api/auth/login' }, 401); + } + + // Verify session token + const poeUser = await verifySessionToken(token, sessionSecret); + if (!poeUser) { + const acceptsHtml = c.req.header('Accept')?.includes('text/html'); + if (acceptsHtml) { + return c.redirect('/'); + } + return c.json({ error: 'Session expired or invalid' }, 401); + } + + // Resolve per-user sandbox + const options = buildSandboxOptions(c.env); + const sandbox = getSandbox(c.env.Sandbox, poeUser.userHash, options); + c.set('sandbox', sandbox); + c.set('poeUser', poeUser); + + await next(); +}); + +// ============================================================================= +// PROTECTED ROUTES: Session required +// ============================================================================= + +// Mount API routes (admin, storage, etc.) +app.route('/api', api); + +// Debug routes (protected + DEBUG_ROUTES flag) +app.use('/debug/*', async (c, next) => { + if (c.env.DEBUG_ROUTES !== 'true') { + return c.json({ error: 'Debug routes are disabled' }, 404); + } + return next(); +}); +app.route('/debug', debug); + +// ============================================================================= +// CATCH-ALL: Proxy to user's OpenClaw gateway +// ============================================================================= + +app.all('*', async (c) => { + const sandbox = c.get('sandbox'); + const request = c.req.raw; + const url = new URL(request.url); + + console.log('[PROXY] Handling request:', url.pathname); + + // Check if gateway is already running + const existingProcess = await findExistingMoltbotProcess(sandbox); + const isGatewayReady = existingProcess !== null && existingProcess.status === 'running'; + + const isWebSocketRequest = request.headers.get('Upgrade')?.toLowerCase() === 'websocket'; + const acceptsHtml = request.headers.get('Accept')?.includes('text/html'); + + if (!isGatewayReady && !isWebSocketRequest && acceptsHtml) { + console.log('[PROXY] Gateway not ready, serving loading page'); + c.executionCtx.waitUntil( + ensureMoltbotGateway(sandbox, c.env).catch((err: Error) => { + console.error('[PROXY] Background gateway start failed:', err); + }), + ); + return c.html(loadingPageHtml); + } + + try { + await ensureMoltbotGateway(sandbox, c.env); + } catch (error) { + console.error('[PROXY] Failed to start gateway:', error); + const errorMessage = error instanceof Error ? error.message : 'Unknown error'; + return c.json( + { + error: 'Gateway failed to start', + details: errorMessage, + hint: 'Your container may need a moment to boot. Try again.', + }, + 503, + ); + } + + // Proxy WebSocket connections + if (isWebSocketRequest) { + console.log('[WS] Proxying WebSocket connection'); + // Inject gateway token if configured + let wsRequest = request; + if (c.env.MOLTBOT_GATEWAY_TOKEN && !url.searchParams.has('token')) { + const tokenUrl = new URL(url.toString()); + tokenUrl.searchParams.set('token', c.env.MOLTBOT_GATEWAY_TOKEN); + wsRequest = new Request(tokenUrl.toString(), request); + } + return sandbox.wsConnect(wsRequest, MOLTBOT_PORT); + } + + // Proxy HTTP requests + console.log('[HTTP] Proxying:', url.pathname + url.search); + const httpResponse = await sandbox.containerFetch(request, MOLTBOT_PORT); + return new Response(httpResponse.body, { + status: httpResponse.status, + statusText: httpResponse.statusText, + headers: httpResponse.headers, + }); +}); + +export default { + fetch: app.fetch, +}; +``` + +**Note:** This simplified version removes the complex WebSocket interception for now. The full interception can be re-added in Phase 5 if needed. + +**Step 2: Verify types compile** + +Run: `cd /Volumes/dev/poeclaw && npx tsc --noEmit` + +Fix any import errors. The `adminUi` route is removed — if `src/routes/index.ts` still exports it, remove that export. + +**Step 3: Run existing tests** + +Run: `cd /Volumes/dev/poeclaw && npx vitest run` +Expected: Existing tests should still pass (they test internal modules, not the full app) + +**Step 4: Commit** + +```bash +git add src/index.ts +git commit -m "feat: rewrite index.ts for per-user sandbox resolution with session auth" +``` + +--- + +### Task 9: Update vite.config.ts for root-mounted SPA + +**Files:** +- Modify: `vite.config.ts` + +The old admin UI was mounted at `/_admin/`. The new chat UI is the main app at `/`. + +**Step 1: Change base path** + +In `vite.config.ts`, change: +```typescript +base: "/_admin/", +``` +to: +```typescript +base: "/", +``` + +**Step 2: Commit** + +```bash +git add vite.config.ts +git commit -m "feat: mount SPA at root instead of /_admin/" +``` + +--- + +### Task 10: Create minimal login page (functional, unstyled) + +**Files:** +- Modify: `src/client/App.tsx` +- Modify: `src/client/App.css` + +**Step 1: Replace App.tsx with login/chat router** + +Replace `src/client/App.tsx`: +```tsx +import { useState, useEffect } from 'react'; +import './App.css'; + +interface SessionInfo { + authenticated: boolean; + userHash?: string; + keyLast4?: string; + models?: Array<{ id: string; name: string }>; +} + +function LoginPage({ onLogin }: { onLogin: (session: SessionInfo) => void }) { + const [apiKey, setApiKey] = useState(''); + const [error, setError] = useState(''); + const [loading, setLoading] = useState(false); + + const handleSubmit = async (e: React.FormEvent) => { + e.preventDefault(); + setError(''); + setLoading(true); + + try { + const res = await fetch('/api/auth/login', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ apiKey: apiKey.trim() }), + }); + const data = await res.json(); + + if (!res.ok) { + setError(data.error || 'Login failed'); + return; + } + + onLogin({ + authenticated: true, + userHash: data.userHash, + keyLast4: data.keyLast4, + models: data.models, + }); + } catch (err) { + setError('Network error. Please try again.'); + } finally { + setLoading(false); + } + }; + + return ( +
+

PoeClaw

+

Paste your Poe API key to get started.

+

+ + Get your API key from poe.com + +

+
+ setApiKey(e.target.value)} + placeholder="pb-..." + disabled={loading} + autoFocus + /> + +
+ {error &&

{error}

} +
+ ); +} + +function ChatPage({ session }: { session: SessionInfo }) { + return ( +
+

Logged in as ***...{session.keyLast4}

+

Models: {session.models?.map((m) => m.id).join(', ') || 'none'}

+

Container is booting... (full chat UI coming in Phase 4)

+
+ ); +} + +export default function App() { + const [session, setSession] = useState(null); + const [checking, setChecking] = useState(true); + + useEffect(() => { + // Check if already logged in + fetch('/api/auth/me') + .then((res) => res.json()) + .then((data) => { + if (data.authenticated) { + setSession(data); + } + }) + .catch(() => {}) + .finally(() => setChecking(false)); + }, []); + + if (checking) { + return
Loading...
; + } + + if (!session) { + return ; + } + + return ; +} +``` + +**Step 2: Update App.css with minimal dark theme** + +Replace `src/client/App.css`: +```css +:root { + --bg: #1a1a2e; + --surface: #16213e; + --text: #e0e0e0; + --text-muted: #8888aa; + --accent: #7c5cfc; + --error: #ff6b6b; +} + +* { box-sizing: border-box; margin: 0; padding: 0; } + +body { + background: var(--bg); + color: var(--text); + font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; +} + +.login-page { + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + min-height: 100vh; + padding: 2rem; +} + +.login-page h1 { font-size: 2.5rem; margin-bottom: 0.5rem; } +.login-page p { color: var(--text-muted); margin-bottom: 0.5rem; } +.login-page a { color: var(--accent); } + +.login-page form { + display: flex; + gap: 0.5rem; + margin-top: 1.5rem; + width: 100%; + max-width: 500px; +} + +.login-page input { + flex: 1; + padding: 0.75rem 1rem; + border: 1px solid #333; + border-radius: 8px; + background: var(--surface); + color: var(--text); + font-size: 1rem; +} + +.login-page button { + padding: 0.75rem 1.5rem; + border: none; + border-radius: 8px; + background: var(--accent); + color: white; + font-size: 1rem; + cursor: pointer; +} + +.login-page button:disabled { + opacity: 0.5; + cursor: not-allowed; +} + +.error { color: var(--error); margin-top: 1rem; } + +.chat-page { + padding: 2rem; + text-align: center; +} + +.loading { + display: flex; + align-items: center; + justify-content: center; + min-height: 100vh; + color: var(--text-muted); +} +``` + +**Step 3: Build to verify no errors** + +Run: `cd /Volumes/dev/poeclaw && npm run build` +Expected: Build succeeds + +**Step 4: Commit** + +```bash +git add src/client/App.tsx src/client/App.css +git commit -m "feat: add minimal login page with dark theme" +``` + +--- + +### Task 11: Remove old admin UI (AdminPage) and update route exports + +**Files:** +- Delete/modify: `src/client/pages/AdminPage.tsx` (if it exists, remove it) +- Modify: `src/routes/index.ts` (remove adminUi export) +- Modify: `src/routes/admin-ui.ts` (remove or keep for backward compat) + +**Step 1: Update routes/index.ts** + +Replace `src/routes/index.ts`: +```typescript +export { publicRoutes } from './public'; +export { api } from './api'; +export { debug } from './debug'; +export { cdp } from './cdp'; +export { auth } from './auth'; +``` + +**Step 2: Run tests** + +Run: `cd /Volumes/dev/poeclaw && npx vitest run` +Expected: All tests pass + +**Step 3: Commit** + +```bash +git add src/routes/index.ts +git commit -m "feat: remove adminUi route, add auth route export" +``` + +--- + +## Phase 2: Poe Provider Integration + +### Task 12: Pass POE_API_KEY to container env vars + +**Files:** +- Modify: `src/gateway/env.ts` +- Modify: `src/gateway/env.test.ts` + +**Step 1: Write failing test** + +Add to `src/gateway/env.test.ts`: +```typescript +it('passes POE_API_KEY to container', () => { + const env = createMockEnv({ POE_API_KEY: 'pb-test-key-12345' } as any); + const result = buildEnvVars(env); + expect(result.POE_API_KEY).toBe('pb-test-key-12345'); +}); +``` + +**Step 2: Run test to verify it fails** + +Run: `cd /Volumes/dev/poeclaw && npx vitest run src/gateway/env.test.ts` +Expected: FAIL — `POE_API_KEY` not in result + +**Step 3: Add POE_API_KEY to buildEnvVars** + +In `src/gateway/env.ts`, after line 25 (`if (env.OPENAI_API_KEY) envVars.OPENAI_API_KEY = env.OPENAI_API_KEY;`), add: + +```typescript + // Poe provider (PoeClaw multi-tenant) + if ((env as any).POE_API_KEY) envVars.POE_API_KEY = (env as any).POE_API_KEY; +``` + +**Note:** We cast to `any` because `POE_API_KEY` isn't on `MoltbotEnv` — it's injected per-user at runtime. A cleaner approach is to accept an `overrides` parameter. Let's do that instead. + +Actually, the better approach: modify `ensureMoltbotGateway` in `process.ts` to accept env overrides. The per-user POE_API_KEY isn't a Worker-level env var — it comes from the session's decrypted key. + +**Alternative Step 3: Add overrides parameter to buildEnvVars** + +In `src/gateway/env.ts`, change the function signature: + +```typescript +export function buildEnvVars( + env: MoltbotEnv, + overrides?: Record, +): Record { + const envVars: Record = {}; + // ... existing code ... + + // Apply per-user overrides (e.g., POE_API_KEY for PoeClaw) + if (overrides) { + Object.assign(envVars, overrides); + } + + return envVars; +} +``` + +Update the test to use overrides: +```typescript +it('applies per-user env overrides', () => { + const env = createMockEnv(); + const result = buildEnvVars(env, { POE_API_KEY: 'pb-test-key-12345' }); + expect(result.POE_API_KEY).toBe('pb-test-key-12345'); +}); + +it('overrides take precedence over env vars', () => { + const env = createMockEnv({ ANTHROPIC_API_KEY: 'sk-original' }); + const result = buildEnvVars(env, { ANTHROPIC_API_KEY: 'sk-override' }); + expect(result.ANTHROPIC_API_KEY).toBe('sk-override'); +}); +``` + +**Step 4: Run tests** + +Run: `cd /Volumes/dev/poeclaw && npx vitest run src/gateway/env.test.ts` +Expected: All tests PASS + +**Step 5: Commit** + +```bash +git add src/gateway/env.ts src/gateway/env.test.ts +git commit -m "feat: add env overrides parameter to buildEnvVars for per-user keys" +``` + +--- + +### Task 13: Update process.ts to accept env overrides + +**Files:** +- Modify: `src/gateway/process.ts` + +**Step 1: Update ensureMoltbotGateway signature** + +In `src/gateway/process.ts`, change `ensureMoltbotGateway`: + +```typescript +export async function ensureMoltbotGateway( + sandbox: Sandbox, + env: MoltbotEnv, + envOverrides?: Record, +): Promise { +``` + +And change line 93: +```typescript + const envVars = buildEnvVars(env); +``` +to: +```typescript + const envVars = buildEnvVars(env, envOverrides); +``` + +**Step 2: Verify types compile** + +Run: `cd /Volumes/dev/poeclaw && npx tsc --noEmit` + +**Step 3: Run existing tests** + +Run: `cd /Volumes/dev/poeclaw && npx vitest run src/gateway/process.test.ts` +Expected: All tests PASS (new param is optional) + +**Step 4: Commit** + +```bash +git add src/gateway/process.ts +git commit -m "feat: accept env overrides in ensureMoltbotGateway for per-user config" +``` + +--- + +### Task 14: Update start-openclaw.sh for Poe provider + +**Files:** +- Modify: `start-openclaw.sh` + +**Step 1: Add POE_API_KEY onboard path** + +In `start-openclaw.sh`, after the existing `elif [ -n "$OPENAI_API_KEY" ]; then` block (around line 116), add a Poe path: + +```bash + elif [ -n "$POE_API_KEY" ]; then + # Poe uses OpenAI-compatible API — onboard with a dummy key, + # then patch the config with the real Poe provider below + AUTH_ARGS="--auth-choice openai-api-key --openai-api-key dummy-for-poe" +``` + +**Step 2: Add Poe provider config patching** + +In the Node.js config patch section (after the Slack configuration block, around line 261), add: + +```javascript +// Poe provider configuration (PoeClaw multi-tenant) +if (process.env.POE_API_KEY) { + config.models = config.models || {}; + config.models.providers = config.models.providers || {}; + config.models.providers.poe = { + baseUrl: 'https://api.poe.com/v1', + apiKey: process.env.POE_API_KEY, + api: 'openai-completions', + models: [ + { id: 'Claude-Sonnet-4.5', name: 'Claude Sonnet 4.5', contextWindow: 200000, maxTokens: 8192 }, + { id: 'GPT-5.2', name: 'GPT 5.2', contextWindow: 128000, maxTokens: 8192 }, + { id: 'Gemini-3-Pro', name: 'Gemini 3 Pro', contextWindow: 128000, maxTokens: 8192 }, + ], + }; + + // Set Poe as the default model provider + config.agents = config.agents || {}; + config.agents.defaults = config.agents.defaults || {}; + config.agents.defaults.model = { primary: 'poe/Claude-Sonnet-4.5' }; + console.log('Poe provider configured with API key'); + + // Enable HTTP chat completions endpoint for PoeClaw's HTTP API + config.gateway.http = config.gateway.http || {}; + config.gateway.http.endpoints = config.gateway.http.endpoints || {}; + config.gateway.http.endpoints.chatCompletions = { enabled: true }; + console.log('HTTP chat completions endpoint enabled'); + + // Skip device pairing in PoeClaw mode (Worker handles auth) + config.gateway.controlUi = config.gateway.controlUi || {}; + config.gateway.controlUi.allowInsecureAuth = true; +} +``` + +**Step 3: Test the script syntax** + +Run: `bash -n /Volumes/dev/poeclaw/start-openclaw.sh` +Expected: No syntax errors + +**Step 4: Commit** + +```bash +git add start-openclaw.sh +git commit -m "feat: add Poe provider config patching to start-openclaw.sh" +``` + +--- + +### Task 15: Wire per-user POE_API_KEY into sandbox startup from index.ts + +**Files:** +- Modify: `src/index.ts` (the catch-all proxy section) + +The login route stores the encrypted key. When the catch-all proxy boots the container, it needs to decrypt the key and pass it as `POE_API_KEY` to `ensureMoltbotGateway`. + +**Step 1: Update the catch-all in index.ts** + +In the catch-all handler, before `ensureMoltbotGateway`, add key decryption: + +```typescript +app.all('*', async (c) => { + const sandbox = c.get('sandbox'); + const poeUser = c.get('poeUser'); + const request = c.req.raw; + const url = new URL(request.url); + + // Build per-user env overrides + let envOverrides: Record | undefined; + if (poeUser && c.env.ENCRYPTION_SECRET) { + try { + // Read encrypted key from container (stored at login) + const readResult = await sandbox.exec('cat /tmp/poeclaw/encrypted-key 2>/dev/null || echo ""'); + const encryptedKey = readResult.stdout?.trim(); + if (encryptedKey) { + const poeApiKey = await decryptApiKey(encryptedKey, c.env.ENCRYPTION_SECRET); + envOverrides = { + POE_API_KEY: poeApiKey, + OPENCLAW_DEV_MODE: 'true', // Skip device pairing + }; + } + } catch (err) { + console.error('[PROXY] Failed to decrypt POE_API_KEY:', err); + } + } + + // ... rest of the catch-all handler, passing envOverrides to ensureMoltbotGateway +``` + +Update calls to `ensureMoltbotGateway(sandbox, c.env)` to `ensureMoltbotGateway(sandbox, c.env, envOverrides)`. + +**Step 2: Verify types compile** + +Run: `cd /Volumes/dev/poeclaw && npx tsc --noEmit` + +**Step 3: Commit** + +```bash +git add src/index.ts +git commit -m "feat: decrypt and pass per-user POE_API_KEY to container on boot" +``` + +--- + +## Phase 3: Per-User R2 Persistence + +### Task 16: Add user prefix to R2 paths in sync.ts + +**Files:** +- Modify: `src/gateway/sync.ts` +- Modify: `src/gateway/sync.test.ts` + +**Step 1: Add userPrefix parameter to syncToR2** + +In `src/gateway/sync.ts`, update the `rcloneRemote` helper and `syncToR2` function: + +```typescript +function rcloneRemote(env: MoltbotEnv, prefix: string, userPrefix?: string): string { + const base = `r2:${getR2BucketName(env)}/`; + return userPrefix ? `${base}users/${userPrefix}/${prefix}` : `${base}${prefix}`; +} + +export async function syncToR2( + sandbox: Sandbox, + env: MoltbotEnv, + userPrefix?: string, +): Promise { +``` + +And update all `remote()` calls to pass `userPrefix`: +```typescript + const remote = (prefix: string) => rcloneRemote(env, prefix, userPrefix); +``` + +**Step 2: Write a test for the user prefix** + +Add to `src/gateway/sync.test.ts`: +```typescript +it('uses user prefix in R2 paths when provided', async () => { + // ... setup mock sandbox that captures exec commands + await syncToR2(sandbox, env, 'abc123'); + // Verify the rclone command includes 'users/abc123/' + const execCalls = execMock.mock.calls.map((c: any[]) => c[0]); + expect(execCalls.some((cmd: string) => cmd.includes('users/abc123/openclaw/'))).toBe(true); +}); +``` + +**Step 3: Run tests** + +Run: `cd /Volumes/dev/poeclaw && npx vitest run src/gateway/sync.test.ts` +Expected: All tests PASS + +**Step 4: Commit** + +```bash +git add src/gateway/sync.ts src/gateway/sync.test.ts +git commit -m "feat: add per-user R2 path namespacing for data isolation" +``` + +--- + +### Task 17: Add user prefix to R2 paths in start-openclaw.sh + +**Files:** +- Modify: `start-openclaw.sh` + +**Step 1: Add R2_USER_PREFIX support** + +At the top of `start-openclaw.sh` (around line 36), add: +```bash +R2_USER_PREFIX="${R2_USER_PREFIX:-}" +``` + +Then update all R2 paths. For the restore section, change paths like: +```bash +r2:${R2_BUCKET}/openclaw/ +``` +to: +```bash +r2:${R2_BUCKET}/${R2_USER_PREFIX:+users/${R2_USER_PREFIX}/}openclaw/ +``` + +This uses bash parameter expansion: if `R2_USER_PREFIX` is set, prepend `users/{prefix}/`, otherwise use the root path (backward compatible). + +Apply the same pattern to `workspace/` and `skills/` paths, and the background sync loop paths. + +**Step 2: Test syntax** + +Run: `bash -n /Volumes/dev/poeclaw/start-openclaw.sh` +Expected: No syntax errors + +**Step 3: Commit** + +```bash +git add start-openclaw.sh +git commit -m "feat: add R2_USER_PREFIX support for per-user data isolation in shell" +``` + +--- + +### Task 18: Pass R2_USER_PREFIX in env overrides + +**Files:** +- Modify: `src/index.ts` (the envOverrides section from Task 15) + +**Step 1: Add R2_USER_PREFIX to envOverrides** + +In the catch-all handler where we build `envOverrides`, add: +```typescript +envOverrides = { + POE_API_KEY: poeApiKey, + OPENCLAW_DEV_MODE: 'true', + R2_USER_PREFIX: poeUser.userHash, // Per-user R2 path prefix +}; +``` + +**Step 2: Pass userPrefix to syncToR2 calls** + +In `src/routes/api.ts`, the `POST /api/admin/storage/sync` route calls `syncToR2(sandbox, c.env)`. Update it to pass the user prefix: +```typescript +const poeUser = c.get('poeUser'); +const result = await syncToR2(sandbox, c.env, poeUser?.userHash); +``` + +**Step 3: Commit** + +```bash +git add src/index.ts src/routes/api.ts +git commit -m "feat: pass per-user R2 prefix to container and sync operations" +``` + +--- + +## Phase 4: Poe-Style Chat Frontend + +### Task 19: Create useGatewayStatus hook + +**Files:** +- Create: `src/client/hooks/useGatewayStatus.ts` + +**Step 1: Create the hook** + +Create `src/client/hooks/useGatewayStatus.ts`: +```typescript +import { useState, useEffect, useRef } from 'react'; + +export type GatewayStatus = 'unknown' | 'booting' | 'running' | 'error'; + +/** + * Poll /api/status to track container boot progress. + * Returns current status and a flag for when it's ready. + */ +export function useGatewayStatus(enabled: boolean) { + const [status, setStatus] = useState('unknown'); + const intervalRef = useRef | null>(null); + + useEffect(() => { + if (!enabled) return; + + const check = async () => { + try { + const res = await fetch('/api/status'); + const data = await res.json(); + if (data.ok && data.status === 'running') { + setStatus('running'); + if (intervalRef.current) clearInterval(intervalRef.current); + } else { + setStatus('booting'); + } + } catch { + setStatus('error'); + } + }; + + check(); // Immediate check + intervalRef.current = setInterval(check, 3000); // Poll every 3s + + return () => { + if (intervalRef.current) clearInterval(intervalRef.current); + }; + }, [enabled]); + + return { status, isReady: status === 'running' }; +} +``` + +**Step 2: Commit** + +```bash +git add src/client/hooks/useGatewayStatus.ts +git commit -m "feat: add useGatewayStatus hook for cold start polling" +``` + +--- + +### Task 20: Create useChat SSE streaming hook + +**Files:** +- Create: `src/client/hooks/useChat.ts` + +**Step 1: Create the hook** + +Create `src/client/hooks/useChat.ts`: +```typescript +import { useState, useCallback, useRef } from 'react'; + +export interface ChatMessage { + role: 'user' | 'assistant'; + content: string; +} + +/** + * SSE streaming chat hook. + * Sends messages to /v1/chat/completions and streams the response. + */ +export function useChat(model: string) { + const [messages, setMessages] = useState([]); + const [isStreaming, setIsStreaming] = useState(false); + const abortRef = useRef(null); + + const sendMessage = useCallback( + async (userMessage: string) => { + const userMsg: ChatMessage = { role: 'user', content: userMessage }; + const updatedMessages = [...messages, userMsg]; + setMessages([...updatedMessages, { role: 'assistant', content: '' }]); + setIsStreaming(true); + + const controller = new AbortController(); + abortRef.current = controller; + + try { + const response = await fetch('/v1/chat/completions', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + model, + messages: updatedMessages.map((m) => ({ role: m.role, content: m.content })), + stream: true, + }), + signal: controller.signal, + }); + + if (!response.ok) { + const errData = await response.json().catch(() => ({})); + throw new Error(errData.error?.message || `HTTP ${response.status}`); + } + + const reader = response.body?.getReader(); + if (!reader) throw new Error('No response body'); + + const decoder = new TextDecoder(); + let assistantContent = ''; + let buffer = ''; + + while (true) { + const { done, value } = await reader.read(); + if (done) break; + + buffer += decoder.decode(value, { stream: true }); + const lines = buffer.split('\n'); + buffer = lines.pop() || ''; // Keep incomplete line in buffer + + for (const line of lines) { + if (!line.startsWith('data: ')) continue; + const data = line.slice(6).trim(); + if (data === '[DONE]') continue; + + try { + const parsed = JSON.parse(data); + const delta = parsed.choices?.[0]?.delta?.content; + if (delta) { + assistantContent += delta; + setMessages([ + ...updatedMessages, + { role: 'assistant', content: assistantContent }, + ]); + } + } catch { + // Skip unparseable SSE lines + } + } + } + } catch (err) { + if ((err as Error).name === 'AbortError') return; + setMessages([ + ...updatedMessages, + { role: 'assistant', content: `Error: ${(err as Error).message}` }, + ]); + } finally { + setIsStreaming(false); + abortRef.current = null; + } + }, + [messages, model], + ); + + const stopStreaming = useCallback(() => { + abortRef.current?.abort(); + }, []); + + const clearMessages = useCallback(() => { + setMessages([]); + }, []); + + return { messages, sendMessage, isStreaming, stopStreaming, clearMessages }; +} +``` + +**Step 2: Commit** + +```bash +git add src/client/hooks/useChat.ts +git commit -m "feat: add useChat SSE streaming hook" +``` + +--- + +### Task 21: Create Poe-style ChatPage + +**Files:** +- Create: `src/client/pages/ChatPage.tsx` +- Create: `src/client/pages/ChatPage.css` + +**Step 1: Create ChatPage component** + +Create `src/client/pages/ChatPage.tsx`: +```tsx +import { useState, useRef, useEffect } from 'react'; +import { useChat } from '../hooks/useChat'; +import { useGatewayStatus } from '../hooks/useGatewayStatus'; +import './ChatPage.css'; + +interface ChatPageProps { + models: Array<{ id: string; name: string }>; + keyLast4: string; + onLogout: () => void; +} + +export function ChatPage({ models, keyLast4, onLogout }: ChatPageProps) { + const [selectedModel, setSelectedModel] = useState(models[0]?.id || ''); + const { status, isReady } = useGatewayStatus(true); + const { messages, sendMessage, isStreaming, stopStreaming, clearMessages } = + useChat(selectedModel); + const [input, setInput] = useState(''); + const messagesEndRef = useRef(null); + + // Auto-scroll to bottom + useEffect(() => { + messagesEndRef.current?.scrollIntoView({ behavior: 'smooth' }); + }, [messages]); + + const handleSubmit = (e: React.FormEvent) => { + e.preventDefault(); + if (!input.trim() || isStreaming || !isReady) return; + sendMessage(input.trim()); + setInput(''); + }; + + const handleKeyDown = (e: React.KeyboardEvent) => { + if (e.key === 'Enter' && !e.shiftKey) { + e.preventDefault(); + handleSubmit(e); + } + }; + + const handleLogout = async () => { + await fetch('/api/auth/logout', { method: 'POST' }); + onLogout(); + }; + + return ( +
+ {/* Sidebar */} + + + {/* Main chat area */} +
+ {!isReady ? ( +
+
+

+ {status === 'booting' + ? 'Starting your sandbox... This may take a minute.' + : status === 'error' + ? 'Error connecting. Retrying...' + : 'Checking status...'} +

+
+ ) : ( + <> +
+ {messages.length === 0 && ( +
+

Start a conversation

+

Send a message to begin chatting with {selectedModel}

+
+ )} + {messages.map((msg, i) => ( +
+
{msg.role === 'user' ? 'You' : selectedModel}
+
{msg.content || '...'}
+
+ ))} +
+
+ +
+