diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index c008ee611..408ef85c7 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -13,7 +13,7 @@ Use `openshell` first to identify the active endpoint. Then use the platform too The target deployment flow is: -1. Operator starts or deploys the gateway. +1. Operator starts or deploys the gateway with system packages, systemd, Helm, or a development task. The CLI does not start, stop, or destroy gateway services. 2. Operator configures the compute driver. 3. Operator provides TLS and SSH relay material for the deployment mode. 4. The CLI registers a reachable gateway endpoint with `openshell gateway add`. @@ -198,7 +198,7 @@ openshell logs | Kubernetes gateway pod crash loops | Missing secret, bad DB URL, bad TLS config | `kubectl -n openshell logs statefulset/openshell` | | CLI TLS error | Local mTLS bundle does not match server cert/CA | Check `~/.config/openshell/gateways//mtls/` | | Image pull failure | Gateway or sandbox image cannot be pulled | Runtime events and image pull credentials | -| `K8s namespace not ready` with `envoy-gateway-openshell.yaml: the server could not find the requested resource` | Optional Gateway API manifest was auto-applied without Envoy Gateway CRDs, or k3s Helm controller startup exceeded the namespace wait | Confirm the cluster image only bundles core manifests; apply `deploy/kube/manifests/envoy-gateway-openshell.yaml` manually only when `grpcRoute` is enabled | +| `K8s namespace not ready` with `envoy-gateway-openshell.yaml: the server could not find the requested resource` | Optional Gateway API manifest was applied without Envoy Gateway CRDs, or k3s Helm controller startup exceeded the namespace wait | Apply `deploy/kube/manifests/envoy-gateway-openshell.yaml` manually only after Envoy Gateway is installed and `grpcRoute` is enabled | ## Reporting diff --git 
a/.agents/skills/openshell-cli/SKILL.md b/.agents/skills/openshell-cli/SKILL.md index 39af09e08..7451ea03d 100644 --- a/.agents/skills/openshell-cli/SKILL.md +++ b/.agents/skills/openshell-cli/SKILL.md @@ -9,7 +9,7 @@ Guide agents through using the `openshell` CLI for sandbox and platform manageme ## Overview -The OpenShell CLI (`openshell`) is the primary interface for managing sandboxes, providers, policies, inference routes, and gateways. This skill teaches agents how to orchestrate CLI commands for common and complex workflows. +The OpenShell CLI (`openshell`) is the primary interface for managing sandboxes, providers, policies, inference routes, and gateway registrations. Gateway service lifecycle is handled outside the CLI by packages, systemd, Helm, or development tasks. This skill teaches agents how to orchestrate CLI commands for common and complex workflows. **Companion skill**: For creating or modifying sandbox policy YAML content (network rules, L7 inspection, access presets), use the `generate-sandbox-policy` skill. This skill covers the CLI *commands* for the policy lifecycle; `generate-sandbox-policy` covers policy *content authoring*. 
@@ -486,7 +486,7 @@ openshell status # Verify connectivity ```bash openshell gateway add http://127.0.0.1:8080 --local --name local openshell gateway add https://gateway.example.com --name production -openshell gateway destroy --name local # Remove local registration +openshell gateway remove local # Remove local registration ``` ### Platform-specific deployment inspection @@ -549,7 +549,7 @@ $ openshell sandbox upload --help | Configure gateway inference | `openshell inference set --provider P --model M` | | View gateway inference | `openshell inference get` | | Delete sandbox | `openshell sandbox delete ` | -| Remove gateway registration | `openshell gateway destroy --name ` | +| Remove gateway registration | `openshell gateway remove ` | | Self-teach any command | `openshell --help` | ## Companion Skills diff --git a/.agents/skills/openshell-cli/cli-reference.md b/.agents/skills/openshell-cli/cli-reference.md index 3256aef42..adfa849dd 100644 --- a/.agents/skills/openshell-cli/cli-reference.md +++ b/.agents/skills/openshell-cli/cli-reference.md @@ -27,8 +27,10 @@ openshell ├── gateway │ ├── add [opts] │ ├── login [name] -│ ├── destroy [opts] +│ ├── logout [name] +│ ├── remove [name] │ ├── info [--name] +│ ├── list │ └── select [name] ├── status ├── inference @@ -62,8 +64,7 @@ openshell │ ├── update --type [opts] │ └── delete ... ├── doctor -│ ├── logs [--name] [-n] [--tail] [--remote] [--ssh-key] -│ └── exec [--name] [--remote] [--ssh-key] -- +│ └── check ├── term ├── completions └── ssh-proxy [opts] @@ -82,16 +83,15 @@ Register an existing gateway endpoint. 
| `--name ` | Gateway name | | `--local` | Register a local endpoint, commonly a trusted port-forward | | `--remote ` | Register a remote gateway associated with an SSH destination | -| `--ssh-key ` | SSH private key for the remote host | Examples: - `openshell gateway add http://127.0.0.1:8080 --local --name local` - `openshell gateway add https://gateway.example.com --name production` -### `openshell gateway destroy` +### `openshell gateway remove [name]` -Remove a gateway registration. For Helm deployments this affects local CLI metadata only; it does not uninstall the Helm release. +Remove a local gateway registration. This removes CLI metadata and stored auth tokens only; package managers, systemd, Helm, Docker, and other platform tools still own the gateway process. ### `openshell gateway login [name]` @@ -107,38 +107,17 @@ Show gateway details: endpoint, auth mode, and remote host metadata when present ### `openshell gateway select [name]` -Set the active gateway. Writes to `~/.config/openshell/active_gateway`. When called without arguments, lists all provisioned gateways with the active one marked with `*`. +Set the active gateway. Writes to `~/.config/openshell/active_gateway`. When called without arguments, lists all registered gateways with the active one marked with `*`. --- ## Doctor Commands -### `openshell doctor logs` +### `openshell doctor check` -Fetch logs when gateway metadata supports it. For Helm deployments, prefer `kubectl -n openshell logs statefulset/openshell`. - -| Flag | Default | Description | -|------|---------|-------------| -| `--name ` | active gateway | Gateway name | -| `-n, --lines ` | all | Number of log lines to return | -| `--tail` | false | Stream live logs (follow mode) | -| `--remote ` | auto-resolved | SSH destination for remote gateways | -| `--ssh-key ` | none | SSH private key for remote gateways | - -### `openshell doctor exec -- ` - -Run a diagnostic command when gateway metadata supports it. 
For Helm deployments, prefer direct `kubectl` and `helm` commands. - -| Flag | Default | Description | -|------|---------|-------------| -| `--name ` | active gateway | Gateway name | -| `--remote ` | auto-resolved | SSH destination for remote gateways | -| `--ssh-key ` | none | SSH private key for remote gateways | - -Examples: -- `kubectl -n openshell get pods` -- `kubectl -n openshell logs statefulset/openshell` -- `helm -n openshell status openshell` +Validate local Docker prerequisites for standalone gateway development. For +package-managed or Helm gateways, use `systemctl`, `journalctl`, `kubectl`, and +`helm` directly. --- diff --git a/.env.example b/.env.example index 61cdf4e3c..d043379e7 100644 --- a/.env.example +++ b/.env.example @@ -1,22 +1,19 @@ # OpenShell local development environment # Copy to .env and customise. Mise loads .env automatically. # -# Use unique CLUSTER_NAME/GATEWAY_PORT values per worktree to run -# multiple clusters simultaneously; `mise run cluster` will recreate as needed. +# Use unique gateway names and ports per worktree when running standalone +# gateways with `mise run gateway:docker`. -# ---------- Cluster identity ---------- +# ---------- Gateway identity ---------- -# Name used for the Docker container, k3s volume, TLS secrets, and the -# openshell CLI's active-cluster bookmark. Defaults to the repo directory -# basename (e.g. "openshell-c"). -#CLUSTER_NAME=openshell-c +# Gateway name registered by `mise run gateway:docker`. +#OPENSHELL_DOCKER_GATEWAY_NAME=openshell-c # Default gateway name used by `openshell` commands in this repo when `--gateway` -# is not provided. Usually matches CLUSTER_NAME. +# is not provided. Usually matches OPENSHELL_DOCKER_GATEWAY_NAME. #OPENSHELL_GATEWAY=openshell-c # ---------- Ports ---------- -# Host port mapped to the k3s NodePort (30051) where the OpenShell gateway -# listens. The CLI connects here. Must be unique per cluster. 
-#GATEWAY_PORT=8080 +# Host port where the standalone gateway listens. Must be unique per worktree. +#OPENSHELL_SERVER_PORT=18080 diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 3b3aa1cb8..bd8f77f11 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -4,7 +4,7 @@ on: workflow_call: inputs: component: - description: "Component to build (gateway, supervisor, cluster)" + description: "Component to build (gateway, supervisor)" required: true type: string timeout-minutes: @@ -73,7 +73,7 @@ jobs: binary_component=gateway binary_name=openshell-gateway ;; - supervisor|cluster) + supervisor) binary_component=sandbox binary_name=openshell-sandbox ;; @@ -246,11 +246,6 @@ jobs: echo "$output" grep -q '^openshell-sandbox ' <<<"$output" ;; - cluster) - output="$(docker run --rm --platform "${{ matrix.platform }}" --entrypoint /opt/openshell/bin/openshell-sandbox "$image" --version)" - echo "$output" - grep -q '^openshell-sandbox ' <<<"$output" - ;; esac merge: diff --git a/.github/workflows/e2e-gpu-test.yaml b/.github/workflows/e2e-gpu-test.yaml index 6a296f5e3..2607a1bf0 100644 --- a/.github/workflows/e2e-gpu-test.yaml +++ b/.github/workflows/e2e-gpu-test.yaml @@ -24,18 +24,12 @@ jobs: include: - name: linux-arm64 runner: linux-arm64-gpu-l4-latest-1 - cluster: e2e-gpu-arm64 - port: "8083" experimental: false - name: linux-amd64 runner: linux-amd64-gpu-rtxpro6000-latest-1 - cluster: e2e-gpu-amd64 - port: "8084" experimental: false - name: wsl-amd64 runner: wsl-amd64-gpu-rtxpro6000-latest-1 - cluster: e2e-gpu-wsl - port: "8085" experimental: true container: image: ghcr.io/nvidia/openshell/ci:latest @@ -53,30 +47,15 @@ jobs: OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }} OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }} - OPENSHELL_GATEWAY: ${{ matrix.cluster }} + OPENSHELL_E2E_DOCKER_GPU: "1" steps: - uses: actions/checkout@v6 - name: Log in 
to GHCR run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin - - name: Pull cluster image - run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }} - - name: Install Python dependencies and generate protobuf stubs run: uv sync --frozen && mise run --no-deps python:proto - - name: Bootstrap GPU cluster - env: - GATEWAY_HOST: host.docker.internal - GATEWAY_PORT: ${{ matrix.port }} - CLUSTER_NAME: ${{ matrix.cluster }} - # Passes --gpu to the gateway bootstrap so the cluster comes up with GPU passthrough enabled. - CLUSTER_GPU: "1" - SKIP_IMAGE_PUSH: "1" - SKIP_CLUSTER_IMAGE_BUILD: "1" - OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }} - run: mise run --no-deps --skip-deps cluster - - name: Run tests run: mise run --no-deps --skip-deps e2e:python:gpu diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml index 8f7284aab..defe6f32a 100644 --- a/.github/workflows/release-canary.yml +++ b/.github/workflows/release-canary.yml @@ -141,41 +141,21 @@ jobs: fi canary: - name: Canary ${{ matrix.mode }} (${{ matrix.arch }}) + name: Canary package gateway (${{ matrix.arch }}) if: ${{ github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' }} strategy: fail-fast: false matrix: - arch: - - amd64 - - arm64 - mode: - - auto-bootstrap - - two-step include: - arch: amd64 runner: linux-amd64-cpu8 - target: x86_64-unknown-linux-musl - arch: arm64 runner: linux-arm64-cpu8 - target: aarch64-unknown-linux-musl runs-on: ${{ matrix.runner }} timeout-minutes: 30 - container: - image: ghcr.io/nvidia/openshell/ci:latest - credentials: - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - options: --privileged - volumes: - - /var/run/docker.sock:/var/run/docker.sock env: OPENSHELL_REGISTRY_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # The CI container mounts the host Docker socket, so the gateway - # container 
is a sibling — not reachable at 127.0.0.1 from inside - # this container. OPENSHELL_GATEWAY_HOST tells the auto-bootstrap - # to advertise a reachable address instead. - OPENSHELL_GATEWAY_HOST: host.docker.internal + OPENSHELL_CANARY_PORT: "17670" steps: - uses: actions/checkout@v6 @@ -202,50 +182,83 @@ jobs: fi fi - - name: Install CLI from published install script + - name: Install Debian package run: | set -euo pipefail - curl -LsSf https://raw.githubusercontent.com/NVIDIA/OpenShell/main/install.sh | OPENSHELL_VERSION=${{ steps.release.outputs.tag }} OPENSHELL_INSTALL_DIR=/usr/local/bin sh + curl -LsSf https://raw.githubusercontent.com/NVIDIA/OpenShell/main/install-dev.sh \ + | OPENSHELL_VERSION=${{ steps.release.outputs.tag }} sh - - name: Verify CLI installation + - name: Verify package binaries run: | set -euo pipefail command -v openshell - ACTUAL="$(openshell --version)" - echo "Installed: $ACTUAL" + command -v openshell-gateway + test -x /usr/libexec/openshell/openshell-driver-vm + + CLI_ACTUAL="$(openshell --version)" + GATEWAY_ACTUAL="$(openshell-gateway --version)" + DRIVER_ACTUAL="$(/usr/libexec/openshell/openshell-driver-vm --version)" + echo "CLI: ${CLI_ACTUAL}" + echo "Gateway: ${GATEWAY_ACTUAL}" + echo "Driver: ${DRIVER_ACTUAL}" + TAG="${{ steps.release.outputs.tag }}" - # For tagged releases (v1.2.3), verify the semver appears in the version string if [[ "$TAG" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then EXPECTED="${TAG#v}" - if [[ "$ACTUAL" != *"$EXPECTED"* ]]; then - echo "::error::Version mismatch: expected '$EXPECTED' in '$ACTUAL'" - exit 1 - fi - echo "Version check passed: found $EXPECTED in output" + for actual in "$CLI_ACTUAL" "$GATEWAY_ACTUAL" "$DRIVER_ACTUAL"; do + if [[ "$actual" != *"$EXPECTED"* ]]; then + echo "::error::Version mismatch: expected '$EXPECTED' in '$actual'" + exit 1 + fi + done + echo "Version check passed: found $EXPECTED in package binaries" else echo "Non-release tag ($TAG), skipping version check" fi - - name: Resolve 
gateway host - run: | - # On Linux CI runners host.docker.internal is not set automatically - # (it's a Docker Desktop feature). Add it via the Docker bridge IP. - if ! getent hosts host.docker.internal >/dev/null 2>&1; then - BRIDGE_IP=$(docker network inspect bridge --format '{{(index .IPAM.Config 0).Gateway}}') - echo "Adding /etc/hosts entry: ${BRIDGE_IP} host.docker.internal" - echo "${BRIDGE_IP} host.docker.internal" >> /etc/hosts - fi - - # Two-step mode: explicitly start the gateway before creating a sandbox. - # --gateway-host is required because the gateway container is a Docker - # sibling (not in the same network namespace). Without it the metadata - # stores 127.0.0.1 which is unreachable from this CI container. - - name: Start gateway - if: matrix.mode == 'two-step' + - name: Start packaged gateway run: | set -euo pipefail - echo "Starting gateway..." - openshell gateway start --gateway-host "$OPENSHELL_GATEWAY_HOST" + + # The CLI no longer owns gateway lifecycle. In CI we start the + # gateway binary installed by the Debian package directly, using the + # Docker driver so this canary can launch a real sandbox. + systemctl --user stop openshell-gateway >/dev/null 2>&1 || true + + STATE_DIR="$(mktemp -d)" + LOG="${STATE_DIR}/openshell-gateway.log" + echo "GATEWAY_LOG=${LOG}" >> "$GITHUB_ENV" + echo "GATEWAY_STATE_DIR=${STATE_DIR}" >> "$GITHUB_ENV" + + OPENSHELL_BIND_ADDRESS=0.0.0.0 \ + OPENSHELL_SERVER_PORT="${OPENSHELL_CANARY_PORT}" \ + OPENSHELL_DISABLE_TLS=true \ + OPENSHELL_DISABLE_GATEWAY_AUTH=true \ + OPENSHELL_DRIVERS=docker \ + OPENSHELL_DB_URL="sqlite:${STATE_DIR}/openshell.db?mode=rwc" \ + OPENSHELL_GRPC_ENDPOINT="http://host.openshell.internal:${OPENSHELL_CANARY_PORT}" \ + OPENSHELL_SSH_GATEWAY_HOST=127.0.0.1 \ + OPENSHELL_SSH_GATEWAY_PORT="${OPENSHELL_CANARY_PORT}" \ + OPENSHELL_SANDBOX_NAMESPACE="canary-${{ matrix.arch }}-${{ github.run_id }}" \ + nohup openshell-gateway >"${LOG}" 2>&1 & + PID=$! 
+ echo "GATEWAY_PID=${PID}" >> "$GITHUB_ENV" + + for _ in $(seq 1 60); do + if curl -fsS "http://127.0.0.1:${OPENSHELL_CANARY_PORT}/healthz" >/dev/null; then + break + fi + if ! kill -0 "$PID" 2>/dev/null; then + echo "::error::openshell-gateway exited before becoming healthy" + cat "$LOG" + exit 1 + fi + sleep 1 + done + + curl -fsS "http://127.0.0.1:${OPENSHELL_CANARY_PORT}/healthz" + openshell gateway remove local >/dev/null 2>&1 || true + openshell gateway add "http://127.0.0.1:${OPENSHELL_CANARY_PORT}" --local --name local - name: Run canary test run: | @@ -267,3 +280,15 @@ jobs: echo "::error::Canary test failed: 'hello world' not found in output" exit 1 fi + + - name: Stop packaged gateway + if: always() + run: | + set -euo pipefail + if [ -n "${GATEWAY_PID:-}" ]; then + kill "$GATEWAY_PID" >/dev/null 2>&1 || true + fi + if [ "${{ job.status }}" != "success" ] && [ -n "${GATEWAY_LOG:-}" ] && [ -f "$GATEWAY_LOG" ]; then + echo "Gateway log:" + cat "$GATEWAY_LOG" + fi diff --git a/.github/workflows/release-dev.yml b/.github/workflows/release-dev.yml index dfafc43e1..0385930bd 100644 --- a/.github/workflows/release-dev.yml +++ b/.github/workflows/release-dev.yml @@ -67,15 +67,8 @@ jobs: component: supervisor cargo-version: ${{ needs.compute-versions.outputs.cargo_version }} - build-cluster: - needs: [compute-versions] - uses: ./.github/workflows/docker-build.yml - with: - component: cluster - cargo-version: ${{ needs.compute-versions.outputs.cargo_version }} - e2e: - needs: [build-gateway, build-supervisor, build-cluster] + needs: [build-gateway, build-supervisor] uses: ./.github/workflows/e2e-test.yml with: image-tag: ${{ github.sha }} @@ -83,7 +76,7 @@ jobs: tag-ghcr-dev: name: Tag GHCR Images as Dev - needs: [build-gateway, build-supervisor, build-cluster] + needs: [build-gateway, build-supervisor] runs-on: linux-amd64-cpu8 timeout-minutes: 10 steps: @@ -94,7 +87,7 @@ jobs: run: | set -euo pipefail REGISTRY="ghcr.io/nvidia/openshell" - for component in 
gateway supervisor cluster; do + for component in gateway supervisor; do echo "Tagging ${REGISTRY}/${component}:${{ github.sha }} as dev..." docker buildx imagetools create \ --prefer-index=false \ diff --git a/.github/workflows/release-tag.yml b/.github/workflows/release-tag.yml index 8a3aa7ae5..15cde9b2b 100644 --- a/.github/workflows/release-tag.yml +++ b/.github/workflows/release-tag.yml @@ -82,15 +82,8 @@ jobs: component: supervisor cargo-version: ${{ needs.compute-versions.outputs.cargo_version }} - build-cluster: - needs: [compute-versions] - uses: ./.github/workflows/docker-build.yml - with: - component: cluster - cargo-version: ${{ needs.compute-versions.outputs.cargo_version }} - e2e: - needs: [build-gateway, build-supervisor, build-cluster] + needs: [build-gateway, build-supervisor] uses: ./.github/workflows/e2e-test.yml with: image-tag: ${{ github.sha }} @@ -98,7 +91,7 @@ jobs: tag-ghcr-release: name: Tag GHCR Images for Release - needs: [compute-versions, build-gateway, build-supervisor, build-cluster, e2e] + needs: [compute-versions, build-gateway, build-supervisor, e2e] runs-on: linux-amd64-cpu8 timeout-minutes: 10 steps: @@ -110,7 +103,7 @@ jobs: set -euo pipefail REGISTRY="ghcr.io/nvidia/openshell" VERSION="${{ needs.compute-versions.outputs.semver }}" - for component in gateway supervisor cluster; do + for component in gateway supervisor; do echo "Tagging ${REGISTRY}/${component}:${{ github.sha }} as ${VERSION} and latest..." 
docker buildx imagetools create \ --prefer-index=false \ diff --git a/.github/workflows/shadow-docker-build.yml b/.github/workflows/shadow-docker-build.yml index 62e687867..3c7642ab3 100644 --- a/.github/workflows/shadow-docker-build.yml +++ b/.github/workflows/shadow-docker-build.yml @@ -33,11 +33,3 @@ jobs: platform: ${{ inputs.platform }} push: false secrets: inherit - - cluster: - uses: ./.github/workflows/docker-build.yml - with: - component: cluster - platform: ${{ inputs.platform }} - push: false - secrets: inherit diff --git a/.github/workflows/test-gpu.yml b/.github/workflows/test-gpu.yml index 4721c9750..37fdcbb94 100644 --- a/.github/workflows/test-gpu.yml +++ b/.github/workflows/test-gpu.yml @@ -25,7 +25,7 @@ jobs: with: required_label: test:e2e-gpu - build-gateway: + build-supervisor: needs: [pr_metadata] if: needs.pr_metadata.outputs.should_run == 'true' permissions: @@ -33,20 +33,10 @@ jobs: packages: write uses: ./.github/workflows/docker-build.yml with: - component: gateway - - build-cluster: - needs: [pr_metadata] - if: needs.pr_metadata.outputs.should_run == 'true' - permissions: - contents: read - packages: write - uses: ./.github/workflows/docker-build.yml - with: - component: cluster + component: supervisor e2e-gpu: - needs: [pr_metadata, build-gateway, build-cluster] + needs: [pr_metadata, build-supervisor] if: needs.pr_metadata.outputs.should_run == 'true' permissions: contents: read diff --git a/AGENTS.md b/AGENTS.md index 93062fd5f..838726f8b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -34,7 +34,7 @@ These pipelines connect skills into end-to-end workflows. 
Individual skill files | `crates/openshell-sandbox/` | Sandbox runtime | Container supervision, policy-enforced egress routing | | `crates/openshell-policy/` | Policy engine | Filesystem, network, process, and inference constraints | | `crates/openshell-router/` | Privacy router | Privacy-aware LLM routing | -| `crates/openshell-bootstrap/` | Gateway metadata | Gateway registration metadata, mTLS bundle storage, legacy bootstrap helpers | +| `crates/openshell-bootstrap/` | Gateway metadata | Gateway registration metadata, auth token storage, mTLS bundle storage | | `crates/openshell-ocsf/` | OCSF logging | OCSF v1.7.0 event types, builders, shorthand/JSONL formatters, tracing layers | | `crates/openshell-core/` | Shared core | Common types, configuration, error handling | | `crates/openshell-providers/` | Provider management | Credential provider backends | diff --git a/Cargo.lock b/Cargo.lock index c02158d44..ed11fd1db 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3325,8 +3325,6 @@ checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381" name = "openshell-bootstrap" version = "0.0.0" dependencies = [ - "async-stream", - "base64 0.22.1", "bollard", "bytes", "futures", diff --git a/architecture/gateway-deploy-connect.md b/architecture/gateway-deploy-connect.md index 14bb3e90f..beb00ecda 100644 --- a/architecture/gateway-deploy-connect.md +++ b/architecture/gateway-deploy-connect.md @@ -93,7 +93,7 @@ The CLI also treats an explicit `http://...` registration as plaintext mode: openshell gateway add http://127.0.0.1:8080 --local ``` -This stores `auth_mode = "plaintext"`, skips mTLS certificate extraction, and bypasses the edge browser-auth flow. +This stores `auth_mode = "plaintext"`, skips mTLS client certificate lookup, and bypasses the edge browser-auth flow. 
## File System Layout diff --git a/architecture/oidc-auth.md b/architecture/oidc-auth.md index 746a6a519..199b7c85c 100644 --- a/architecture/oidc-auth.md +++ b/architecture/oidc-auth.md @@ -276,7 +276,7 @@ Standard OIDC scopes (`openid`, `profile`, `email`, `offline_access`) are filter ### CLI Scope Requests -The `--oidc-scopes` flag on `gateway add` and `gateway start` is stored in gateway metadata and included in OAuth2 token requests: +The `--oidc-scopes` flag on `gateway add` is stored in gateway metadata and included in OAuth2 token requests: - **Browser flow**: appended to the `scope` parameter alongside `openid` - **Client credentials flow**: sent as-is (without `openid`, which is inappropriate for service tokens) @@ -313,22 +313,20 @@ These flags configure JWT validation on the `openshell-server` binary: When `--oidc-issuer` is not set, OIDC validation is disabled and the server falls back to mTLS-only or plaintext behavior. -### Gateway Start Flags (CLI) +### Gateway Runtime Flags -The `openshell gateway start` command exposes flags that configure both the server and the local gateway metadata: +OIDC validation is configured on the gateway service, either through package-managed service configuration, direct `openshell-server` flags, or Helm values: | Flag | Default | Description | |---|---|---| -| `--oidc-issuer` | (none) | OIDC issuer URL; passed to the server binary | -| `--oidc-audience` | `openshell-cli` | Expected `aud` claim; passed to the server binary | -| `--oidc-client-id` | `openshell-cli` | Client ID stored in gateway metadata for CLI login flows | -| `--oidc-roles-claim` | (none) | Passed to the server binary if set | -| `--oidc-admin-role` | (none) | Passed to the server binary if set | -| `--oidc-user-role` | (none) | Passed to the server binary if set | -| `--oidc-scopes-claim` | (none) | Passed to the server binary; enables scope enforcement | -| `--oidc-scopes` | (none) | Stored in gateway metadata; included in CLI token requests | +| 
`--oidc-issuer` | (none) | OIDC issuer URL | +| `--oidc-audience` | `openshell-cli` | Expected `aud` claim | +| `--oidc-roles-claim` | `realm_access.roles` | Dot-separated path to roles array | +| `--oidc-admin-role` | `openshell-admin` | Role name that grants admin access | +| `--oidc-user-role` | `openshell-user` | Role name that grants standard user access | +| `--oidc-scopes-claim` | (empty) | Claim path for scopes; enables scope enforcement | -The `--oidc-client-id` flag is **not** a server flag — it is stored in gateway metadata and used by the CLI during login. The `--oidc-audience` flag is both a server flag (for JWT validation) and stored in metadata (for token requests). +The CLI no longer starts or stops gateways. After the service is running, `openshell gateway add --oidc-*` stores client-side login metadata such as client ID, audience, and requested scopes. ### Helm Values @@ -350,7 +348,7 @@ The server exposes `GET /auth/oidc-config` which returns the configured OIDC iss ### Keycloak ```bash -openshell gateway start \ +openshell-server \ --oidc-issuer http://keycloak:8180/realms/openshell # Defaults work: realm_access.roles, openshell-admin, openshell-user ``` @@ -360,10 +358,9 @@ openshell gateway start \ Register an app in Azure Portal with app roles `OpenShell.Admin` and `OpenShell.User`. With Entra ID the client ID (the SPA/public app registration) and audience (the API app registration, e.g. 
`api://openshell`) are typically different: ```bash -openshell gateway start \ +openshell-server \ --oidc-issuer https://login.microsoftonline.com/{tenant-id}/v2.0 \ --oidc-audience api://openshell \ - --oidc-client-id {client-id} \ --oidc-roles-claim roles \ --oidc-admin-role OpenShell.Admin \ --oidc-user-role OpenShell.User @@ -383,7 +380,7 @@ openshell gateway add https://gateway:8080 \ Create an authorization server with a `groups` claim, then: ```bash -openshell gateway start \ +openshell-server \ --oidc-issuer https://dev-xxxxx.okta.com/oauth2/default \ --oidc-roles-claim groups \ --oidc-admin-role openshell-admin \ @@ -395,7 +392,7 @@ openshell gateway start \ GitHub's OIDC tokens (from Actions) don't carry roles. Use empty role names to skip RBAC — any valid GitHub JWT is authorized: ```bash -openshell gateway start \ +openshell-server \ --oidc-issuer https://token.actions.githubusercontent.com \ --oidc-audience https://github.com/{org} \ --oidc-admin-role "" \ @@ -422,22 +419,6 @@ openshell gateway add http://gateway:8080 \ --oidc-audience api://openshell ``` -### Start a K3s Gateway with OIDC - -```bash -openshell gateway start \ - --oidc-issuer http://keycloak:8180/realms/openshell \ - --plaintext - -# With RBAC configuration: -openshell gateway start \ - --oidc-issuer http://keycloak:8180/realms/openshell \ - --oidc-client-id openshell-cli \ - --oidc-roles-claim realm_access.roles \ - --oidc-admin-role openshell-admin \ - --oidc-user-role openshell-user -``` - ### Authenticate ```bash @@ -533,8 +514,6 @@ The CLI determines which auth mode to use based on `auth_mode` in gateway metada | CLI gateway commands | `crates/openshell-cli/src/run.rs` (`gateway_add`, `gateway_login`) | | Token storage | `crates/openshell-bootstrap/src/oidc_token.rs` | | Gateway metadata | `crates/openshell-bootstrap/src/metadata.rs` | -| Bootstrap pipeline | `crates/openshell-bootstrap/src/lib.rs`, `docker.rs` | -| K3s entrypoint | `deploy/docker/cluster-entrypoint.sh` | | 
HelmChart template | `deploy/kube/manifests/openshell-helmchart.yaml` | | Helm values | `deploy/helm/openshell/values.yaml` | | Helm statefulset | `deploy/helm/openshell/templates/statefulset.yaml` | diff --git a/architecture/oidc-local-testing.md b/architecture/oidc-local-testing.md index 160636a9e..b345f3400 100644 --- a/architecture/oidc-local-testing.md +++ b/architecture/oidc-local-testing.md @@ -1,7 +1,7 @@ # OIDC Local Testing Guide Step-by-step instructions for testing OIDC/Keycloak authentication locally, -including both standalone server testing and full end-to-end K3s testing. +including both standalone server testing and full end-to-end Helm/k3d testing. ## Prerequisites @@ -136,7 +136,7 @@ curl -s http://127.0.0.1:8080/auth/oidc-config | jq . # Expected: {"audience":"openshell-cli","issuer":"http://localhost:8180/realms/openshell"} ``` -Stop the standalone server (Ctrl+C) before proceeding to K3s testing. +Stop the standalone server (Ctrl+C) before proceeding to Helm/k3d testing. ## 3. CLI OIDC Flow (Standalone) @@ -182,40 +182,45 @@ cargo run -p openshell-cli --features bundled-z3 -- sandbox list # Expected: error (no token) ``` -## 4. End-to-End K3s Testing +## 4. End-to-End Helm/k3d Testing -This deploys a full K3s cluster with OIDC enforcement and tests sandbox +This deploys a local k3d cluster with OIDC enforcement and tests sandbox creation, RBAC, login/logout, and token expiry. -### 4a. Bootstrap the cluster with OIDC +### 4a. Start the Helm deployment with OIDC -Keycloak runs on the host. The K3s container reaches it via the host IP. -The `OPENSHELL_OIDC_ISSUER` env var tells the deploy script to pass the -issuer to the Helm chart so the gateway starts with JWT validation enabled. +Use the Helm local development flow and include the Keycloak values overlay so +the gateway starts with JWT validation enabled. 
```bash -HOST_IP=$(hostname -I | awk '{print $1}') -OPENSHELL_OIDC_ISSUER="http://${HOST_IP}:8180/realms/openshell" \ -OPENSHELL_OIDC_SCOPES="openshell:all" \ -mise run cluster +mise run helm:k3s:create +mise run keycloak:k8s:setup +mise run helm:skaffold:run ``` -Add `OPENSHELL_OIDC_SCOPES_CLAIM="scope"` to also enable scope enforcement. -The `OPENSHELL_OIDC_SCOPES` value is stored in gateway metadata so `gateway login` -requests these scopes automatically. +Set `server.oidc.scopesClaim` in the Helm values to enable scope enforcement. +Pass `--oidc-scopes` when registering the gateway so `gateway login` requests +those scopes automatically. -Wait for "Deploy complete!" and verify OIDC is active: +Verify OIDC is active: ```bash -CONTAINER=$(docker ps --format '{{.Names}}' | grep openshell-cluster) -docker exec $CONTAINER kubectl -n openshell logs openshell-0 | grep OIDC +kubectl -n openshell logs statefulset/openshell | grep OIDC # Expected: OIDC JWT validation enabled (issuer: http://...) ``` ### 4b. Login to the gateway -The bootstrap step above configures the gateway metadata with the OIDC -issuer automatically. Authenticate with Keycloak: +If the cluster task did not register gateway metadata automatically, register +the running gateway before login: + +```bash +openshell gateway add http://127.0.0.1:8080 \ + --oidc-issuer http://localhost:8180/realms/openshell \ + --oidc-scopes "openshell:all" +``` + +Authenticate with Keycloak: ```bash openshell gateway login @@ -353,7 +358,7 @@ openshell provider delete test-provider ## 5. Scope-Based Permissions Testing -Scopes provide fine-grained, per-method access control on top of roles. This section tests scope enforcement using both the standalone server and K3s. +Scopes provide fine-grained, per-method access control on top of roles. This section tests scope enforcement using both the standalone server and Helm/k3d. ### 5a. 
Standalone server with scope enforcement @@ -436,16 +441,6 @@ openshell gateway add http://127.0.0.1:8080 \ --oidc-scopes "sandbox:read sandbox:write" ``` -Or for K3s testing, pass `OPENSHELL_OIDC_SCOPES` during bootstrap: - -```bash -HOST_IP=$(hostname -I | awk '{print $1}') -OPENSHELL_OIDC_ISSUER="http://${HOST_IP}:8180/realms/openshell" \ -OPENSHELL_OIDC_SCOPES_CLAIM="scope" \ -OPENSHELL_OIDC_SCOPES="sandbox:read sandbox:write" \ -mise run cluster -``` - Then login and test: ```bash @@ -458,15 +453,10 @@ openshell provider list # should fail (no provider:read scope) ### 5f. Test openshell:all via CLI -For K3s, restart the cluster with `openshell:all`: +For Helm/k3d, update the OIDC scopes in the Helm values and redeploy: ```bash -mise run cluster:stop -HOST_IP=$(hostname -I | awk '{print $1}') -OPENSHELL_OIDC_ISSUER="http://${HOST_IP}:8180/realms/openshell" \ -OPENSHELL_OIDC_SCOPES_CLAIM="scope" \ -OPENSHELL_OIDC_SCOPES="openshell:all" \ -mise run cluster +mise run helm:skaffold:run openshell gateway login openshell sandbox list # should work @@ -506,8 +496,8 @@ grpcurl -plaintext -import-path proto -proto openshell.proto \ ## 6. Cleanup ```bash -# Stop the cluster -mise run cluster:stop +# Delete the local k3d cluster +mise run helm:k3s:delete # Stop Keycloak mise run keycloak:stop @@ -556,11 +546,11 @@ mise run keycloak:stop **"sandbox secret required for this method"** — A sandbox-to-server RPC was called without the `x-sandbox-secret` header. -**"OIDC discovery request failed"** — Server can't reach Keycloak. Use the host IP (not `localhost`) for K3s deployments. +**"OIDC discovery request failed"** — Server can't reach Keycloak. Check the Keycloak service and issuer configured in the Helm values. **"invalid token: unknown signing key"** — JWKS key mismatch. Restart the server to refresh the cache. -**No "OIDC JWT validation enabled" in K3s logs** — The `OPENSHELL_OIDC_ISSUER` env var was not set when deploying. 
Re-run `OPENSHELL_OIDC_ISSUER="http://:8180/realms/openshell" mise run cluster gateway` to rebuild and redeploy with OIDC enabled. +**No "OIDC JWT validation enabled" in gateway logs** — The Helm values did not set `server.oidc.issuer`. Include `values-keycloak.yaml` or another OIDC values overlay and redeploy with `mise run helm:skaffold:run`. **"InvalidIssuer"** — The issuer URL in the OIDC token does not match the server's configured issuer. Ensure the gateway metadata `oidc_issuer` uses the same URL the server was started with (typically the host IP, not `localhost`). diff --git a/architecture/sandbox-custom-containers.md b/architecture/sandbox-custom-containers.md index b8303c5de..d231f4bd1 100644 --- a/architecture/sandbox-custom-containers.md +++ b/architecture/sandbox-custom-containers.md @@ -9,7 +9,7 @@ The `--from` flag accepts four kinds of input: | Input | Example | Behavior | |-------|---------|----------| | **Community sandbox name** | `--from openclaw` | Resolves to `ghcr.io/nvidia/openshell-community/sandboxes/openclaw:latest` | -| **Dockerfile path** | `--from ./Dockerfile` | Builds the image locally, makes it available to the local gateway when needed, then creates the sandbox | +| **Dockerfile path** | `--from ./Dockerfile` | Builds the image into the local Docker daemon, then creates the sandbox | | **Directory with Dockerfile** | `--from ./my-sandbox/` | Uses the directory as the build context | | **Full image reference** | `--from myregistry.com/img:tag` | Uses the image directly | @@ -34,8 +34,7 @@ The community registry prefix defaults to `ghcr.io/nvidia/openshell-community/sa When `--from` points to a Dockerfile or directory, the CLI: 1. Builds the image locally via the Docker daemon (respecting `.dockerignore`). -2. Makes it available to the local gateway runtime when a managed local gateway is running; otherwise keeps the tag in the host Docker daemon for standalone local drivers. -3. Creates the sandbox with the resulting image tag. +2. 
Creates the sandbox with the resulting image tag. The build step aborts with a clear error if the Docker build stream stays silent for longer than `OPENSHELL_BUILD_NO_PROGRESS_TIMEOUT_SECS` seconds (default 1800). This is a guard against deadlocked container runtimes — most commonly an under-provisioned VM (e.g. macOS Colima with the default 2 vCPU / 2 GiB) where BuildKit can stop emitting events partway through a multi-step build and never recover. Raise the value if a legitimate build step is just quiet, or lower it for tighter CI budgets. @@ -110,14 +109,14 @@ The `openshell-sandbox` supervisor adapts to arbitrary environments: |----------|-----------| | Unified `--from` flag | Single entry point for community names, Dockerfiles, directories, and image refs — removes the need to know registry paths | | Community name resolution | Bare names like `openclaw` expand to the GHCR community registry, making the common case simple | -| Auto build/import for Dockerfiles | Eliminates the two-step build/import + create workflow for local gateway development | +| Auto build for Dockerfiles | Eliminates the two-step build + create workflow for local gateway development | | `OPENSHELL_COMMUNITY_REGISTRY` env var | Allows organizations to host their own community sandbox registry | | Driver-owned supervisor delivery | Each compute driver decides how to deliver `openshell-sandbox` for its runtime. | | Read-only supervisor delivery | The supervisor should be mounted or packaged read-only where the driver supports it, and the startup seccomp prelude blocks remount syscalls that would otherwise reopen it for writes once privileged bootstrap has completed. 
| | Command override | Ensures `openshell-sandbox` is the entrypoint regardless of the image's default CMD | | Clear `run_as_user/group` for custom images | Prevents startup failure when the image lacks the default `sandbox` user | | Non-fatal log file init | `/var/log/openshell.log` may be unwritable in arbitrary images; falls back to stdout | -| Local gateway image availability | Dockerfile sources build into the host Docker daemon; managed local gateway deployments import the tag so the selected runtime can resolve it. | +| Local gateway image availability | Dockerfile sources build into the host Docker daemon; package-managed local gateways and their compute drivers resolve the resulting tag from that daemon. | | Optional `iptables` for bypass detection | Core network isolation works via routing alone (`iproute2`); `iptables` only adds fast-fail (`ECONNREFUSED`) and diagnostic LOG entries. Making it optional avoids hard failures in minimal images that lack `iptables` while giving better UX when it is available. | ## Limitations diff --git a/crates/openshell-bootstrap/Cargo.toml b/crates/openshell-bootstrap/Cargo.toml index 942ffc48b..c0fb7e9f4 100644 --- a/crates/openshell-bootstrap/Cargo.toml +++ b/crates/openshell-bootstrap/Cargo.toml @@ -11,9 +11,7 @@ rust-version.workspace = true [dependencies] openshell-core = { path = "../openshell-core" } -async-stream = "0.3" -base64 = "0.22" -bollard = { version = "0.20", features = ["ssh"] } +bollard = "0.20" bytes = { workspace = true } futures = { workspace = true } miette = { workspace = true } diff --git a/crates/openshell-bootstrap/src/build.rs b/crates/openshell-bootstrap/src/build.rs index 9caeca57a..3b61a4517 100644 --- a/crates/openshell-bootstrap/src/build.rs +++ b/crates/openshell-bootstrap/src/build.rs @@ -1,13 +1,11 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -//! 
Build container images for gateway runtimes. +//! Build container images for sandbox runtimes. //! //! This module wraps bollard's `build_image()` API to build a container image -//! from a Dockerfile and build context. Kubernetes deployments reuse the -//! existing push pipeline to import the image into the gateway's containerd -//! runtime. VM deployments keep the built image in the local Docker daemon and -//! pass an internal local-image reference to the VM driver. +//! from a Dockerfile and build context. Package-managed local gateways use the +//! host Docker daemon, so the resulting tag is passed to the gateway directly. use std::collections::HashMap; use std::path::Path; @@ -19,9 +17,6 @@ use futures::StreamExt; use miette::{IntoDiagnostic, Result, WrapErr}; use tokio::time::timeout; -use crate::constants::container_name; -use crate::push::push_local_images; - /// Maximum gap between Docker build stream events before a build is treated /// as stuck. /// @@ -35,10 +30,9 @@ const DEFAULT_BUILD_NO_PROGRESS_TIMEOUT_SECS: u64 = 1800; /// Build a container image from a Dockerfile using the local Docker daemon. /// -/// This is used by `openshell sandbox create --from ` for both the -/// Kubernetes and VM backends. The image remains available in the local Docker -/// daemon so the caller can either hand the resulting tag directly to the VM -/// backend or import it into a local gateway containerd runtime. +/// This is used by `openshell sandbox create --from `. The image +/// remains available in the local Docker daemon so the gateway's active local +/// compute driver can resolve the tag. #[allow(clippy::implicit_hasher)] pub async fn build_local_image( dockerfile_path: &Path, @@ -56,49 +50,6 @@ pub async fn build_local_image( Ok(()) } -/// Push a locally-built image into the gateway's containerd runtime. 
-#[allow(clippy::implicit_hasher)] -pub async fn push_image_into_gateway( - tag: &str, - gateway_name: &str, - on_log: &mut impl FnMut(String), -) -> Result<()> { - on_log(format!( - "Pushing image {tag} into gateway \"{gateway_name}\"" - )); - let local_docker = crate::docker::connect_local_for_large_transfers() - .into_diagnostic() - .wrap_err("failed to connect to local Docker daemon")?; - let container = container_name(gateway_name); - let images: Vec<&str> = vec![tag]; - push_local_images(&local_docker, &local_docker, &container, &images, on_log).await?; - - on_log(format!("Image {tag} is available in the gateway.")); - Ok(()) -} - -/// Build a container image from a Dockerfile and push it into the gateway. -/// -/// This is used by `openshell sandbox create --from ` when the -/// active gateway is the local Kubernetes deployment. It: -/// 1. Creates a tar archive of the build context directory. -/// 2. Sends it to the local Docker daemon via `build_image()`. -/// 3. Pushes the resulting image into the gateway's containerd via the -/// existing `push_local_images()` pipeline. -#[allow(clippy::implicit_hasher)] -pub async fn build_and_push_image( - dockerfile_path: &Path, - tag: &str, - context_dir: &Path, - gateway_name: &str, - build_args: &HashMap, - on_log: &mut impl FnMut(String), -) -> Result<()> { - build_local_image(dockerfile_path, tag, context_dir, build_args, on_log).await?; - push_image_into_gateway(tag, gateway_name, on_log).await?; - Ok(()) -} - /// Build a container image using the local Docker daemon. /// /// Creates a tar archive of `context_dir`, sends it to Docker with the diff --git a/crates/openshell-bootstrap/src/constants.rs b/crates/openshell-bootstrap/src/constants.rs deleted file mode 100644 index eee9000d1..000000000 --- a/crates/openshell-bootstrap/src/constants.rs +++ /dev/null @@ -1,112 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -/// Path to the kubeconfig inside the k3s container. -/// Used by in-container kubectl operations (node cleanup, PKI reconciliation, etc.). -pub const KUBECONFIG_PATH: &str = "/etc/rancher/k3s/k3s.yaml"; - -/// K8s secret holding the server's TLS certificate and private key. -pub const SERVER_TLS_SECRET_NAME: &str = "openshell-server-tls"; -/// K8s secret holding the CA certificate used to verify client certificates. -pub const SERVER_CLIENT_CA_SECRET_NAME: &str = "openshell-server-client-ca"; -/// K8s secret holding the client TLS certificate, key, and CA cert (shared by CLI and sandboxes). -pub const CLIENT_TLS_SECRET_NAME: &str = "openshell-client-tls"; -/// K8s secret holding the SSH handshake HMAC secret (shared by gateway and sandbox pods). -pub const SSH_HANDSHAKE_SECRET_NAME: &str = "openshell-ssh-handshake"; -const NODE_NAME_PREFIX: &str = "openshell-"; -const NODE_NAME_FALLBACK_SUFFIX: &str = "gateway"; -const KUBERNETES_MAX_NAME_LEN: usize = 253; - -pub fn container_name(name: &str) -> String { - format!("openshell-cluster-{name}") -} - -/// Deterministic k3s node name derived from the gateway name. -/// -/// k3s defaults to using the container hostname (= Docker container ID) as -/// the node name. When the container is recreated (e.g. after an image -/// upgrade), the container ID changes, creating a new k3s node. The -/// `clean_stale_nodes` function then deletes PVCs whose backing PVs have -/// node affinity for the old node — wiping the server database and any -/// sandbox persistent volumes. -/// -/// By passing a deterministic `--node-name` to k3s, the node identity -/// survives container recreation, and PVCs are never orphaned. -/// -/// Gateway names allow Docker-friendly separators and uppercase characters, -/// but Kubernetes node names must be DNS-safe. 
Normalize the gateway name into -/// a single lowercase RFC 1123 label so previously accepted names such as -/// `prod_us` or `Prod.US` still deploy successfully. -pub fn node_name(name: &str) -> String { - format!("{NODE_NAME_PREFIX}{}", normalize_node_name_suffix(name)) -} - -fn normalize_node_name_suffix(name: &str) -> String { - let mut normalized = String::with_capacity(name.len()); - let mut last_was_separator = false; - - for ch in name.chars() { - if ch.is_ascii_alphanumeric() { - normalized.push(ch.to_ascii_lowercase()); - last_was_separator = false; - } else if !last_was_separator { - normalized.push('-'); - last_was_separator = true; - } - } - - let mut normalized = normalized.trim_matches('-').to_string(); - if normalized.is_empty() { - normalized.push_str(NODE_NAME_FALLBACK_SUFFIX); - } - - let max_suffix_len = KUBERNETES_MAX_NAME_LEN.saturating_sub(NODE_NAME_PREFIX.len()); - if normalized.len() > max_suffix_len { - normalized.truncate(max_suffix_len); - normalized.truncate(normalized.trim_end_matches('-').len()); - } - - if normalized.is_empty() { - normalized.push_str(NODE_NAME_FALLBACK_SUFFIX); - } - - normalized -} - -pub fn volume_name(name: &str) -> String { - format!("openshell-cluster-{name}") -} - -pub fn network_name(name: &str) -> String { - format!("openshell-cluster-{name}") -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn node_name_normalizes_uppercase_and_underscores() { - assert_eq!(node_name("Prod_US"), "openshell-prod-us"); - } - - #[test] - fn node_name_collapses_and_trims_separator_runs() { - assert_eq!(node_name("._Prod..__-Gateway-."), "openshell-prod-gateway"); - } - - #[test] - fn node_name_falls_back_when_gateway_name_has_no_alphanumerics() { - assert_eq!(node_name("...___---"), "openshell-gateway"); - } - - #[test] - fn node_name_truncates_to_kubernetes_name_limit() { - let gateway_name = "A".repeat(400); - let node_name = node_name(&gateway_name); - - assert!(node_name.len() <= KUBERNETES_MAX_NAME_LEN); - 
assert!(node_name.starts_with(NODE_NAME_PREFIX)); - assert!(node_name.ends_with('a')); - } -} diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs deleted file mode 100644 index c18c938aa..000000000 --- a/crates/openshell-bootstrap/src/docker.rs +++ /dev/null @@ -1,1499 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -use crate::RemoteOptions; -use crate::constants::{container_name, network_name, node_name, volume_name}; -use crate::image::{self, DEFAULT_IMAGE_REPO_BASE, DEFAULT_REGISTRY, parse_image_ref}; -use bollard::API_DEFAULT_VERSION; -use bollard::Docker; -use bollard::errors::Error as BollardError; -use bollard::models::{ - ContainerCreateBody, DeviceRequest, EndpointSettings, HostConfig, HostConfigCgroupnsModeEnum, - NetworkConnectRequest, NetworkCreateRequest, NetworkDisconnectRequest, PortBinding, - RestartPolicy, RestartPolicyNameEnum, VolumeCreateRequest, -}; -use bollard::query_parameters::{ - CreateContainerOptions, CreateImageOptions, InspectContainerOptions, InspectNetworkOptions, - ListContainersOptionsBuilder, RemoveContainerOptions, RemoveImageOptions, RemoveVolumeOptions, - StartContainerOptions, -}; -use futures::StreamExt; -use miette::{IntoDiagnostic, Result, WrapErr}; -use std::collections::HashMap; - -const REGISTRY_NAMESPACE_DEFAULT: &str = "openshell"; - -/// Default total HTTP timeout for Docker API calls that stream large payloads -/// (e.g. `docker save` used by `sandbox create --from`). Bollard's own -/// `connect_with_local_defaults()` ceiling is 120s, which is far too short for -/// multi-GB image exports — a 7 GB image on a laptop SSD takes ~4–5 minutes. -/// One hour is a safe upper bound; override with `OPENSHELL_DOCKER_TIMEOUT_SECS`. -pub const DEFAULT_LARGE_TRANSFER_TIMEOUT_SECS: u64 = 3600; - -/// Build a local-Docker client suitable for large streaming transfers. 
-/// Respects `OPENSHELL_DOCKER_TIMEOUT_SECS` (in seconds); falls back to -/// [`DEFAULT_LARGE_TRANSFER_TIMEOUT_SECS`] when unset or unparseable. -pub fn connect_local_for_large_transfers() -> std::result::Result { - let secs: u64 = std::env::var("OPENSHELL_DOCKER_TIMEOUT_SECS") - .ok() - .and_then(|s| s.parse().ok()) - .unwrap_or(DEFAULT_LARGE_TRANSFER_TIMEOUT_SECS); - Ok(Docker::connect_with_local_defaults()?.with_timeout(std::time::Duration::from_secs(secs))) -} - -/// Resolve the raw GPU device-ID list, replacing the `"auto"` sentinel with a -/// concrete device ID based on whether CDI is enabled on the daemon. -/// -/// | Input | Output | -/// |--------------|--------------------------------------------------------------| -/// | `[]` | `[]` — no GPU | -/// | `["legacy"]` | `["legacy"]` — pass through to the non-CDI fallback path | -/// | `["auto"]` | `["nvidia.com/gpu=all"]` if CDI enabled, else `["legacy"]` | -/// | `[cdi-ids…]` | unchanged | -pub fn resolve_gpu_device_ids(gpu: &[String], cdi_enabled: bool) -> Vec { - match gpu { - [] => vec![], - [v] if v == "auto" => { - if cdi_enabled { - vec!["nvidia.com/gpu=all".to_string()] - } else { - vec!["legacy".to_string()] - } - } - other => other.to_vec(), - } -} - -const REGISTRY_MODE_EXTERNAL: &str = "external"; - -fn env_non_empty(key: &str) -> Option { - std::env::var(key) - .ok() - .map(|v| v.trim().to_string()) - .filter(|v| !v.is_empty()) -} - -fn env_bool(key: &str) -> Option { - env_non_empty(key).map(|value| { - matches!( - value.to_ascii_lowercase().as_str(), - "1" | "true" | "yes" | "on" - ) - }) -} - -/// Platform information for a Docker daemon host. -#[derive(Debug, Clone)] -pub struct HostPlatform { - /// CPU architecture (e.g., "amd64", "arm64") - pub arch: String, - /// Operating system (e.g., "linux") - pub os: String, -} - -impl HostPlatform { - /// Return the platform string in the format `os/arch` (e.g., `linux/amd64`). 
- pub fn platform_string(&self) -> String { - format!("{}/{}", self.os, self.arch) - } -} - -/// Query the Docker daemon for the host platform (architecture and OS). -pub async fn get_host_platform(docker: &Docker) -> Result { - let version = docker - .version() - .await - .into_diagnostic() - .wrap_err("failed to query Docker daemon version")?; - - let arch = version - .arch - .ok_or_else(|| miette::miette!("Docker daemon did not report architecture"))?; - let os = version - .os - .ok_or_else(|| miette::miette!("Docker daemon did not report OS"))?; - - Ok(HostPlatform { - arch: normalize_arch(&arch), - os: os.to_lowercase(), - }) -} - -/// Normalize architecture names to Docker convention. -/// -/// Docker uses `amd64` / `arm64` / `arm` etc., but some systems may report -/// `x86_64` or `aarch64` instead. -pub fn normalize_arch(arch: &str) -> String { - match arch { - "x86_64" => "amd64".to_string(), - "aarch64" => "arm64".to_string(), - other => other.to_lowercase(), - } -} - -/// Result of a successful Docker preflight check. -/// -/// Contains the validated Docker client and metadata about the daemon so -/// callers can reuse the connection without re-checking. -#[derive(Debug)] -pub struct DockerPreflight { - /// A Docker client that has been verified as connected and responsive. - pub docker: Docker, - /// Docker daemon version string (e.g., "28.1.1"). - pub version: Option, -} - -/// Well-known Docker socket paths to probe when the default fails. 
-/// -/// These cover common container runtimes on macOS and Linux: -/// - `/var/run/docker.sock` — default for Docker Desktop, `OrbStack`, Colima -/// - `$HOME/.colima/docker.sock` — Colima (older installs) -/// - `$HOME/.orbstack/run/docker.sock` — `OrbStack` (if symlink is missing) -const WELL_KNOWN_SOCKET_PATHS: &[&str] = &[ - "/var/run/docker.sock", - // Expanded at runtime via home_dir(): - // ~/.colima/docker.sock - // ~/.orbstack/run/docker.sock -]; - -/// Check that a Docker-compatible runtime is installed, running, and reachable. -/// -/// This is the primary preflight gate. It must be called before any gateway -/// deploy work begins. On failure it produces a user-friendly error with -/// actionable recovery steps instead of a raw bollard connection error. -pub async fn check_docker_available() -> Result { - // Step 1: Try to connect using bollard's default resolution - // (respects DOCKER_HOST, then falls back to /var/run/docker.sock). - let docker = match Docker::connect_with_local_defaults() { - Ok(d) => d, - Err(err) => { - return Err(docker_not_reachable_error( - &format!("{err}"), - "Failed to create Docker client", - )); - } - }; - - // Step 2: Ping the daemon to confirm it's responsive. - if let Err(err) = docker.ping().await { - return Err(docker_not_reachable_error( - &format!("{err}"), - "Docker socket exists but the daemon is not responding", - )); - } - - // Step 3: Query version info (best-effort — don't fail on this). - let version = match docker.version().await { - Ok(v) => v.version, - Err(_) => None, - }; - - Ok(DockerPreflight { docker, version }) -} - -/// Build a rich, user-friendly error when Docker is not reachable. 
-fn docker_not_reachable_error(raw_err: &str, summary: &str) -> miette::Report { - let docker_host = std::env::var("DOCKER_HOST").ok(); - let socket_exists = std::path::Path::new("/var/run/docker.sock").exists(); - - let mut hints: Vec = Vec::new(); - - if !socket_exists && docker_host.is_none() { - // No socket and no DOCKER_HOST — likely nothing is installed or started - hints.push( - "No Docker socket found at /var/run/docker.sock and DOCKER_HOST is not set." - .to_string(), - ); - hints.push( - "Install and start a Docker-compatible runtime. See the support matrix \ - in the OpenShell docs for tested configurations." - .to_string(), - ); - - // Check for alternative sockets that might exist - let alt_sockets = find_alternative_sockets(); - if !alt_sockets.is_empty() { - hints.push(format!( - "Found Docker-compatible socket(s) at alternative path(s):\n {}\n\n \ - Set DOCKER_HOST to use one, e.g.:\n\n \ - export DOCKER_HOST=unix://{}", - alt_sockets.join("\n "), - alt_sockets[0], - )); - } - } else if docker_host.is_some() { - // DOCKER_HOST is set but daemon didn't respond - let host_val = docker_host.unwrap(); - hints.push(format!( - "DOCKER_HOST is set to '{host_val}' but the Docker daemon is not responding." - )); - hints.push( - "Verify your Docker runtime is started and the DOCKER_HOST value is correct." - .to_string(), - ); - } else { - // Socket exists but daemon isn't responding - hints.push( - "Docker socket found at /var/run/docker.sock but the daemon is not responding." - .to_string(), - ); - hints.push("Start your Docker runtime and try again.".to_string()); - } - - hints.push("Verify Docker is working with: docker info".to_string()); - - let help_text = hints.join("\n\n"); - - miette::miette!(help = help_text, "{summary}.\n\n {raw_err}") -} - -/// Probe for Docker-compatible sockets at non-default locations. 
-fn find_alternative_sockets() -> Vec { - let mut found = Vec::new(); - - // Check well-known static paths - for path in WELL_KNOWN_SOCKET_PATHS { - if std::path::Path::new(path).exists() { - found.push(path.to_string()); - } - } - - // Check home-relative paths - if let Some(home) = home_dir() { - let home_sockets = [ - format!("{home}/.colima/docker.sock"), - format!("{home}/.orbstack/run/docker.sock"), - ]; - for path in &home_sockets { - if std::path::Path::new(path).exists() && !found.contains(path) { - found.push(path.clone()); - } - } - } - - found -} - -fn home_dir() -> Option { - std::env::var("HOME").ok() -} - -/// Create an SSH Docker client from remote options. -pub async fn create_ssh_docker_client(remote: &RemoteOptions) -> Result { - // Ensure destination has ssh:// prefix - let ssh_url = if remote.destination.starts_with("ssh://") { - remote.destination.clone() - } else { - format!("ssh://{}", remote.destination) - }; - - let docker = Docker::connect_with_ssh( - &ssh_url, - 600, // timeout in seconds (10 minutes for large image transfers) - API_DEFAULT_VERSION, - remote.ssh_key.clone(), - ) - .into_diagnostic() - .wrap_err_with(|| format!("failed to connect to remote Docker daemon at {ssh_url}"))?; - - // Negotiate the API version with the remote daemon. bollard defaults to - // a recent API version (1.52) which may be higher than what the remote - // Docker supports. Version negotiation downgrades the client version to - // match the server, preventing errors like "Schema 2 manifest not - // supported by client" when pulling images on older Docker daemons. - docker - .negotiate_version() - .await - .into_diagnostic() - .wrap_err("failed to negotiate Docker API version with remote daemon") -} - -/// Find the running openshell gateway container by image name. -/// -/// Lists all running containers and returns the name of the one whose image -/// contains `openshell/cluster`. 
When `port` is provided, only containers -/// with a matching host port binding are considered — this disambiguates -/// when multiple gateway containers are running on the same host. -/// -/// Fails if zero or multiple containers match. -pub async fn find_gateway_container(docker: &Docker, port: Option) -> Result { - let containers = docker - .list_containers(Some(ListContainersOptionsBuilder::new().all(false).build())) - .await - .into_diagnostic() - .wrap_err("failed to list Docker containers")?; - - let is_gateway_image = |c: &bollard::models::ContainerSummary| { - c.image - .as_deref() - .is_some_and(|img| img.contains("openshell/cluster")) - }; - - let has_port = |c: &bollard::models::ContainerSummary, p: u16| { - c.ports - .as_deref() - .unwrap_or_default() - .iter() - .any(|binding| binding.public_port == Some(p)) - }; - - let container_name = |c: &bollard::models::ContainerSummary| { - c.names - .as_ref() - .and_then(|n| n.first()) - .map(|n| n.trim_start_matches('/').to_string()) - }; - - let matches: Vec = containers - .iter() - .filter(|c| is_gateway_image(c) && port.is_none_or(|p| has_port(c, p))) - .filter_map(container_name) - .collect(); - - match matches.len() { - 0 => { - let hint = port.map_or_else( - || { - "No openshell gateway container found.\n\ - Is the gateway running? Check with: docker ps" - .to_string() - }, - |p| { - format!( - "No openshell gateway container found listening on port {p}.\n\ - Is the gateway running? Check with: docker ps" - ) - }, - ); - Err(miette::miette!("{hint}")) - } - 1 => Ok(matches.into_iter().next().unwrap()), - _ => Err(miette::miette!( - "Found multiple openshell gateway containers: {}\n\ - Specify the port in the endpoint URL to select one (e.g. https://host:8080).", - matches.join(", ") - )), - } -} - -/// Create a fresh Docker bridge network for the gateway. -/// -/// Always removes and recreates the network to guarantee a clean state. 
-/// Stale Docker networks (e.g., from a previous interrupted destroy or -/// Docker Desktop restart) can leave broken routing that causes the -/// container to fail with "no default routes found". -pub async fn ensure_network(docker: &Docker, net_name: &str) -> Result<()> { - force_remove_network(docker, net_name).await?; - - // Docker may return a 409 conflict if the previous network teardown has - // not fully completed in the daemon. Retry a few times with back-off, - // re-attempting the removal before each create. - let mut last_err = None; - for attempt in 0u64..5 { - if attempt > 0 { - tokio::time::sleep(std::time::Duration::from_millis(500 * attempt)).await; - // Re-attempt removal in case the previous teardown has now settled. - force_remove_network(docker, net_name).await?; - } - match docker - .create_network(NetworkCreateRequest { - name: net_name.to_string(), - driver: Some("bridge".to_string()), - attachable: Some(true), - ..Default::default() - }) - .await - { - Ok(_) => return Ok(()), - Err(err) if is_conflict(&err) => { - tracing::debug!( - "Network create conflict (attempt {}/5), retrying: {}", - attempt + 1, - err, - ); - last_err = Some(err); - } - Err(err) => { - return Err(err) - .into_diagnostic() - .wrap_err("failed to create Docker network"); - } - } - } - Err(last_err.expect("at least one retry attempt")) - .into_diagnostic() - .wrap_err("failed to create Docker network after retries (network still in use)") -} - -pub async fn ensure_volume(docker: &Docker, name: &str) -> Result<()> { - match docker.inspect_volume(name).await { - Ok(_) => return Ok(()), - Err(err) if is_not_found(&err) => {} - Err(err) => return Err(err).into_diagnostic(), - } - - docker - .create_volume(VolumeCreateRequest { - name: Some(name.to_string()), - ..Default::default() - }) - .await - .into_diagnostic() - .wrap_err("failed to create Docker volume")?; - Ok(()) -} - -pub async fn ensure_image( - docker: &Docker, - image_ref: &str, - registry_username: 
Option<&str>, - registry_token: Option<&str>, -) -> Result<()> { - match docker.inspect_image(image_ref).await { - Ok(_) => return Ok(()), - Err(err) if is_not_found(&err) => {} - Err(err) => return Err(err).into_diagnostic(), - } - - // For local-only images (no registry prefix), give a clear error instead - // of attempting a pull from Docker Hub that will always fail. - if image::is_local_image_ref(image_ref) { - return Err(miette::miette!( - "Image '{}' not found locally. This looks like a locally-built image \ - (no registry prefix). Build it first with `mise run docker:build:gateway`.", - image_ref, - )); - } - - let (repo, tag) = parse_image_ref(image_ref); - - // Use explicit GHCR credentials when provided for ghcr.io images. - // Public repos are pulled without authentication by default. - let credentials = if repo.starts_with("ghcr.io/") { - image::ghcr_credentials(registry_username, registry_token) - } else { - None - }; - - let options = CreateImageOptions { - from_image: Some(repo.clone()), - tag: if tag.is_empty() { None } else { Some(tag) }, - ..Default::default() - }; - - let mut stream = docker.create_image(Some(options), None, credentials); - while let Some(result) = stream.next().await { - result.into_diagnostic()?; - } - Ok(()) -} - -/// Returns the actual host port the container is using. When an existing -/// container is reused (same image), this may differ from `gateway_port` -/// because the container was originally created with a different port. -// Refactoring this signature would touch many call sites across the workspace. 
-#[allow(clippy::too_many_arguments)] -pub async fn ensure_container( - docker: &Docker, - name: &str, - image_ref: &str, - extra_sans: &[String], - ssh_gateway_host: Option<&str>, - gateway_port: u16, - disable_tls: bool, - disable_gateway_auth: bool, - registry_username: Option<&str>, - registry_token: Option<&str>, - device_ids: &[String], - resume: bool, - oidc_issuer: Option<&str>, - oidc_audience: &str, - oidc_roles_claim: Option<&str>, - oidc_admin_role: Option<&str>, - oidc_user_role: Option<&str>, - oidc_scopes_claim: Option<&str>, -) -> Result { - let container_name = container_name(name); - - // Check if the container already exists - match docker - .inspect_container(&container_name, None::) - .await - { - Ok(info) => { - // On resume we always reuse the existing container — the persistent - // volume holds k3s etcd state, and recreating the container with - // different env vars would cause the entrypoint to rewrite the - // HelmChart manifest, triggering a Helm upgrade that changes the - // StatefulSet image reference while the old pod still runs with the - // previous image. Reusing the container avoids this entirely. - // - // On a non-resume path we check whether the image changed and - // recreate only when necessary. - let reuse = if resume { - true - } else { - let desired_id = docker - .inspect_image(image_ref) - .await - .ok() - .and_then(|img| img.id); - - let container_image_id = info.image.clone(); - - match (&desired_id, &container_image_id) { - (Some(desired), Some(current)) => desired == current, - _ => false, - } - }; - - if reuse { - // The container exists and should be reused. Its network - // attachment may be stale. When the gateway is resumed after a - // container kill, `ensure_network` destroys and recreates the - // Docker network (giving it a new ID). The stopped container - // still references the old network ID, so `docker start` would - // fail with "network not found". 
- // - // Fix: disconnect from any existing networks and reconnect to - // the current (just-created) network before returning. - let expected_net = network_name(name); - reconcile_container_network(docker, &container_name, &expected_net).await?; - - // Read the actual host port from the container's port bindings - // as a cross-check. The caller should already pass the correct - // port (from stored metadata), but this catches mismatches if - // the container was recreated with a different port externally. - let actual_port = info - .host_config - .as_ref() - .and_then(|hc| hc.port_bindings.as_ref()) - .and_then(|pb| pb.get("30051/tcp")) - .and_then(|bindings| bindings.as_ref()) - .and_then(|bindings| bindings.first()) - .and_then(|b| b.host_port.as_ref()) - .and_then(|p| p.parse::().ok()) - .unwrap_or(gateway_port); - - return Ok(actual_port); - } - - // Image changed — remove the stale container so we can recreate it. - tracing::info!( - "Container {} exists but uses a different image (container={}, desired={}), recreating", - container_name, - info.image.as_deref().map_or("unknown", truncate_id), - image_ref, - ); - - let _ = docker.stop_container(&container_name, None).await; - docker - .remove_container( - &container_name, - Some(RemoveContainerOptions { - force: true, - ..Default::default() - }), - ) - .await - .into_diagnostic() - .wrap_err("failed to remove stale container")?; - } - Err(err) if is_not_found(&err) => { - // Container does not exist — will create below - } - Err(err) => return Err(err).into_diagnostic(), - } - - let mut port_bindings = HashMap::new(); - port_bindings.insert( - "30051/tcp".to_string(), - Some(vec![PortBinding { - host_ip: Some("0.0.0.0".to_string()), - host_port: Some(gateway_port.to_string()), - }]), - ); - let exposed_ports = vec!["30051/tcp".to_string()]; - - let mut host_config = HostConfig { - privileged: Some(true), - // Use host cgroup namespace so k3s kubelet can manage cgroup controllers - // (cpu, cpuset, memory, 
pids, etc.) required for pod QoS. With cgroup v2 - // and a private cgroupns, the controllers are not delegated into the - // container's namespace, causing kubelet ContainerManager to fail. - cgroupns_mode: Some(HostConfigCgroupnsModeEnum::HOST), - port_bindings: Some(port_bindings), - binds: Some(vec![format!("{}:/var/lib/rancher/k3s", volume_name(name))]), - network_mode: Some(network_name(name)), - // Automatically restart the container when Docker restarts, unless the - // user explicitly stopped it with `gateway stop`. - restart_policy: Some(RestartPolicy { - name: Some(RestartPolicyNameEnum::UNLESS_STOPPED), - maximum_retry_count: None, - }), - // Add host gateway aliases for DNS resolution. - // This allows both the entrypoint script and the running gateway - // process to reach services on the Docker host. - extra_hosts: Some(vec![ - "host.docker.internal:host-gateway".to_string(), - "host.openshell.internal:host-gateway".to_string(), - ]), - ..Default::default() - }; - - // Inject GPU devices into the container based on the resolved device ID list. 
- // - // The list is pre-resolved by `resolve_gpu_device_ids` before reaching here: - // [] — no GPU passthrough - // ["legacy"] — internal non-CDI fallback path: `driver="nvidia"`, - // `count=-1`; relies on the NVIDIA Container Runtime hook - // [cdi-ids…] — CDI DeviceRequest (driver="cdi") with the given device IDs; - // Docker resolves them against the host CDI spec at /etc/cdi/ - match device_ids { - [] => {} - [id] if id == "legacy" => { - host_config.device_requests = Some(vec![DeviceRequest { - driver: Some("nvidia".to_string()), - count: Some(-1), // all GPUs - capabilities: Some(vec![vec![ - "gpu".to_string(), - "utility".to_string(), - "compute".to_string(), - ]]), - ..Default::default() - }]); - } - ids => { - host_config.device_requests = Some(vec![DeviceRequest { - driver: Some("cdi".to_string()), - device_ids: Some(ids.to_vec()), - ..Default::default() - }]); - } - } - - let mut cmd = vec![ - "server".to_string(), - "--disable=traefik".to_string(), - "--tls-san=127.0.0.1".to_string(), - "--tls-san=localhost".to_string(), - "--tls-san=host.docker.internal".to_string(), - ]; - for san in extra_sans { - cmd.push(format!("--tls-san={san}")); - } - - // Pass extra SANs, SSH gateway config, and registry credentials to the - // entrypoint so they can be injected into the HelmChart manifest and - // k3s registries.yaml. - let registry_host = - env_non_empty("OPENSHELL_REGISTRY_HOST").unwrap_or_else(|| DEFAULT_REGISTRY.to_string()); - let registry_namespace = env_non_empty("OPENSHELL_REGISTRY_NAMESPACE") - .unwrap_or_else(|| REGISTRY_NAMESPACE_DEFAULT.to_string()); - let image_repo_base = env_non_empty("IMAGE_REPO_BASE") - .or_else(|| env_non_empty("OPENSHELL_IMAGE_REPO_BASE")) - .unwrap_or_else(|| { - if registry_host == DEFAULT_REGISTRY { - // For ghcr.io the default namespace is the full org path. 
- DEFAULT_IMAGE_REPO_BASE.to_string() - } else { - format!("{registry_host}/{registry_namespace}") - } - }); - let registry_insecure = env_bool("OPENSHELL_REGISTRY_INSECURE").unwrap_or(false); - let registry_endpoint = env_non_empty("OPENSHELL_REGISTRY_ENDPOINT"); - - // Credential priority: - // 1. OPENSHELL_REGISTRY_USERNAME/PASSWORD env vars (power-user override) - // 2. registry_username/registry_token from CLI flags / env vars - // No built-in default — GHCR repos are public and pull without auth. - let effective_username = env_non_empty("OPENSHELL_REGISTRY_USERNAME").or_else(|| { - registry_username - .filter(|u| !u.is_empty()) - .map(ToString::to_string) - }); - let effective_password = env_non_empty("OPENSHELL_REGISTRY_PASSWORD").or_else(|| { - registry_token - .filter(|t| !t.is_empty()) - .map(ToString::to_string) - }); - - let mut env_vars: Vec = vec![ - format!("REGISTRY_MODE={REGISTRY_MODE_EXTERNAL}"), - format!("REGISTRY_HOST={registry_host}"), - format!("REGISTRY_INSECURE={registry_insecure}"), - format!("IMAGE_REPO_BASE={image_repo_base}"), - // Deterministic k3s node name so the node identity survives container - // recreation (e.g. after an image upgrade). Without this, k3s uses - // the container ID as the hostname/node name, which changes on every - // container recreate and triggers stale-node PVC cleanup. - format!("OPENSHELL_NODE_NAME={}", node_name(name)), - ]; - if let Some(endpoint) = registry_endpoint { - env_vars.push(format!("REGISTRY_ENDPOINT={endpoint}")); - } - if let Some(password) = effective_password { - // Default to __token__ when only a password/token is provided. 
- let username = effective_username.unwrap_or_else(|| "__token__".to_string()); - env_vars.push(format!("REGISTRY_USERNAME={username}")); - env_vars.push(format!("REGISTRY_PASSWORD={password}")); - } - - if !extra_sans.is_empty() { - env_vars.push(format!("EXTRA_SANS={}", extra_sans.join(","))); - } - if let Some(host) = ssh_gateway_host { - env_vars.push(format!("SSH_GATEWAY_HOST={host}")); - // The NodePort is mapped to the configured host port, so the SSH - // gateway port for remote clusters must match. - env_vars.push(format!("SSH_GATEWAY_PORT={gateway_port}")); - } - - // Pass image configuration to the cluster entrypoint. - // The effective tag is resolved from the runtime IMAGE_TAG env var (if set) - // or the compile-time default (see image::DEFAULT_IMAGE_TAG). - // When OPENSHELL_PUSH_IMAGES is set the entrypoint overrides the baked-in - // HelmChart manifest so k3s uses the locally-pushed images with - // IfNotPresent pull policy instead of pulling from the remote registry. - let push_mode = std::env::var("OPENSHELL_PUSH_IMAGES").is_ok_and(|v| !v.trim().is_empty()); - let effective_tag = std::env::var("IMAGE_TAG") - .ok() - .filter(|v| !v.trim().is_empty()) - .unwrap_or_else(|| image::DEFAULT_IMAGE_TAG.to_string()); - if push_mode { - if let Ok(images) = std::env::var("OPENSHELL_PUSH_IMAGES") - && !images.trim().is_empty() - { - env_vars.push(format!("PUSH_IMAGE_REFS={images}")); - } - env_vars.push(format!("IMAGE_TAG={effective_tag}")); - env_vars.push("IMAGE_PULL_POLICY=IfNotPresent".to_string()); - } else { - env_vars.push(format!("IMAGE_TAG={effective_tag}")); - } - - // Disable TLS: pass through to the entrypoint so the HelmChart manifest - // configures the server pod for plaintext HTTP. - if disable_tls { - env_vars.push("DISABLE_TLS=true".to_string()); - } - - // Disable gateway auth: pass through to the entrypoint so the HelmChart - // manifest sets the flag on the server pod. 
- if disable_gateway_auth { - env_vars.push("DISABLE_GATEWAY_AUTH=true".to_string()); - } - - // GPU support: tell the entrypoint to deploy the NVIDIA device plugin - // HelmChart CR so k8s workloads can request nvidia.com/gpu resources. - if !device_ids.is_empty() { - env_vars.push("GPU_ENABLED=true".to_string()); - } - - // OIDC JWT authentication: pass issuer and audience to the entrypoint - // so the HelmChart manifest configures the server pod for JWT validation. - if let Some(issuer) = oidc_issuer { - env_vars.push(format!("OIDC_ISSUER={issuer}")); - env_vars.push(format!("OIDC_AUDIENCE={oidc_audience}")); - if let Some(claim) = oidc_roles_claim { - env_vars.push(format!("OIDC_ROLES_CLAIM={claim}")); - } - if let Some(role) = oidc_admin_role { - env_vars.push(format!("OIDC_ADMIN_ROLE={role}")); - } - if let Some(role) = oidc_user_role { - env_vars.push(format!("OIDC_USER_ROLE={role}")); - } - if let Some(claim) = oidc_scopes_claim { - env_vars.push(format!("OIDC_SCOPES_CLAIM={claim}")); - } - } - - let env = Some(env_vars); - - let config = ContainerCreateBody { - image: Some(image_ref.to_string()), - // Set the container hostname to the deterministic node name. - // k3s uses the container hostname as its default node name. Without - // this, Docker defaults to the container ID (first 12 hex chars), - // which changes on every container recreation and can cause - // `clean_stale_nodes` to delete the wrong node on resume. The - // hostname persists across container stop/start cycles, ensuring a - // stable node identity. 
- hostname: Some(node_name(name)), - cmd: Some(cmd), - env, - exposed_ports: Some(exposed_ports), - host_config: Some(host_config), - ..Default::default() - }; - - docker - .create_container( - Some(CreateContainerOptions { - name: Some(container_name), - platform: String::new(), - }), - config, - ) - .await - .into_diagnostic() - .wrap_err("failed to create gateway container")?; - Ok(gateway_port) -} - -/// Information about a container that is holding a port we need. -#[derive(Debug, Clone)] -pub struct PortConflict { - /// Name of the container holding the port (without leading `/`). - pub container_name: String, - /// The host port that conflicts. - pub host_port: u16, -} - -/// Check whether any *other* running container already binds the host ports -/// that the gateway needs. Returns a list of conflicts (empty if none). -/// -/// Docker silently fails to attach networking when a port is already bound, -/// leaving the new container with only a loopback interface. Detecting this -/// up-front lets us give a clear error instead of a cryptic "no default route" -/// failure 30 seconds later. -pub async fn check_port_conflicts( - docker: &Docker, - name: &str, - gateway_port: u16, -) -> Result> { - let our_container = container_name(name); - let needed_ports: Vec = vec![gateway_port]; - - let containers = docker - .list_containers(Some( - ListContainersOptionsBuilder::new() - // Only running containers can hold port bindings. - .all(false) - .build(), - )) - .await - .into_diagnostic() - .wrap_err("failed to list containers for port conflict check")?; - - let mut conflicts = Vec::new(); - for container in &containers { - // Skip our own container (it may already exist from a previous run). 
- let names = container.names.as_deref().unwrap_or_default(); - let is_ours = names - .iter() - .any(|n| n.trim_start_matches('/') == our_container); - if is_ours { - continue; - } - - let ports = container.ports.as_deref().unwrap_or_default(); - for port in ports { - if let Some(public) = port.public_port - && needed_ports.contains(&public) - { - let cname = names.first().map_or_else( - || { - container - .id - .clone() - .unwrap_or_else(|| "".to_string()) - }, - |n| n.trim_start_matches('/').to_string(), - ); - conflicts.push(PortConflict { - container_name: cname, - host_port: public, - }); - } - } - } - Ok(conflicts) -} - -pub async fn start_container(docker: &Docker, name: &str) -> Result<()> { - let container_name = container_name(name); - - // Retry with backoff when the start fails due to a port binding conflict. - // After a container is destroyed the OS may take a moment to release the - // TCP socket, so the new container's start can transiently fail with - // "port is already allocated". 
- let max_attempts: u64 = 5; - for attempt in 1..=max_attempts { - let response = docker - .start_container(&container_name, None::) - .await; - match response { - Ok(()) => return Ok(()), - Err(err) if is_conflict(&err) => return Ok(()), - Err(ref err) if attempt < max_attempts && is_port_conflict(err) => { - tracing::debug!( - "Port conflict on start attempt {attempt}/{max_attempts}, retrying after backoff" - ); - tokio::time::sleep(std::time::Duration::from_millis(500 * attempt)).await; - } - Err(err) => { - return Err(err) - .into_diagnostic() - .wrap_err("failed to start gateway container"); - } - } - } - unreachable!() -} - -pub async fn stop_container(docker: &Docker, container_name: &str) -> Result<()> { - let response = docker.stop_container(container_name, None).await; - match response { - Ok(()) => Ok(()), - Err(err) if is_conflict(&err) => Ok(()), - Err(err) if is_not_found(&err) => Ok(()), - Err(err) => Err(err).into_diagnostic(), - } -} - -pub async fn destroy_gateway_resources(docker: &Docker, name: &str) -> Result<()> { - let container_name = container_name(name); - let volume_name = volume_name(name); - - // Capture the container's image reference before removing the container so - // we can clean it up afterwards. This prevents stale images from being - // re-used on subsequent deploys. - let container_image = docker - .inspect_container(&container_name, None::) - .await - .ok() - .and_then(|info| info.image); - - // Explicitly disconnect the container from the per-gateway network before - // removing it. This ensures Docker tears down the network endpoint - // synchronously so port bindings are released immediately and the - // subsequent network cleanup sees zero connected containers. 
- let net_name = network_name(name); - let _ = docker - .disconnect_network( - &net_name, - NetworkDisconnectRequest { - container: container_name.clone(), - force: Some(true), - }, - ) - .await; - - let _ = stop_container(docker, &container_name).await; - - let remove_container = docker - .remove_container( - &container_name, - Some(RemoveContainerOptions { - force: true, - ..Default::default() - }), - ) - .await; - if let Err(err) = remove_container - && !is_not_found(&err) - { - return Err(err).into_diagnostic(); - } - - // Remove the gateway image so the next deploy always pulls the latest - // version from the registry instead of reusing a stale local copy. - // Docker may briefly report the container as still running after a - // force-remove, so retry a few times on conflict (409) errors. - if let Some(ref image_id) = container_image { - tracing::debug!("Removing gateway image: {}", image_id); - let mut last_err = None; - for attempt in 0..5 { - if attempt > 0 { - tokio::time::sleep(std::time::Duration::from_millis(500)).await; - } - match docker - .remove_image( - image_id, - Some(RemoveImageOptions { - force: true, - noprune: true, - ..Default::default() - }), - None, - ) - .await - { - Ok(_) => { - last_err = None; - break; - } - Err(err) if is_not_found(&err) => { - last_err = None; - break; - } - Err(err) if is_conflict(&err) => { - last_err = Some(err); - } - Err(err) => { - last_err = Some(err); - break; - } - } - } - if let Some(err) = last_err { - tracing::warn!("Failed to remove gateway image {}: {}", image_id, err); - } - } - - let remove_volume = docker - .remove_volume(&volume_name, Some(RemoveVolumeOptions { force: true })) - .await; - if let Err(err) = remove_volume - && !is_not_found(&err) - { - return Err(err).into_diagnostic(); - } - - // Force-remove the per-gateway network during a full destroy. 
First - // disconnect any stale endpoints that Docker may still report (race - // between container removal and network bookkeeping), then remove the - // network itself. - force_remove_network(docker, &net_name).await?; - - Ok(()) -} - -/// Clean up the gateway container and network, preserving the persistent volume. -/// -/// Used when a resume attempt fails — we want to remove the container we may -/// have just created but keep the volume so the user can retry without losing -/// their k3s/etcd state and sandbox data. -pub async fn cleanup_gateway_container(docker: &Docker, name: &str) -> Result<()> { - let container_name = container_name(name); - let net_name = network_name(name); - - // Disconnect container from network - let _ = docker - .disconnect_network( - &net_name, - NetworkDisconnectRequest { - container: container_name.clone(), - force: Some(true), - }, - ) - .await; - - let _ = stop_container(docker, &container_name).await; - - let remove_container = docker - .remove_container( - &container_name, - Some(RemoveContainerOptions { - force: true, - ..Default::default() - }), - ) - .await; - if let Err(err) = remove_container - && !is_not_found(&err) - { - return Err(err).into_diagnostic(); - } - - force_remove_network(docker, &net_name).await?; - - Ok(()) -} - -/// Forcefully remove a Docker network, disconnecting any remaining -/// containers first. This ensures that stale Docker network endpoints -/// cannot prevent port bindings from being released. -async fn force_remove_network(docker: &Docker, net_name: &str) -> Result<()> { - let network = match docker - .inspect_network(net_name, None::) - .await - { - Ok(info) => info, - Err(err) if is_not_found(&err) => return Ok(()), - Err(err) => return Err(err).into_diagnostic(), - }; - - // Disconnect any containers still attached to the network. 
- if let Some(containers) = network.containers { - for (id, _) in containers { - let _ = docker - .disconnect_network( - net_name, - NetworkDisconnectRequest { - container: id, - force: Some(true), - }, - ) - .await; - } - } - - match docker.remove_network(net_name).await { - Ok(()) => Ok(()), - Err(err) if is_not_found(&err) => Ok(()), - Err(err) => Err(err) - .into_diagnostic() - .wrap_err("failed to remove Docker network"), - } -} - -/// Ensure a stopped container is connected to the expected Docker network. -/// -/// When a gateway is resumed after the container was killed (but not removed), -/// `ensure_network` destroys and recreates the network with a new ID. The -/// stopped container still holds a reference to the old network ID in its -/// config, so `docker start` would fail with a 404 "network not found" error. -/// -/// This function disconnects the container from any networks that no longer -/// match the expected network name and connects it to the correct one. -async fn reconcile_container_network( - docker: &Docker, - container_name: &str, - expected_network: &str, -) -> Result<()> { - let info = docker - .inspect_container(container_name, None::) - .await - .into_diagnostic() - .wrap_err("failed to inspect container for network reconciliation")?; - - // Check the container's current network attachments via NetworkSettings. - let attached_networks: Vec = info - .network_settings - .as_ref() - .and_then(|ns| ns.networks.as_ref()) - .map(|nets| nets.keys().cloned().collect()) - .unwrap_or_default(); - - // If the container is already attached to the expected network (by name), - // Docker will resolve the name to the current network ID on start. - // However, when the network was destroyed and recreated, the container's - // stored endpoint references the old ID. Disconnect and reconnect to - // pick up the new network ID. 
- for net_name in &attached_networks { - let _ = docker - .disconnect_network( - net_name, - NetworkDisconnectRequest { - container: container_name.to_string(), - force: Some(true), - }, - ) - .await; - } - - // Connect to the (freshly created) expected network. - docker - .connect_network( - expected_network, - NetworkConnectRequest { - container: container_name.to_string(), - endpoint_config: Some(EndpointSettings::default()), - }, - ) - .await - .into_diagnostic() - .wrap_err("failed to connect container to gateway network")?; - - tracing::debug!( - "Reconciled network for container {container_name}: disconnected from {attached_networks:?}, connected to {expected_network}" - ); - - Ok(()) -} - -fn is_not_found(err: &BollardError) -> bool { - matches!( - err, - BollardError::DockerResponseServerError { - status_code: 404, - .. - } - ) -} - -/// Check whether a container is still running. -/// Returns `Ok(())` if running, or an `Err` with the exit status if the container has stopped. -pub async fn check_container_running(docker: &Docker, container_name: &str) -> Result<()> { - let inspect = docker - .inspect_container(container_name, None::) - .await - .into_diagnostic() - .wrap_err("failed to inspect container")?; - - let state = inspect.state.as_ref(); - let running = state.and_then(|s| s.running).unwrap_or(false); - if running { - return Ok(()); - } - - let status = state - .and_then(|s| s.status.as_ref()) - .map_or_else(|| "unknown".to_string(), |s| format!("{s:?}")); - let exit_code = state.and_then(|s| s.exit_code).unwrap_or(-1); - let error_msg = state.and_then(|s| s.error.as_deref()).unwrap_or(""); - let oom = state.and_then(|s| s.oom_killed).unwrap_or(false); - - let mut detail = format!("container exited (status={status}, exit_code={exit_code})"); - if !error_msg.is_empty() { - use std::fmt::Write; - let _ = write!(detail, ", error={error_msg}"); - } - if oom { - detail.push_str(", OOMKilled=true"); - } - - Err(miette::miette!(detail)) -} - -/// Truncate 
an image ID for display (e.g., `sha256:abcdef1234...` -> `sha256:abcdef1234ab`). -fn truncate_id(id: &str) -> &str { - const DISPLAY_LEN: usize = "sha256:".len() + 12; - if id.len() > DISPLAY_LEN { - &id[..DISPLAY_LEN] - } else { - id - } -} - -/// Information about an existing gateway deployment. -#[derive(Debug, Clone)] -pub struct ExistingGatewayInfo { - /// Whether the container exists. - pub container_exists: bool, - /// Whether the container is currently running. - pub container_running: bool, - /// Whether the persistent volume exists. - pub volume_exists: bool, - /// The image used by the existing container (if any). - pub container_image: Option, -} - -/// Check whether a gateway with the given name already exists. -/// -/// Returns `None` if no gateway resources exist, or `Some(info)` with -/// details about the existing deployment. -pub async fn check_existing_gateway( - docker: &Docker, - name: &str, -) -> Result> { - let container_name = container_name(name); - let vol_name = volume_name(name); - - let volume_exists = match docker.inspect_volume(&vol_name).await { - Ok(_) => true, - Err(err) if is_not_found(&err) => false, - Err(err) => return Err(err).into_diagnostic(), - }; - - let (container_exists, container_running, container_image) = match docker - .inspect_container(&container_name, None::) - .await - { - Ok(info) => { - let running = info.state.as_ref().and_then(|s| s.running).unwrap_or(false); - let image = info.config.and_then(|c| c.image); - (true, running, image) - } - Err(err) if is_not_found(&err) => (false, false, None), - Err(err) => return Err(err).into_diagnostic(), - }; - - if !container_exists && !volume_exists { - return Ok(None); - } - - Ok(Some(ExistingGatewayInfo { - container_exists, - container_running, - volume_exists, - container_image, - })) -} - -fn is_conflict(err: &BollardError) -> bool { - matches!( - err, - BollardError::DockerResponseServerError { - status_code: 409, - .. 
- } - ) -} - -/// Detect Docker "port is already allocated" errors that can occur transiently -/// after a container using the same port was just destroyed. -fn is_port_conflict(err: &BollardError) -> bool { - matches!( - err, - BollardError::DockerResponseServerError { - status_code: 500, - message, - .. - } if message.contains("port is already allocated") - ) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn normalize_arch_x86_64() { - assert_eq!(normalize_arch("x86_64"), "amd64"); - } - - #[test] - fn normalize_arch_aarch64() { - assert_eq!(normalize_arch("aarch64"), "arm64"); - } - - #[test] - fn normalize_arch_passthrough_amd64() { - assert_eq!(normalize_arch("amd64"), "amd64"); - } - - #[test] - fn normalize_arch_passthrough_arm64() { - assert_eq!(normalize_arch("arm64"), "arm64"); - } - - #[test] - fn normalize_arch_uppercase() { - assert_eq!(normalize_arch("ARM64"), "arm64"); - } - - #[test] - fn host_platform_string() { - let platform = HostPlatform { - arch: "arm64".to_string(), - os: "linux".to_string(), - }; - assert_eq!(platform.platform_string(), "linux/arm64"); - } - - #[test] - fn docker_not_reachable_error_no_socket_no_docker_host() { - // Simulate: no socket at default path, no DOCKER_HOST set. - // We can't guarantee /var/run/docker.sock state in CI, but we can - // verify the error message is well-formed and contains guidance. - let err = - docker_not_reachable_error("connection refused", "Failed to create Docker client"); - let msg = format!("{err:?}"); - assert!( - msg.contains("Failed to create Docker client"), - "should include the summary" - ); - assert!( - msg.contains("connection refused"), - "should include the raw error" - ); - // The message should always include the verification step - assert!( - msg.contains("docker info"), - "should suggest 'docker info' verification" - ); - } - - // Test-only: mutates DOCKER_HOST env var via std::env::set_var/remove_var, - // which require unsafe in the 2024 edition. 
- #[allow(unsafe_code)] - #[test] - fn docker_not_reachable_error_with_docker_host() { - // Simulate: DOCKER_HOST is set but daemon unresponsive. - // We set the env var temporarily (this is test-only). - let prev_docker_host = std::env::var("DOCKER_HOST").ok(); - // SAFETY: test-only, single-threaded test runner for this test - unsafe { - std::env::set_var("DOCKER_HOST", "unix:///tmp/fake-docker.sock"); - } - - let err = docker_not_reachable_error( - "daemon not responding", - "Docker socket exists but the daemon is not responding", - ); - let msg = format!("{err:?}"); - - // Restore env - // SAFETY: test-only, restoring previous state - unsafe { - match prev_docker_host { - Some(val) => std::env::set_var("DOCKER_HOST", val), - None => std::env::remove_var("DOCKER_HOST"), - } - } - - assert!( - msg.contains("DOCKER_HOST"), - "should mention DOCKER_HOST when it is set" - ); - assert!( - msg.contains("unix:///tmp/fake-docker.sock"), - "should show the current DOCKER_HOST value" - ); - } - - #[test] - fn find_alternative_sockets_returns_vec() { - // Verify the function runs without panic and returns a vec. - // Exact contents depend on the host system, so we just check the type. 
- let sockets = find_alternative_sockets(); - // On any system, /var/run/docker.sock may or may not exist - assert!( - sockets.len() <= 10, - "should return a reasonable number of sockets" - ); - } - - // --- resolve_gpu_device_ids --- - - #[test] - fn resolve_gpu_empty_returns_empty() { - assert_eq!(resolve_gpu_device_ids(&[], true), Vec::::new()); - assert_eq!(resolve_gpu_device_ids(&[], false), Vec::::new()); - } - - #[test] - fn resolve_gpu_auto_cdi_enabled() { - assert_eq!( - resolve_gpu_device_ids(&["auto".to_string()], true), - vec!["nvidia.com/gpu=all"], - ); - } - - #[test] - fn resolve_gpu_auto_cdi_disabled() { - assert_eq!( - resolve_gpu_device_ids(&["auto".to_string()], false), - vec!["legacy"], - ); - } - - #[test] - fn resolve_gpu_legacy_passthrough() { - assert_eq!( - resolve_gpu_device_ids(&["legacy".to_string()], true), - vec!["legacy"], - ); - assert_eq!( - resolve_gpu_device_ids(&["legacy".to_string()], false), - vec!["legacy"], - ); - } - - #[test] - fn resolve_gpu_cdi_ids_passthrough() { - let ids = vec!["nvidia.com/gpu=all".to_string()]; - assert_eq!(resolve_gpu_device_ids(&ids, true), ids); - assert_eq!(resolve_gpu_device_ids(&ids, false), ids); - - let multi = vec![ - "nvidia.com/gpu=0".to_string(), - "nvidia.com/gpu=1".to_string(), - ]; - assert_eq!(resolve_gpu_device_ids(&multi, true), multi); - } -} diff --git a/crates/openshell-bootstrap/src/errors.rs b/crates/openshell-bootstrap/src/errors.rs deleted file mode 100644 index 8a6ccf5bd..000000000 --- a/crates/openshell-bootstrap/src/errors.rs +++ /dev/null @@ -1,1038 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! Gateway error detection and user-friendly guidance. -//! -//! This module analyzes error messages and container logs to detect known -//! failure patterns and provide actionable recovery guidance. - -/// A diagnosed gateway failure with user-friendly guidance. 
-#[derive(Debug, Clone)] -pub struct GatewayFailureDiagnosis { - /// Short summary of what went wrong. - pub summary: String, - /// Detailed explanation of the issue. - pub explanation: String, - /// Commands or steps the user can take to fix the issue. - pub recovery_steps: Vec, - /// Whether the issue might be auto-recoverable by retrying. - pub retryable: bool, -} - -/// A recovery step with a command and description. -#[derive(Debug, Clone)] -pub struct RecoveryStep { - /// Description of what this step does. - pub description: String, - /// Command to run (if applicable). - pub command: Option, -} - -impl RecoveryStep { - fn new(description: impl Into) -> Self { - Self { - description: description.into(), - command: None, - } - } - - fn with_command(description: impl Into, command: impl Into) -> Self { - Self { - description: description.into(), - command: Some(command.into()), - } - } -} - -/// How multiple matchers should be combined. -#[derive(Debug, Clone, Copy, Default)] -enum MatchMode { - /// Match if ANY of the matchers is found (default). - #[default] - Any, - /// Match only if ALL of the matchers are found. - All, -} - -/// Known failure patterns and their detection logic. -struct FailurePattern { - /// Patterns to match in error message or logs. - matchers: &'static [&'static str], - /// How to combine multiple matchers (default: Any). - match_mode: MatchMode, - /// Function to generate diagnosis. 
- diagnose: fn(gateway_name: &str) -> GatewayFailureDiagnosis, -} - -const FAILURE_PATTERNS: &[FailurePattern] = &[ - // Corrupted cluster state / RBAC issues - FailurePattern { - matchers: &[ - "extension-apiserver-authentication", - "cannot get resource", - "is forbidden", - ], - match_mode: MatchMode::Any, - diagnose: diagnose_corrupted_state, - }, - // No default route (Docker networking) - FailurePattern { - matchers: &["no default route present"], - match_mode: MatchMode::Any, - diagnose: diagnose_no_default_route, - }, - // Port already in use - FailurePattern { - matchers: &["port is already allocated", "address already in use"], - match_mode: MatchMode::Any, - diagnose: diagnose_port_conflict, - }, - // Image pull failures (auth/registry issues) - FailurePattern { - matchers: &[ - "pull access denied", - "image not found", - "manifest unknown", - "unauthorized to access repository", - "denied: access forbidden", - ], - match_mode: MatchMode::Any, - diagnose: diagnose_image_pull_auth_failure, - }, - // k3s internal DNS proxy failure (must be before general network connectivity) - // This happens when the k3s cluster starts but its internal DNS proxy can't resolve - // external names, causing all image pulls to fail with "Try again" DNS errors. - // The pattern "Try again" is a DNS EAGAIN error indicating temporary failure. - // IMPORTANT: Both patterns must match to distinguish from other network issues. 
- FailurePattern { - matchers: &["dial tcp: lookup", "Try again"], - match_mode: MatchMode::All, - diagnose: diagnose_k3s_dns_proxy_failure, - }, - // Network connectivity issues (DNS, timeouts, unreachable) - FailurePattern { - matchers: &[ - "no such host", - "i/o timeout", - "network is unreachable", - "connection refused", - "connection reset by peer", - "TLS handshake timeout", - "dial tcp", - "lookup ghcr.io", - "lookup registry", - "no route to host", - "temporary failure in name resolution", - ], - match_mode: MatchMode::Any, - diagnose: diagnose_network_connectivity, - }, - // OOM killed - FailurePattern { - matchers: &["exit_code=137", "OOMKilled"], - match_mode: MatchMode::Any, - diagnose: diagnose_oom_killed, - }, - // Node resource pressure (DiskPressure, MemoryPressure, PIDPressure) - FailurePattern { - matchers: &["HEALTHCHECK_NODE_PRESSURE"], - match_mode: MatchMode::Any, - diagnose: diagnose_node_pressure, - }, - // Missing sandbox supervisor binary - FailurePattern { - matchers: &["HEALTHCHECK_MISSING_SUPERVISOR"], - match_mode: MatchMode::Any, - diagnose: diagnose_missing_supervisor, - }, - // TLS/certificate issues - FailurePattern { - matchers: &[ - "certificate has expired", - "x509: certificate", - "tls: failed to verify", - ], - match_mode: MatchMode::Any, - diagnose: diagnose_certificate_issue, - }, - // Docker daemon not running or socket not found - FailurePattern { - matchers: &[ - "Cannot connect to the Docker daemon", - "docker daemon is not running", - "Is the docker daemon running", - "Socket not found", - "No such file or directory", - "Failed to create Docker client", - "Docker socket exists but the daemon is not responding", - ], - match_mode: MatchMode::Any, - diagnose: diagnose_docker_not_running, - }, - // CDI specs missing — Docker daemon has CDI configured but no spec files exist - // or the requested device ID (nvidia.com/gpu=all) is not in any spec. - // Matches errors from Docker 25+ and containerd CDI injection paths. 
- FailurePattern { - matchers: &[ - "CDI device not found", - "unknown CDI device", - "failed to inject CDI devices", - "no CDI devices found", - "CDI device injection failed", - "unresolvable CDI devices", - ], - match_mode: MatchMode::Any, - diagnose: diagnose_cdi_specs_missing, - }, -]; - -fn diagnose_corrupted_state(gateway_name: &str) -> GatewayFailureDiagnosis { - GatewayFailureDiagnosis { - summary: "Corrupted cluster state".to_string(), - explanation: "The gateway cluster has corrupted internal state, likely from a previous \ - interrupted startup or unclean shutdown. Resources from the failed deploy have been \ - automatically cleaned up." - .to_string(), - recovery_steps: vec![ - RecoveryStep::new("Retry the gateway start (cleanup was automatic)"), - RecoveryStep::with_command( - "If the retry fails, manually destroy and recreate", - format!( - "openshell gateway destroy --name {gateway_name} && openshell gateway start" - ), - ), - ], - retryable: true, - } -} - -fn diagnose_no_default_route(_gateway_name: &str) -> GatewayFailureDiagnosis { - GatewayFailureDiagnosis { - summary: "Docker networking issue".to_string(), - explanation: "The gateway container has no network route. This can happen when \ - another container is already bound to the same host port (Docker silently \ - skips network attachment), or due to stale Docker networks." 
- .to_string(), - recovery_steps: vec![ - RecoveryStep::with_command( - "Check for containers using the same port", - "docker ps --format '{{.Names}}\\t{{.Ports}}'", - ), - RecoveryStep::new( - "Stop any container holding the gateway port (default 8080), then retry", - ), - RecoveryStep::with_command("Prune unused Docker networks", "docker network prune -f"), - RecoveryStep::new("Restart your Docker runtime"), - RecoveryStep::new("Then retry: openshell gateway start"), - ], - retryable: true, - } -} - -fn diagnose_port_conflict(_gateway_name: &str) -> GatewayFailureDiagnosis { - GatewayFailureDiagnosis { - summary: "Port already in use".to_string(), - explanation: "The gateway port is already in use by another process or container." - .to_string(), - recovery_steps: vec![ - RecoveryStep::with_command( - "Check what's using the port", - "lsof -i :8080 || netstat -an | grep 8080", - ), - RecoveryStep::with_command( - "Use a different port", - "openshell gateway start --port 8081", - ), - RecoveryStep::with_command( - "Or stop other openshell gateways", - "openshell gateway list && openshell gateway destroy --name ", - ), - ], - retryable: false, - } -} - -fn diagnose_image_pull_auth_failure(_gateway_name: &str) -> GatewayFailureDiagnosis { - GatewayFailureDiagnosis { - summary: "Registry authentication failed".to_string(), - explanation: "Could not authenticate with the container registry. The image may not \ - exist, or you may not have permission to access it. Public GHCR repos \ - should not require authentication — if you see this error with the default \ - registry, it may indicate the image does not exist or a network issue." 
- .to_string(), - recovery_steps: vec![ - RecoveryStep::with_command( - "Verify the image exists and you have access", - "docker pull ghcr.io/nvidia/openshell/cluster:latest", - ), - RecoveryStep::new( - "If using a private registry, set OPENSHELL_REGISTRY_USERNAME and OPENSHELL_REGISTRY_TOKEN \ - (or use --registry-username and --registry-token)", - ), - RecoveryStep::with_command("Check your Docker login", "docker login ghcr.io"), - ], - retryable: false, - } -} - -fn diagnose_k3s_dns_proxy_failure(gateway_name: &str) -> GatewayFailureDiagnosis { - GatewayFailureDiagnosis { - summary: "Cluster DNS resolution failed".to_string(), - explanation: "The gateway cluster started but its internal DNS proxy cannot resolve \ - external hostnames. Docker's embedded DNS inside the container cannot reach \ - an upstream resolver. This is typically caused by Docker not being configured \ - with the host's DNS servers, stale Docker networking state, or (on Desktop) \ - DNS configuration issues." - .to_string(), - recovery_steps: vec![ - RecoveryStep::with_command( - "Check your host's DNS servers", - "resolvectl status | grep 'DNS Servers' -A2", - ), - RecoveryStep::with_command( - "Configure Docker to use those DNS servers \ - (add to /etc/docker/daemon.json, then restart Docker)", - "echo '{\"dns\": [\"\"]}' | sudo tee /etc/docker/daemon.json \ - && sudo systemctl restart docker", - ), - RecoveryStep::with_command("Prune Docker networks", "docker network prune -f"), - RecoveryStep::with_command( - "Destroy and recreate the gateway", - format!( - "openshell gateway destroy --name {gateway_name} && openshell gateway start" - ), - ), - ], - retryable: true, - } -} - -fn diagnose_network_connectivity(_gateway_name: &str) -> GatewayFailureDiagnosis { - GatewayFailureDiagnosis { - summary: "Network connectivity issue".to_string(), - explanation: "Could not reach the container registry. 
This could be a DNS resolution \ - failure, firewall blocking the connection, or general internet connectivity issue." - .to_string(), - recovery_steps: vec![ - RecoveryStep::new("Check your internet connection"), - RecoveryStep::with_command("Test DNS resolution", "nslookup ghcr.io"), - RecoveryStep::with_command("Test registry connectivity", "curl -I https://ghcr.io/v2/"), - RecoveryStep::new( - "If behind a corporate firewall/proxy, ensure Docker is configured to use it", - ), - RecoveryStep::new("Restart Docker and try again"), - ], - retryable: true, - } -} - -fn diagnose_oom_killed(_gateway_name: &str) -> GatewayFailureDiagnosis { - GatewayFailureDiagnosis { - summary: "Container killed due to memory limits".to_string(), - explanation: "The gateway container was killed because it exceeded memory limits. \ - The gateway requires at least 4GB of memory." - .to_string(), - recovery_steps: vec![ - RecoveryStep::new("Increase Docker memory allocation to at least 4GB"), - RecoveryStep::new("Close other memory-intensive applications"), - RecoveryStep::new("Then retry: openshell gateway start"), - ], - retryable: false, - } -} - -fn diagnose_node_pressure(gateway_name: &str) -> GatewayFailureDiagnosis { - GatewayFailureDiagnosis { - summary: "Node under resource pressure".to_string(), - explanation: "The cluster node is reporting a resource pressure condition \ - (DiskPressure, MemoryPressure, or PIDPressure). When a node is under \ - pressure the kubelet evicts running pods and rejects new pod scheduling, \ - so the gateway will never become healthy until the pressure is resolved." 
- .to_string(), - recovery_steps: vec![ - RecoveryStep::with_command("Check available disk space on the host", "df -h /"), - RecoveryStep::with_command( - "Free disk space by pruning unused Docker resources", - "docker system prune -a --volumes", - ), - RecoveryStep::with_command("Check available memory on the host", "free -h"), - RecoveryStep::new("Increase Docker resource allocation or free resources on the host"), - RecoveryStep::with_command( - "Destroy and recreate the gateway after freeing resources", - format!( - "openshell gateway destroy --name {gateway_name} && openshell gateway start" - ), - ), - ], - retryable: false, - } -} - -fn diagnose_missing_supervisor(gateway_name: &str) -> GatewayFailureDiagnosis { - GatewayFailureDiagnosis { - summary: "Sandbox supervisor binary missing from cluster image".to_string(), - explanation: "The sandbox supervisor binary (/opt/openshell/bin/openshell-sandbox) \ - was not found in the gateway container. This binary is side-loaded into every \ - sandbox pod via a hostPath volume mount. Without it, all sandbox pods will \ - crash immediately with \"no such file or directory\". This typically means the \ - cluster image was built or published without the staged prebuilt openshell-sandbox binary." 
- .to_string(), - recovery_steps: vec![ - RecoveryStep::with_command( - "Rebuild the cluster image with the supervisor binary included", - "mise run docker:build:cluster", - ), - RecoveryStep::with_command( - "Destroy and recreate the gateway with the updated image", - format!( - "openshell gateway destroy --name {gateway_name} && openshell gateway start" - ), - ), - RecoveryStep::new( - "Or set OPENSHELL_CLUSTER_IMAGE to a cluster image version that includes \ - the supervisor binary", - ), - ], - retryable: false, - } -} - -fn diagnose_certificate_issue(gateway_name: &str) -> GatewayFailureDiagnosis { - GatewayFailureDiagnosis { - summary: "TLS certificate issue".to_string(), - explanation: "There's a problem with the gateway's TLS certificates, possibly expired \ - or mismatched certificates from a previous installation." - .to_string(), - recovery_steps: vec![RecoveryStep::with_command( - "Destroy and recreate the gateway to regenerate certificates", - format!("openshell gateway destroy --name {gateway_name} && openshell gateway start"), - )], - retryable: false, - } -} - -fn diagnose_cdi_specs_missing(_gateway_name: &str) -> GatewayFailureDiagnosis { - GatewayFailureDiagnosis { - summary: "CDI specs not found on host".to_string(), - explanation: "GPU passthrough via CDI was selected (your Docker daemon has CDI spec \ - directories configured) but no CDI device specs were found on the host. \ - Specs must be pre-generated before OpenShell can inject the GPU into the \ - cluster container." 
- .to_string(), - recovery_steps: vec![ - RecoveryStep::with_command( - "Generate CDI specs on the host (nvidia-ctk creates /etc/cdi/ if it does not exist)", - "sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml", - ), - RecoveryStep::with_command( - "Verify the specs were generated and include nvidia.com/gpu entries", - "nvidia-ctk cdi list", - ), - RecoveryStep::new("Then retry: openshell gateway start --gpu"), - ], - retryable: false, - } -} - -fn diagnose_docker_not_running(_gateway_name: &str) -> GatewayFailureDiagnosis { - GatewayFailureDiagnosis { - summary: "Docker is not running".to_string(), - explanation: "The Docker daemon is not running or not accessible. OpenShell requires \ - a Docker-compatible container runtime to manage gateway clusters." - .to_string(), - recovery_steps: vec![ - RecoveryStep::new("Start your Docker runtime"), - RecoveryStep::with_command("Verify Docker is accessible", "docker info"), - RecoveryStep::new( - "If using a non-default Docker socket, set DOCKER_HOST:\n \ - export DOCKER_HOST=unix:///var/run/docker.sock", - ), - RecoveryStep::new("Then retry: openshell gateway start"), - ], - retryable: true, - } -} - -/// Analyze an error message and container logs to diagnose the failure. -/// -/// Returns `Some(diagnosis)` if a known pattern is detected, `None` otherwise. -pub fn diagnose_failure( - gateway_name: &str, - error_message: &str, - container_logs: Option<&str>, -) -> Option { - let combined = container_logs.map_or_else( - || error_message.to_string(), - |logs| format!("{error_message}\n{logs}"), - ); - - for pattern in FAILURE_PATTERNS { - let matches = match pattern.match_mode { - MatchMode::Any => pattern.matchers.iter().any(|m| combined.contains(m)), - MatchMode::All => pattern.matchers.iter().all(|m| combined.contains(m)), - }; - if matches { - return Some((pattern.diagnose)(gateway_name)); - } - } - - None -} - -/// Create a generic diagnosis when no specific pattern is matched. 
-pub fn generic_failure_diagnosis(gateway_name: &str) -> GatewayFailureDiagnosis { - GatewayFailureDiagnosis { - summary: "Gateway failed to start".to_string(), - explanation: "The gateway encountered an unexpected error during startup.".to_string(), - recovery_steps: vec![ - RecoveryStep::with_command( - "Check container logs for details", - format!("openshell doctor logs --name {gateway_name}"), - ), - RecoveryStep::with_command( - "Run diagnostics", - format!("openshell doctor check --name {gateway_name}"), - ), - RecoveryStep::with_command( - "Try destroying and recreating the gateway", - format!( - "openshell gateway destroy --name {gateway_name} && openshell gateway start" - ), - ), - RecoveryStep::new( - "If the issue persists, report it at https://github.com/nvidia/openshell/issues", - ), - ], - retryable: false, - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_diagnose_corrupted_state() { - let diagnosis = diagnose_failure( - "test", - "K8s namespace not ready", - Some("configmaps \"extension-apiserver-authentication\" is forbidden"), - ); - assert!(diagnosis.is_some()); - let d = diagnosis.unwrap(); - assert!(d.summary.contains("Corrupted")); - } - - #[test] - fn test_diagnose_corrupted_state_is_retryable_after_auto_cleanup() { - // After the auto-cleanup fix (#463), corrupted state errors should be - // marked retryable because deploy_gateway_with_logs now automatically - // cleans up Docker resources on failure. 
- let d = diagnose_failure( - "mygw", - "K8s namespace not ready", - Some("configmaps \"extension-apiserver-authentication\" is forbidden"), - ) - .expect("should match corrupted state pattern"); - assert!( - d.retryable, - "corrupted state should be retryable after auto-cleanup" - ); - assert!( - d.explanation.contains("automatically cleaned up"), - "explanation should mention automatic cleanup, got: {}", - d.explanation - ); - } - - #[test] - fn test_diagnose_corrupted_state_recovery_no_manual_volume_rm() { - // The recovery steps should no longer include a manual docker volume rm - // command, since cleanup is now automatic. The first step should tell - // the user to simply retry. - let d = diagnose_failure("mygw", "cannot get resource \"namespaces\"", None) - .expect("should match corrupted state pattern"); - - let all_commands: Vec = d - .recovery_steps - .iter() - .filter_map(|s| s.command.clone()) - .collect(); - let all_commands_joined = all_commands.join(" "); - - assert!( - !all_commands_joined.contains("docker volume rm"), - "recovery steps should not include manual docker volume rm, got: {all_commands_joined}" - ); - - // First step should be a description-only step (no command) about retrying - assert!( - d.recovery_steps[0].command.is_none(), - "first recovery step should be description-only (automatic cleanup)" - ); - assert!( - d.recovery_steps[0] - .description - .contains("cleanup was automatic"), - "first recovery step should mention automatic cleanup" - ); - } - - #[test] - fn test_diagnose_corrupted_state_fallback_step_includes_gateway_name() { - // The fallback recovery step should interpolate the gateway name so - // users can copy-paste the command. 
- let d = diagnose_failure("my-gateway", "is forbidden", None) - .expect("should match corrupted state pattern"); - - assert!( - d.recovery_steps.len() >= 2, - "should have at least 2 recovery steps" - ); - let fallback = &d.recovery_steps[1]; - let cmd = fallback - .command - .as_deref() - .expect("fallback step should have a command"); - assert!( - cmd.contains("my-gateway"), - "fallback command should contain gateway name, got: {cmd}" - ); - assert!( - cmd.contains("openshell gateway destroy"), - "fallback command should include gateway destroy, got: {cmd}" - ); - } - - #[test] - fn test_diagnose_no_default_route() { - let diagnosis = diagnose_failure( - "test", - "container exited with code 1", - Some("Error: no default route present before starting k3s"), - ); - assert!(diagnosis.is_some()); - let d = diagnosis.unwrap(); - assert!(d.summary.contains("networking")); - } - - #[test] - fn test_diagnose_port_conflict() { - let diagnosis = diagnose_failure("test", "port is already allocated", None); - assert!(diagnosis.is_some()); - let d = diagnosis.unwrap(); - assert!(d.summary.contains("Port")); - } - - #[test] - fn test_no_match_returns_none() { - let diagnosis = diagnose_failure("test", "some random error", Some("random logs")); - assert!(diagnosis.is_none()); - } - - #[test] - fn test_diagnose_k3s_dns_proxy_failure_both_patterns() { - // Should match when BOTH patterns are present - let diagnosis = diagnose_failure( - "test", - "failed to pull image", - Some("dial tcp: lookup registry-1.docker.io: Try again"), - ); - assert!(diagnosis.is_some()); - let d = diagnosis.unwrap(); - assert!(d.summary.contains("DNS")); - assert!(d.retryable); - } - - #[test] - fn test_diagnose_k3s_dns_proxy_failure_requires_both_patterns() { - // Should NOT match with only "dial tcp: lookup" (falls through to network connectivity) - let diagnosis = diagnose_failure( - "test", - "failed to pull image", - Some("dial tcp: lookup registry-1.docker.io: connection refused"), - ); - 
assert!(diagnosis.is_some()); - let d = diagnosis.unwrap(); - // Should match the general network connectivity pattern, not k3s DNS - assert!(d.summary.contains("Network connectivity")); - - // Should NOT match with only "Try again" (no match at all since it's too generic) - let diagnosis = diagnose_failure("test", "Try again later", None); - assert!(diagnosis.is_none()); - } - - #[test] - fn test_diagnose_node_pressure_disk() { - let diagnosis = diagnose_failure( - "test", - "HEALTHCHECK_NODE_PRESSURE: DiskPressure\n\ - The cluster node is under resource pressure.", - None, - ); - assert!(diagnosis.is_some()); - let d = diagnosis.unwrap(); - assert!( - d.summary.contains("pressure"), - "expected pressure diagnosis, got: {}", - d.summary - ); - assert!(!d.retryable); - } - - #[test] - fn test_diagnose_node_pressure_from_container_logs() { - let diagnosis = diagnose_failure( - "test", - "gateway health check reported unhealthy", - Some("HEALTHCHECK_NODE_PRESSURE: MemoryPressure"), - ); - assert!(diagnosis.is_some()); - let d = diagnosis.unwrap(); - assert!( - d.summary.contains("pressure"), - "expected pressure diagnosis, got: {}", - d.summary - ); - } - - #[test] - fn test_diagnose_docker_not_running() { - let diagnosis = diagnose_failure("test", "Cannot connect to the Docker daemon", None); - assert!(diagnosis.is_some()); - let d = diagnosis.unwrap(); - assert!(d.summary.contains("Docker")); - assert!(d.retryable); - } - - #[test] - fn test_diagnose_docker_socket_not_found() { - let diagnosis = diagnose_failure("test", "Socket not found: /var/run/docker.sock", None); - assert!(diagnosis.is_some()); - let d = diagnosis.unwrap(); - assert!(d.summary.contains("Docker")); - assert!(d.retryable); - } - - #[test] - fn test_diagnose_docker_no_such_file() { - let diagnosis = diagnose_failure("test", "No such file or directory (os error 2)", None); - assert!(diagnosis.is_some()); - let d = diagnosis.unwrap(); - assert!(d.summary.contains("Docker")); - } - - #[test] - fn 
test_diagnose_docker_preflight_error() { - let diagnosis = diagnose_failure( - "test", - "Failed to create Docker client.\n\n connection error", - None, - ); - assert!(diagnosis.is_some()); - let d = diagnosis.unwrap(); - assert!(d.summary.contains("Docker")); - assert!(d.retryable); - } - - #[test] - fn test_diagnose_docker_recovery_mentions_docker_host() { - let diagnosis = diagnose_failure("test", "Cannot connect to the Docker daemon", None); - let d = diagnosis.unwrap(); - let steps_text: String = d - .recovery_steps - .iter() - .map(|s| s.description.clone()) - .collect::>() - .join(" "); - assert!( - steps_text.contains("DOCKER_HOST"), - "recovery steps should mention DOCKER_HOST" - ); - } - - #[test] - fn test_diagnose_dns_failure_from_namespace_timeout() { - // When wait_for_namespace detects DNS failure, the error message itself - // (not container logs) contains the DNS markers. The diagnose_failure - // function must match these from the error_message parameter alone, - // since container_logs may be None in this path. 
- let diagnosis = diagnose_failure( - "test", - "K8s namespace not ready\n\nCaused by:\n dial tcp: lookup registry: Try again\n DNS resolution is failing inside the gateway container.", - None, - ); - assert!(diagnosis.is_some()); - let d = diagnosis.unwrap(); - assert!( - d.summary.contains("DNS"), - "expected DNS diagnosis, got: {}", - d.summary - ); - assert!(d.retryable); - } - - // -- generic_failure_diagnosis tests -- - - #[test] - fn generic_diagnosis_suggests_doctor_logs() { - let d = generic_failure_diagnosis("my-gw"); - let commands: Vec = d - .recovery_steps - .iter() - .filter_map(|s| s.command.clone()) - .collect(); - assert!( - commands.iter().any(|c| c.contains("openshell doctor logs")), - "expected 'openshell doctor logs' in recovery commands, got: {commands:?}" - ); - } - - #[test] - fn generic_diagnosis_suggests_doctor_check() { - let d = generic_failure_diagnosis("my-gw"); - let commands: Vec = d - .recovery_steps - .iter() - .filter_map(|s| s.command.clone()) - .collect(); - assert!( - commands - .iter() - .any(|c| c.contains("openshell doctor check")), - "expected 'openshell doctor check' in recovery commands, got: {commands:?}" - ); - } - - #[test] - fn generic_diagnosis_includes_gateway_name() { - let d = generic_failure_diagnosis("custom-name"); - let all_text: String = d - .recovery_steps - .iter() - .filter_map(|s| s.command.clone()) - .collect::>() - .join(" "); - assert!( - all_text.contains("custom-name"), - "expected gateway name in recovery commands, got: {all_text}" - ); - } - - // -- fallback behavior tests -- - - #[test] - fn namespace_timeout_without_logs_returns_none() { - // This is the most common user-facing error: a plain timeout with only - // kubectl output. It must NOT match any specific pattern so the caller - // can fall back to generic_failure_diagnosis. 
- let diagnosis = diagnose_failure( - "test", - "K8s namespace not ready\n\nCaused by:\n \ - timed out waiting for namespace 'openshell' to exist: \ - error: the server doesn't have a resource type \"namespace\"", - None, - ); - assert!( - diagnosis.is_none(), - "plain namespace timeout should not match any specific pattern, got: {:?}", - diagnosis.map(|d| d.summary) - ); - } - - #[test] - fn namespace_timeout_with_pressure_logs_matches() { - // When container logs reveal node pressure, the diagnosis engine - // should detect it even though the error message itself is generic. - let diagnosis = diagnose_failure( - "test", - "K8s namespace not ready\n\nCaused by:\n \ - timed out waiting for namespace 'openshell' to exist: ", - Some("HEALTHCHECK_NODE_PRESSURE: DiskPressure"), - ); - assert!(diagnosis.is_some(), "expected node pressure diagnosis"); - let d = diagnosis.unwrap(); - assert!( - d.summary.contains("pressure"), - "expected pressure in summary, got: {}", - d.summary - ); - } - - #[test] - fn namespace_timeout_with_corrupted_state_logs_matches() { - // Container logs revealing RBAC corruption should be caught. 
- let diagnosis = diagnose_failure( - "test", - "K8s namespace not ready\n\nCaused by:\n \ - timed out waiting for namespace 'openshell' to exist: ", - Some( - "configmaps \"extension-apiserver-authentication\" is forbidden: \ - User cannot get resource", - ), - ); - assert!(diagnosis.is_some(), "expected corrupted state diagnosis"); - let d = diagnosis.unwrap(); - assert!( - d.summary.contains("Corrupted"), - "expected Corrupted in summary, got: {}", - d.summary - ); - } - - #[test] - fn namespace_timeout_with_no_route_logs_matches() { - let diagnosis = diagnose_failure( - "test", - "K8s namespace not ready", - Some("Error: no default route present before starting k3s"), - ); - assert!(diagnosis.is_some(), "expected networking diagnosis"); - let d = diagnosis.unwrap(); - assert!( - d.summary.contains("networking"), - "expected networking in summary, got: {}", - d.summary - ); - } - - #[test] - fn diagnose_failure_with_logs_uses_combined_text() { - // Verify that diagnose_failure combines error_message + container_logs - // for pattern matching. The pattern "connection refused" is in logs, - // not in the error message. - let diagnosis = diagnose_failure( - "test", - "K8s namespace not ready", - Some("dial tcp 127.0.0.1:6443: connect: connection refused"), - ); - assert!( - diagnosis.is_some(), - "expected diagnosis from container logs pattern" - ); - let d = diagnosis.unwrap(); - assert!( - d.summary.contains("Network") || d.summary.contains("connectivity"), - "expected network diagnosis, got: {}", - d.summary - ); - } - - // -- end-to-end fallback pattern (mirrors CLI code) -- - - #[test] - fn fallback_to_generic_produces_actionable_diagnosis() { - // This mirrors the actual CLI pattern: - // diagnose_failure(...).unwrap_or_else(|| generic_failure_diagnosis(name)) - // For a plain namespace timeout with no useful container logs, the - // specific matcher returns None and we must fall back to the generic - // diagnosis that suggests doctor commands. 
- let err_str = "K8s namespace not ready\n\nCaused by:\n \ - timed out waiting for namespace 'openshell' to exist: \ - error: the server doesn't have a resource type \"namespace\""; - let container_logs = Some("k3s is starting\nwaiting for kube-apiserver"); - - let diagnosis = diagnose_failure("my-gw", err_str, container_logs) - .unwrap_or_else(|| generic_failure_diagnosis("my-gw")); - - // Should have gotten the generic diagnosis (no specific pattern matched) - assert_eq!(diagnosis.summary, "Gateway failed to start"); - // Must contain actionable recovery steps - assert!( - !diagnosis.recovery_steps.is_empty(), - "generic diagnosis should have recovery steps" - ); - // Must mention doctor commands - let all_commands: String = diagnosis - .recovery_steps - .iter() - .filter_map(|s| s.command.as_ref()) - .cloned() - .collect::>() - .join("\n"); - assert!( - all_commands.contains("doctor logs"), - "should suggest 'doctor logs', got: {all_commands}" - ); - assert!( - all_commands.contains("doctor check"), - "should suggest 'doctor check', got: {all_commands}" - ); - assert!( - all_commands.contains("my-gw"), - "commands should include gateway name, got: {all_commands}" - ); - } - - #[test] - fn test_diagnose_cdi_device_not_found() { - let diagnosis = diagnose_failure( - "test", - "could not run container: CDI device not found: nvidia.com/gpu=all", - None, - ); - assert!(diagnosis.is_some()); - let d = diagnosis.unwrap(); - assert!( - d.summary.contains("CDI"), - "expected CDI diagnosis, got: {}", - d.summary - ); - assert!(!d.retryable); - } - - #[test] - fn test_diagnose_cdi_injection_failed_unresolvable() { - // Exact error observed from Docker 500 response - let diagnosis = diagnose_failure( - "test", - "Docker responded with status code 500: CDI device injection failed: unresolvable CDI devices nvidia.com/gpu=all", - None, - ); - assert!(diagnosis.is_some()); - let d = diagnosis.unwrap(); - assert!( - d.summary.contains("CDI"), - "expected CDI diagnosis, got: {}", 
- d.summary - ); - assert!(!d.retryable); - } - - #[test] - fn test_diagnose_unknown_cdi_device() { - // containerd error path - let diagnosis = diagnose_failure( - "test", - "unknown CDI device requested: nvidia.com/gpu=all", - None, - ); - assert!(diagnosis.is_some()); - let d = diagnosis.unwrap(); - assert!( - d.summary.contains("CDI"), - "expected CDI diagnosis, got: {}", - d.summary - ); - } - - #[test] - fn test_diagnose_cdi_recovery_mentions_nvidia_ctk() { - let d = diagnose_failure("test", "CDI device not found", None) - .expect("should match CDI pattern"); - let all_steps: String = d - .recovery_steps - .iter() - .map(|s| format!("{} {}", s.description, s.command.as_deref().unwrap_or(""))) - .collect::>() - .join("\n"); - assert!( - all_steps.contains("nvidia-ctk cdi generate"), - "recovery steps should mention nvidia-ctk cdi generate, got: {all_steps}" - ); - assert!( - all_steps.contains("/etc/cdi/"), - "recovery steps should mention /etc/cdi/, got: {all_steps}" - ); - } -} diff --git a/crates/openshell-bootstrap/src/image.rs b/crates/openshell-bootstrap/src/image.rs deleted file mode 100644 index bcb13f68f..000000000 --- a/crates/openshell-bootstrap/src/image.rs +++ /dev/null @@ -1,362 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! Image pull helpers for remote deployments. - -use crate::docker::{HostPlatform, get_host_platform}; -use bollard::Docker; -use bollard::auth::DockerCredentials; -use bollard::query_parameters::{CreateImageOptions, TagImageOptionsBuilder}; -use futures::StreamExt; -use miette::{IntoDiagnostic, Result, WrapErr}; -use tracing::{debug, info}; - -/// Default tag to pull from the distribution registry. -const PULL_REGISTRY_DEFAULT_TAG: &str = "latest"; - -/// Image tag baked in at compile time. 
-/// -/// Set via `OPENSHELL_IMAGE_TAG` env var during `cargo build`: -/// - Defaults to `"dev"` when unset (local builds, `mise run docker:build`). -/// - CI sets this explicitly: `"dev"` for main-branch builds, the version -/// string (e.g. `"0.6.0"`) for tagged releases. -pub const DEFAULT_IMAGE_TAG: &str = match option_env!("OPENSHELL_IMAGE_TAG") { - Some(tag) => tag, - None => "dev", -}; - -// --------------------------------------------------------------------------- -// GHCR registry defaults -// --------------------------------------------------------------------------- - -/// Default registry host for pulling images. -pub const DEFAULT_REGISTRY: &str = "ghcr.io"; - -/// Default image repository base on GHCR (without component name or tag). -pub const DEFAULT_IMAGE_REPO_BASE: &str = "ghcr.io/nvidia/openshell"; - -/// Default full gateway image path on GHCR (without tag). -pub const DEFAULT_GATEWAY_IMAGE: &str = "ghcr.io/nvidia/openshell/cluster"; - -/// Default username for token-based GHCR authentication. -/// -/// GHCR accepts any non-empty username when authenticating with a PAT; -/// `__token__` is a common convention for token-based OCI registry auth. -const DEFAULT_REGISTRY_USERNAME: &str = "__token__"; - -/// Parse an image reference into (repository, tag). -/// -/// Examples: -/// - `nginx:latest` -> ("nginx", "latest") -/// - `nginx` -> ("nginx", "latest") -/// - `ghcr.io/org/repo:v1.0` -> ("ghcr.io/org/repo", "v1.0") -pub fn parse_image_ref(image_ref: &str) -> (String, String) { - // Handle digest references (sha256:...) 
- if image_ref.contains('@') { - // For digest references, don't split - return the whole thing - return (image_ref.to_string(), String::new()); - } - - // Find the last colon that's after any registry/path separators - // This handles cases like "registry.io:5000/image:tag" - if let Some(last_colon) = image_ref.rfind(':') { - let before_colon = &image_ref[..last_colon]; - let after_colon = &image_ref[last_colon + 1..]; - - // If there's a slash after this colon, it's a port not a tag - if !after_colon.contains('/') { - return (before_colon.to_string(), after_colon.to_string()); - } - } - - // No tag found, default to "latest" - (image_ref.to_string(), "latest".to_string()) -} - -/// Pull an image from a registry to the local Docker daemon. -/// -/// If `platform` is provided (e.g., `"linux/arm64"`), the pull will request that specific -/// platform variant. This is essential when the local host architecture differs from the -/// target deployment architecture. -pub async fn pull_image( - docker: &Docker, - image_ref: &str, - platform: Option<&HostPlatform>, -) -> Result<()> { - let (repo, tag) = parse_image_ref(image_ref); - let platform_str = platform - .map(HostPlatform::platform_string) - .unwrap_or_default(); - - if platform_str.is_empty() { - info!("Pulling image {}:{}", repo, tag); - } else { - info!( - "Pulling image {}:{} for platform {}", - repo, tag, platform_str - ); - } - - let options = CreateImageOptions { - from_image: Some(repo.clone()), - tag: Some(tag.clone()), - platform: platform_str, - ..Default::default() - }; - - let mut stream = docker.create_image(Some(options), None, None); - while let Some(result) = stream.next().await { - let info = result.into_diagnostic().wrap_err("failed to pull image")?; - if let Some(status) = info.status { - debug!("Pull status: {}", status); - } - } - - Ok(()) -} - -/// Build [`DockerCredentials`] for ghcr.io from explicit credentials. 
-/// -/// Returns `None` when `token` is `None` or empty — the default GHCR repos -/// are public and do not require authentication. When a token is provided, -/// uses the given `username` (falling back to `__token__` if `None`/empty). -pub(crate) fn ghcr_credentials( - username: Option<&str>, - token: Option<&str>, -) -> Option<DockerCredentials> { - let token = token.filter(|t| !t.is_empty())?; - let username = username - .filter(|u| !u.is_empty()) - .unwrap_or(DEFAULT_REGISTRY_USERNAME); - Some(DockerCredentials { - username: Some(username.to_string()), - password: Some(token.to_string()), - serveraddress: Some(DEFAULT_REGISTRY.to_string()), - ..Default::default() - }) -} - -/// Pull the gateway image directly on a remote Docker daemon from ghcr.io, -/// authenticating with the provided registry token. -/// -/// After pulling, the image is tagged to the expected local image ref (e.g., -/// `openshell/cluster:dev`) so that all downstream container creation logic works -/// without changes. -/// -/// The remote host's platform is queried so the correct architecture variant is -/// explicitly requested from the registry (avoids pulling the wrong arch when the -/// registry manifest list defaults differ from the host). -/// -/// Progress is reported via `on_progress` with `[progress]`-prefixed messages. -pub async fn pull_remote_image( - remote: &Docker, - image_ref: &str, - registry_username: Option<&str>, - registry_token: Option<&str>, - mut on_progress: impl FnMut(String) + Send + 'static, -) -> Result<()> { - // Query the remote host's platform so we pull the correct architecture. - let remote_platform = get_host_platform(remote).await?; - let platform_str = remote_platform.platform_string(); - info!( - "Remote host platform: {} — will pull matching image variant", - platform_str - ); - - // Determine the registry tag to pull. If OPENSHELL_CLUSTER_IMAGE is set - // and already points at a registry image, honour its tag.
Otherwise use - // the distribution registry default tag — the local build tag (e.g. "dev") - // is a build-time convention that doesn't exist in the registry. - let registry_image_base = DEFAULT_GATEWAY_IMAGE.to_string(); - - let tag = if is_local_image_ref(image_ref) { - PULL_REGISTRY_DEFAULT_TAG.to_string() - } else { - let (_repo, t) = parse_image_ref(image_ref); - t - }; - let registry_image = format!("{registry_image_base}:{tag}"); - - info!( - "Pulling image {} on remote host from {}", - registry_image, DEFAULT_REGISTRY - ); - on_progress(format!("[progress] Pulling {platform_str} image")); - - let credentials = ghcr_credentials(registry_username, registry_token); - - let options = CreateImageOptions { - from_image: Some(registry_image_base), - tag: Some(tag.clone()), - platform: platform_str, - ..Default::default() - }; - - let mut stream = remote.create_image(Some(options), None, credentials); - while let Some(result) = stream.next().await { - let info = result - .into_diagnostic() - .wrap_err("failed to pull image on remote host")?; - if let Some(ref status) = info.status { - debug!("Remote pull: {}", status); - } - // Report layer progress - if let Some(ref status) = info.status - && let Some(ref detail) = info.progress_detail - && let (Some(current), Some(total)) = (detail.current, detail.total) - { - let current_mb = current / (1024 * 1024); - let total_mb = total / (1024 * 1024); - on_progress(format!("[progress] {status}: {current_mb}/{total_mb} MB")); - } - } - - // Tag the pulled image to the expected local image ref so downstream code - // (container creation, image ID checks) works unchanged. 
- // e.g., tag "ghcr.io/nvidia/openshell/cluster:latest" as "openshell/cluster:dev" - let (target_repo, target_tag) = parse_image_ref(image_ref); - info!( - "Tagging {} as {}:{}", - registry_image, target_repo, target_tag - ); - remote - .tag_image( - &registry_image, - Some( - TagImageOptionsBuilder::default() - .repo(target_repo.as_ref()) - .tag(target_tag.as_ref()) - .build(), - ), - ) - .await - .into_diagnostic() - .wrap_err_with(|| { - format!("failed to tag {registry_image} as {target_repo}:{target_tag} on remote") - })?; - - // Verify that the pulled image matches the expected architecture. - // This catches cases where the registry returned the wrong platform - // variant (e.g., amd64 on an arm64 host) which would cause an - // "exec format error" at container start time. - let inspect = remote - .inspect_image(image_ref) - .await - .into_diagnostic() - .wrap_err_with(|| format!("failed to inspect pulled image {image_ref} on remote"))?; - - let actual_arch = inspect.architecture.as_deref().unwrap_or("unknown"); - if actual_arch != remote_platform.arch { - return Err(miette::miette!( - "architecture mismatch: pulled image {image_ref} is {actual_arch} but remote host is {expected}; \ - try removing stale images on the remote host and re-deploying", - expected = remote_platform.arch, - )); - } - info!( - "Verified image architecture: {} matches remote host", - actual_arch - ); - - on_progress("[progress] Image ready".to_string()); - info!("Remote image pull and tag complete: {}", image_ref); - - Ok(()) -} - -/// Check whether an image reference looks like a locally-built image (no registry prefix). -/// -/// An image reference is considered "local-only" when the repository portion contains no `/`, -/// meaning it has no registry or namespace prefix (e.g., `cluster-local:dev` vs -/// `ghcr.io/org/image:tag` or `docker.io/library/nginx:latest`).
-pub(crate) fn is_local_image_ref(image_ref: &str) -> bool { - let (repo, _tag) = parse_image_ref(image_ref); - !repo.contains('/') -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn parse_simple_image() { - let (repo, tag) = parse_image_ref("nginx:latest"); - assert_eq!(repo, "nginx"); - assert_eq!(tag, "latest"); - } - - #[test] - fn parse_image_no_tag() { - let (repo, tag) = parse_image_ref("nginx"); - assert_eq!(repo, "nginx"); - assert_eq!(tag, "latest"); - } - - #[test] - fn parse_image_with_registry() { - let (repo, tag) = parse_image_ref("ghcr.io/org/repo:v1.0"); - assert_eq!(repo, "ghcr.io/org/repo"); - assert_eq!(tag, "v1.0"); - } - - #[test] - fn parse_image_with_registry_port() { - let (repo, tag) = parse_image_ref("registry.io:5000/image:v1"); - assert_eq!(repo, "registry.io:5000/image"); - assert_eq!(tag, "v1"); - } - - #[test] - fn parse_image_with_registry_port_no_tag() { - let (repo, tag) = parse_image_ref("registry.io:5000/image"); - assert_eq!(repo, "registry.io:5000/image"); - assert_eq!(tag, "latest"); - } - - #[test] - fn parse_image_with_digest() { - let (repo, tag) = parse_image_ref("nginx@sha256:abc123"); - assert_eq!(repo, "nginx@sha256:abc123"); - assert_eq!(tag, ""); - } - - #[test] - fn ghcr_credentials_with_token_default_username() { - let creds = ghcr_credentials(None, Some("ghp_test123")); - assert!(creds.is_some()); - let creds = creds.unwrap(); - assert_eq!(creds.username.as_deref(), Some("__token__")); - assert_eq!(creds.password.as_deref(), Some("ghp_test123")); - assert_eq!(creds.serveraddress.as_deref(), Some("ghcr.io")); - } - - #[test] - fn ghcr_credentials_with_custom_username() { - let creds = ghcr_credentials(Some("myuser"), Some("ghp_test123")); - assert!(creds.is_some()); - let creds = creds.unwrap(); - assert_eq!(creds.username.as_deref(), Some("myuser")); - assert_eq!(creds.password.as_deref(), Some("ghp_test123")); - assert_eq!(creds.serveraddress.as_deref(), Some("ghcr.io")); - } - - #[test] - fn 
ghcr_credentials_without_token_returns_none() { - // No token means unauthenticated (public repos). - assert!(ghcr_credentials(None, None).is_none()); - assert!(ghcr_credentials(None, Some("")).is_none()); - assert!(ghcr_credentials(Some("myuser"), None).is_none()); - } - - #[test] - fn default_constants_are_consistent() { - assert!( - DEFAULT_GATEWAY_IMAGE.starts_with(DEFAULT_IMAGE_REPO_BASE), - "gateway image should be under the default repo base" - ); - assert!( - DEFAULT_IMAGE_REPO_BASE.starts_with(DEFAULT_REGISTRY), - "repo base should start with the registry host" - ); - } -} diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index 9651d367f..0988c4b6b 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -3,18 +3,15 @@ pub mod build; pub mod edge_token; -pub mod errors; -pub mod image; pub mod oidc_token; -pub mod constants; -mod docker; mod metadata; pub mod mtls; pub mod paths; pub mod pki; -pub(crate) mod push; -mod runtime; + +#[cfg(test)] +use std::sync::Mutex; /// Shared lock for tests that mutate the process-global `XDG_CONFIG_HOME` /// env var. 
All such tests in any module must hold this lock to avoid @@ -22,1370 +19,9 @@ mod runtime; #[cfg(test)] pub(crate) static XDG_TEST_LOCK: Mutex<()> = Mutex::new(()); -use bollard::Docker; -use miette::{IntoDiagnostic, Result}; -use std::sync::{Arc, Mutex}; - -use crate::constants::{ - CLIENT_TLS_SECRET_NAME, SERVER_CLIENT_CA_SECRET_NAME, SERVER_TLS_SECRET_NAME, - SSH_HANDSHAKE_SECRET_NAME, network_name, volume_name, -}; -use crate::docker::{ - check_existing_gateway, check_port_conflicts, cleanup_gateway_container, - destroy_gateway_resources, ensure_container, ensure_image, ensure_network, ensure_volume, - resolve_gpu_device_ids, start_container, stop_container, -}; -use crate::metadata::{ - create_gateway_metadata, create_gateway_metadata_with_host, local_gateway_host, -}; -use crate::mtls::store_pki_bundle; -use crate::pki::generate_pki; -use crate::runtime::{ - clean_stale_nodes, exec_capture_with_exit, fetch_recent_logs, openshell_workload_exists, - restart_openshell_deployment, wait_for_gateway_ready, -}; - -pub use crate::constants::container_name; -pub use crate::docker::{ - DockerPreflight, ExistingGatewayInfo, check_docker_available, create_ssh_docker_client, -}; pub use crate::metadata::{ GatewayMetadata, clear_active_gateway, clear_last_sandbox_if_matches, extract_host_from_ssh_destination, get_gateway_metadata, list_gateways, load_active_gateway, load_gateway_metadata, load_last_sandbox, remove_gateway_metadata, resolve_ssh_hostname, save_active_gateway, save_last_sandbox, store_gateway_metadata, }; - -/// Options for remote SSH deployment. -#[derive(Debug, Clone)] -pub struct RemoteOptions { - /// SSH destination in the form `user@hostname` or `ssh://user@hostname`. - pub destination: String, - /// Path to SSH private key. If None, uses SSH agent. - pub ssh_key: Option, -} - -impl RemoteOptions { - /// Create new remote options with the given SSH destination. 
- pub fn new(destination: impl Into) -> Self { - Self { - destination: destination.into(), - ssh_key: None, - } - } - - /// Set the SSH key path. - #[must_use] - pub fn with_ssh_key(mut self, path: impl Into) -> Self { - self.ssh_key = Some(path.into()); - self - } -} - -/// Default host port that maps to the k3s `NodePort` (30051) for the gateway. -pub const DEFAULT_GATEWAY_PORT: u16 = 8080; - -#[derive(Debug, Clone)] -pub struct DeployOptions { - pub name: String, - pub image_ref: Option, - /// Remote deployment options. If None, deploys locally. - pub remote: Option, - /// Host port to map to the gateway `NodePort` (30051). Defaults to 8080. - pub port: u16, - /// Override the gateway host advertised in cluster metadata and passed to - /// the server. When set, the metadata will use this host instead of - /// `127.0.0.1` and the container will receive `SSH_GATEWAY_HOST`. - /// Needed whenever the client cannot reach the Docker host at 127.0.0.1 - /// — CI containers, WSL, remote Docker hosts, etc. - pub gateway_host: Option, - /// Disable TLS entirely — the server listens on plaintext HTTP. - pub disable_tls: bool, - /// Disable gateway authentication (mTLS client certificate requirement). - /// Ignored when `disable_tls` is true. - pub disable_gateway_auth: bool, - /// Registry authentication username. Defaults to `__token__` when a - /// `registry_token` is provided but no username is set. Only needed - /// for private registries — public GHCR repos pull without auth. - pub registry_username: Option, - /// Registry authentication token (e.g. a GitHub PAT with `read:packages` - /// scope) used to pull images from the registry both during the initial - /// bootstrap pull and inside the k3s cluster at runtime. Only needed - /// for private registries. - pub registry_token: Option, - /// GPU device IDs to inject into the gateway container. 
- /// - /// - `[]` — no GPU passthrough (default) - /// - `["legacy"]` — internal non-CDI fallback path (`driver="nvidia"`, `count=-1`) - /// - `["auto"]` — resolved at deploy time: CDI if enabled on the daemon, else the non-CDI fallback - /// - `[cdi-ids…]` — CDI `DeviceRequest` with the given device IDs - pub gpu: Vec, - /// When true, destroy any existing gateway resources before deploying. - /// When false, an existing gateway is left as-is and deployment is - /// skipped (the caller is responsible for prompting the user first). - pub recreate: bool, - /// OIDC issuer URL. When set, the server validates Bearer JWTs. - pub oidc_issuer: Option, - /// OIDC audience for the API resource server. Defaults to "openshell-cli". - pub oidc_audience: String, - /// OIDC client ID for CLI login. Defaults to "openshell-cli". - pub oidc_client_id: String, - /// OIDC roles claim path (e.g. `realm_access.roles`). - pub oidc_roles_claim: Option, - /// OIDC admin role name. - pub oidc_admin_role: Option, - /// OIDC user role name. - pub oidc_user_role: Option, - /// OIDC scopes claim path. When set, the server enforces scope-based permissions. - pub oidc_scopes_claim: Option, -} - -impl DeployOptions { - pub fn new(name: impl Into) -> Self { - Self { - name: name.into(), - image_ref: None, - remote: None, - port: DEFAULT_GATEWAY_PORT, - gateway_host: None, - disable_tls: false, - disable_gateway_auth: false, - registry_username: None, - registry_token: None, - gpu: vec![], - recreate: false, - oidc_issuer: None, - oidc_audience: "openshell-cli".to_string(), - oidc_client_id: "openshell-cli".to_string(), - oidc_roles_claim: None, - oidc_admin_role: None, - oidc_user_role: None, - oidc_scopes_claim: None, - } - } - - /// Set remote deployment options. - #[must_use] - pub fn with_remote(mut self, remote: RemoteOptions) -> Self { - self.remote = Some(remote); - self - } - - /// Set the host port for the gateway. 
- #[must_use] - pub fn with_port(mut self, port: u16) -> Self { - self.port = port; - self - } - - /// Override the gateway host advertised in cluster metadata. - #[must_use] - pub fn with_gateway_host(mut self, host: impl Into) -> Self { - self.gateway_host = Some(host.into()); - self - } - - /// Disable TLS entirely — the server listens on plaintext HTTP. - #[must_use] - pub fn with_disable_tls(mut self, disable: bool) -> Self { - self.disable_tls = disable; - self - } - - /// Disable gateway authentication (mTLS client certificate requirement). - #[must_use] - pub fn with_disable_gateway_auth(mut self, disable: bool) -> Self { - self.disable_gateway_auth = disable; - self - } - - /// Set the registry authentication username. - #[must_use] - pub fn with_registry_username(mut self, username: impl Into) -> Self { - self.registry_username = Some(username.into()); - self - } - - /// Set the registry authentication token for pulling images. - #[must_use] - pub fn with_registry_token(mut self, token: impl Into) -> Self { - self.registry_token = Some(token.into()); - self - } - - /// Set GPU device IDs for the cluster container. - /// - /// Pass `vec!["auto"]` to auto-select between CDI and the non-CDI fallback - /// based on daemon capabilities at deploy time. The `legacy` sentinel is an - /// internal implementation detail for the fallback path. - #[must_use] - pub fn with_gpu(mut self, gpu: Vec) -> Self { - self.gpu = gpu; - self - } - - /// Set whether to destroy and recreate existing gateway resources. - #[must_use] - pub fn with_recreate(mut self, recreate: bool) -> Self { - self.recreate = recreate; - self - } - - /// Set the OIDC issuer URL for JWT-based authentication. - #[must_use] - pub fn with_oidc_issuer(mut self, issuer: impl Into) -> Self { - self.oidc_issuer = Some(issuer.into()); - self - } - - /// Set the OIDC audience (client ID). 
- #[must_use] - pub fn with_oidc_audience(mut self, audience: impl Into) -> Self { - self.oidc_audience = audience.into(); - self - } -} - -fn apply_oidc_gateway_metadata( - metadata: &mut GatewayMetadata, - resume: bool, - existing: Option<&GatewayMetadata>, - oidc_issuer: Option<&str>, - oidc_client_id: &str, - oidc_audience: &str, -) { - if let Some(issuer) = oidc_issuer { - metadata.auth_mode = Some("oidc".to_string()); - metadata.oidc_issuer = Some(issuer.to_string()); - metadata.oidc_client_id = Some(oidc_client_id.to_string()); - metadata.oidc_audience = Some(oidc_audience.to_string()); - return; - } - - if resume - && let Some(existing) = existing - && existing.auth_mode.as_deref() == Some("oidc") - { - metadata.auth_mode.clone_from(&existing.auth_mode); - metadata.oidc_issuer.clone_from(&existing.oidc_issuer); - metadata.oidc_client_id.clone_from(&existing.oidc_client_id); - metadata.oidc_audience.clone_from(&existing.oidc_audience); - metadata.oidc_scopes.clone_from(&existing.oidc_scopes); - } -} - -#[derive(Debug, Clone)] -pub struct GatewayHandle { - name: String, - metadata: GatewayMetadata, - docker: Docker, -} - -impl GatewayHandle { - /// Get the gateway metadata. - pub fn metadata(&self) -> &GatewayMetadata { - &self.metadata - } - - /// Get the gateway endpoint URL. - pub fn gateway_endpoint(&self) -> &str { - &self.metadata.gateway_endpoint - } - - pub async fn stop(&self) -> Result<()> { - stop_container(&self.docker, &container_name(&self.name)).await - } - - pub async fn destroy(&self) -> Result<()> { - destroy_gateway_resources(&self.docker, &self.name).await - } -} - -/// Check whether a gateway with the given name already has resources deployed. -/// -/// Returns `None` if no existing gateway resources are found, or -/// `Some(ExistingGatewayInfo)` with details about what exists. 
-pub async fn check_existing_deployment( - name: &str, - remote: Option<&RemoteOptions>, -) -> Result<Option<ExistingGatewayInfo>> { - let docker = if let Some(remote_opts) = remote { - create_ssh_docker_client(remote_opts).await? - } else { - let preflight = check_docker_available().await?; - preflight.docker - }; - check_existing_gateway(&docker, name).await -} - -pub async fn deploy_gateway(options: DeployOptions) -> Result<GatewayHandle> { - deploy_gateway_with_logs(options, |_| {}).await -} - -pub async fn deploy_gateway_with_logs<F>(options: DeployOptions, on_log: F) -> Result<GatewayHandle> -where - F: FnMut(String) + Send + 'static, -{ - let name = options.name; - let image_ref = options.image_ref.unwrap_or_else(default_gateway_image_ref); - let port = options.port; - let gateway_host = options.gateway_host; - let disable_tls = options.disable_tls; - let disable_gateway_auth = options.disable_gateway_auth; - let registry_username = options.registry_username; - let registry_token = options.registry_token; - let gpu = options.gpu; - let recreate = options.recreate; - let oidc_issuer = options.oidc_issuer; - let oidc_audience = options.oidc_audience; - let oidc_client_id = options.oidc_client_id; - let oidc_roles_claim = options.oidc_roles_claim; - let oidc_admin_role = options.oidc_admin_role; - let oidc_user_role = options.oidc_user_role; - let oidc_scopes_claim = options.oidc_scopes_claim; - - // Wrap on_log in Arc<Mutex<F>> so we can share it with pull_remote_image - // which needs a 'static callback for the bollard streaming pull. - let on_log = Arc::new(Mutex::new(on_log)); - - // Helper to call on_log from the shared reference - let log = |msg: String| { - if let Ok(mut f) = on_log.lock() { - f(msg); - } - }; - - // Create Docker client based on deployment mode. - // For local deploys, run a preflight check to fail fast with actionable - // guidance when Docker is not installed, not running, or unreachable.
- let (target_docker, remote_opts) = if let Some(remote_opts) = &options.remote { - let remote = create_ssh_docker_client(remote_opts).await?; - (remote, Some(remote_opts.clone())) - } else { - log("[status] Checking Docker".to_string()); - let preflight = check_docker_available().await?; - (preflight.docker, None) - }; - - // CDI is considered enabled when the daemon reports at least one CDI spec - // directory via `GET /info` (`SystemInfo.CDISpecDirs`). An empty list or - // missing field means CDI is not configured and we fall back to the legacy - // NVIDIA `DeviceRequest` (driver="nvidia"). Detection is best-effort — - // failure to query daemon info is non-fatal. - let cdi_supported = target_docker - .info() - .await - .ok() - .and_then(|info| info.cdi_spec_dirs) - .is_some_and(|dirs| !dirs.is_empty()); - - // If an existing gateway is found, decide how to proceed: - // - recreate: destroy everything and start fresh - // - otherwise: auto-resume from existing state (the ensure_* calls are - // idempotent and will reuse the volume, create a container if needed, - // and start it) - let mut resume = false; - let mut resume_container_exists = false; - if let Some(existing) = check_existing_gateway(&target_docker, &name).await? { - if recreate { - log("[status] Removing existing gateway".to_string()); - destroy_gateway_resources(&target_docker, &name).await?; - } else if existing.container_running { - log("[status] Gateway is already running".to_string()); - resume = true; - resume_container_exists = true; - } else { - log("[status] Resuming gateway from existing state".to_string()); - resume = true; - resume_container_exists = existing.container_exists; - } - } - - // Ensure the image is available on the target Docker daemon. - // When both the container and volume exist we can skip the pull entirely - // — the container already references a valid local image. This avoids - // failures when the original image tag (e.g. 
a local-only - // `openshell/cluster:dev`) is not available from the default registry. - // - // When only the volume survives (container was removed), we still need - // the image to recreate the container, so the pull must happen. - let need_image = !resume || !resume_container_exists; - if need_image { - log("[status] Downloading gateway".to_string()); - if remote_opts.is_some() { - let on_log_clone = Arc::clone(&on_log); - let progress_cb = move |msg: String| { - if let Ok(mut f) = on_log_clone.lock() { - f(msg); - } - }; - image::pull_remote_image( - &target_docker, - &image_ref, - registry_username.as_deref(), - registry_token.as_deref(), - progress_cb, - ) - .await?; - } else { - // Local deployment: ensure image exists (pull if needed) - ensure_image( - &target_docker, - &image_ref, - registry_username.as_deref(), - registry_token.as_deref(), - ) - .await?; - } - } - - // All subsequent operations use the target Docker (remote or local) - log("[status] Initializing environment".to_string()); - ensure_network(&target_docker, &network_name(&name)).await?; - ensure_volume(&target_docker, &volume_name(&name)).await?; - - // Compute extra TLS SANs for remote deployments so the gateway and k3s - // API server certificates include the remote host's IP/hostname. - // Also determine the SSH gateway host so the server returns the correct - // address to CLI clients for SSH proxy CONNECT requests. - // - // When `gateway_host` is provided (e.g., `host.docker.internal` in CI), - // it is added to the SAN list and used as `ssh_gateway_host` so the - // server advertises the correct address even for local clusters. - let (extra_sans, ssh_gateway_host): (Vec, Option) = - if let Some(opts) = remote_opts.as_ref() { - let ssh_host = extract_host_from_ssh_destination(&opts.destination); - let resolved = resolve_ssh_hostname(&ssh_host); - // Include both the SSH alias and resolved IP if they differ, so the - // certificate covers both names. 
- let mut sans = vec![resolved.clone()]; - if ssh_host != resolved { - sans.push(ssh_host); - } - if let Some(ref host) = gateway_host - && !sans.contains(host) - { - sans.push(host.clone()); - } - (sans, gateway_host.or(Some(resolved))) - } else { - let mut sans: Vec = local_gateway_host().into_iter().collect(); - if let Some(ref host) = gateway_host - && !sans.contains(host) - { - sans.push(host.clone()); - } - (sans, gateway_host) - }; - - // Check for port conflicts before creating/starting the container. - // Docker silently fails to attach networking when a host port is already - // bound by another container, leaving the new container with only loopback - // and no default route. Detecting this up-front avoids a confusing 30s - // timeout followed by a misleading "Docker networking issue" diagnostic. - let conflicts = check_port_conflicts(&target_docker, &name, port).await?; - if !conflicts.is_empty() { - let details: Vec = conflicts - .iter() - .map(|c| { - format!( - "port {} is held by container \"{}\"", - c.host_port, c.container_name - ) - }) - .collect(); - return Err(miette::miette!( - "cannot start gateway: {}\n\nStop or remove the conflicting container(s) first, \ - then retry:\n{}", - details.join(", "), - conflicts - .iter() - .map(|c| format!(" docker stop {}", c.container_name)) - .collect::>() - .join("\n"), - )); - } - - // From this point on, Docker resources (container, volume, network) are - // being created. If any subsequent step fails, we must clean up to avoid - // leaving an orphaned volume in a corrupted state that blocks retries. - // See: https://github.com/NVIDIA/OpenShell/issues/463 - let deploy_result: Result = async { - let device_ids = resolve_gpu_device_ids(&gpu, cdi_supported); - // ensure_container returns the actual host port — which may differ from - // the requested `port` when reusing an existing container that was - // originally created with a different port. 
- let actual_port = ensure_container( - &target_docker, - &name, - &image_ref, - &extra_sans, - ssh_gateway_host.as_deref(), - port, - disable_tls, - disable_gateway_auth, - registry_username.as_deref(), - registry_token.as_deref(), - &device_ids, - resume, - oidc_issuer.as_deref(), - &oidc_audience, - oidc_roles_claim.as_deref(), - oidc_admin_role.as_deref(), - oidc_user_role.as_deref(), - oidc_scopes_claim.as_deref(), - ) - .await?; - let port = actual_port; - start_container(&target_docker, &name).await?; - - // Clean up stale k3s nodes left over from previous container instances that - // used the same persistent volume. Without this, pods remain scheduled on - // NotReady ghost nodes and the health check will time out. - // - // The function retries internally until kubectl becomes available (k3s may - // still be initialising after the container start). It also force-deletes - // pods stuck in Terminating on the removed nodes so that StatefulSets can - // reschedule replacements immediately. - match clean_stale_nodes(&target_docker, &name).await { - Ok(0) => {} - Ok(n) => tracing::info!("removed {n} stale node(s) and their orphaned pods"), - Err(err) => { - tracing::warn!("stale node cleanup failed (non-fatal): {err}"); - } - } - - // Reconcile PKI: reuse existing cluster TLS secrets if they are complete and - // valid; only generate fresh PKI when secrets are missing, incomplete, - // malformed, or expiring within MIN_REMAINING_VALIDITY_DAYS. - // - // Ordering is: reconcile secrets -> (if rotated and workload exists: - // rollout restart and wait) -> persist CLI-side bundle. - // - // We check workload presence before reconciliation. On a fresh/recreated - // cluster, secrets are always newly generated and a restart is unnecessary. - // Restarting only when workload pre-existed avoids extra rollout latency. 
- let workload_existed_before_pki = openshell_workload_exists(&target_docker, &name).await?; - let (pki_bundle, rotated) = reconcile_pki(&target_docker, &name, &extra_sans, &log).await?; - - if rotated && workload_existed_before_pki { - // If an openshell workload is already running, it must be restarted so - // it picks up the new TLS secrets before we write CLI-side certs. - // A failed rollout is a hard error — CLI certs must not be persisted - // if the server cannot come up with the new PKI. - restart_openshell_deployment(&target_docker, &name).await?; - } - - store_pki_bundle(&name, &pki_bundle)?; - - // Reconcile SSH handshake secret: reuse existing K8s secret if present, - // generate and persist a new one otherwise. This secret is stored in etcd - // (on the persistent volume) so it survives container restarts. - reconcile_ssh_handshake_secret(&target_docker, &name, &log).await?; - - // Push locally-built component images into the k3s containerd runtime. - // This is the "push" path for local development — images are exported from - // the local Docker daemon and streamed into the cluster's containerd so - // k3s can resolve them without pulling from the remote registry. - if remote_opts.is_none() - && let Ok(push_images_str) = std::env::var("OPENSHELL_PUSH_IMAGES") - { - let images: Vec<&str> = push_images_str - .split(',') - .map(str::trim) - .filter(|s| !s.is_empty()) - .collect(); - if !images.is_empty() { - log("[status] Deploying components".to_string()); - // Long-timeout client: `docker save` of multi-GB component - // images streams past bollard's 120s default. See - // docker::connect_local_for_large_transfers(). 
- let local_docker = docker::connect_local_for_large_transfers().into_diagnostic()?; - let container = container_name(&name); - let on_log_ref = Arc::clone(&on_log); - let mut push_log = move |msg: String| { - if let Ok(mut f) = on_log_ref.lock() { - f(msg); - } - }; - push::push_local_images( - &local_docker, - &target_docker, - &container, - &images, - &mut push_log, - ) - .await?; - - restart_openshell_deployment(&target_docker, &name).await?; - } - } - - log("[status] Starting gateway".to_string()); - { - // Create a short-lived closure that locks on each call rather than holding - // the MutexGuard across await points. - let on_log_ref = Arc::clone(&on_log); - let mut gateway_log = move |msg: String| { - if let Ok(mut f) = on_log_ref.lock() { - f(msg); - } - }; - wait_for_gateway_ready(&target_docker, &name, &mut gateway_log).await?; - } - - // Create and store gateway metadata. On resume, preserve existing - // OIDC fields so a bare `gateway start` without `--oidc-*` flags - // doesn't erase a previously configured OIDC registration. - let mut metadata = create_gateway_metadata_with_host( - &name, - remote_opts.as_ref(), - port, - ssh_gateway_host.as_deref(), - disable_tls, - ); - let existing_metadata = if resume { - load_gateway_metadata(&name).ok() - } else { - None - }; - apply_oidc_gateway_metadata( - &mut metadata, - resume, - existing_metadata.as_ref(), - oidc_issuer.as_deref(), - &oidc_client_id, - &oidc_audience, - ); - store_gateway_metadata(&name, &metadata)?; - - Ok(metadata) - } - .await; - - match deploy_result { - Ok(metadata) => Ok(GatewayHandle { - name, - metadata, - docker: target_docker, - }), - Err(deploy_err) => { - if resume { - // When resuming, preserve the volume so the user can retry. - // Only clean up the container and network that we may have created. 
- tracing::info!( - "resume failed, cleaning up container for '{name}' (preserving volume)" - ); - if let Err(cleanup_err) = cleanup_gateway_container(&target_docker, &name).await { - tracing::warn!( - "automatic cleanup after failed resume also failed: {cleanup_err}. \ - Manual cleanup may be required: \ - openshell gateway destroy --name {name}" - ); - } - } else { - // Automatically clean up Docker resources (volume, container, network, - // image) so the environment is left in a retryable state. - tracing::info!("deploy failed, cleaning up gateway resources for '{name}'"); - if let Err(cleanup_err) = destroy_gateway_resources(&target_docker, &name).await { - tracing::warn!( - "automatic cleanup after failed deploy also failed: {cleanup_err}. \ - Manual cleanup may be required: \ - openshell gateway destroy --name {name}" - ); - } - } - Err(deploy_err) - } - } -} - -/// Get a handle to an existing gateway. -/// -/// For local gateways, pass `None` for remote options. -/// For remote gateways, pass the same `RemoteOptions` used during deployment. -pub async fn gateway_handle(name: &str, remote: Option<&RemoteOptions>) -> Result { - let docker = match remote { - Some(remote_opts) => create_ssh_docker_client(remote_opts).await?, - None => Docker::connect_with_local_defaults().into_diagnostic()?, - }; - // Try to load existing metadata, fall back to creating new metadata - // with the default ports (the actual ports are only known at deploy time). - let metadata = load_gateway_metadata(name) - .unwrap_or_else(|_| create_gateway_metadata(name, remote, DEFAULT_GATEWAY_PORT)); - Ok(GatewayHandle { - name: name.to_string(), - metadata, - docker, - }) -} - -/// Extract mTLS certificates from an existing gateway container and store -/// them locally so the CLI can connect. 
-/// -/// Connects to Docker (local or remote via SSH), auto-discovers the running -/// gateway container by image name (narrowed by `port` when provided), reads -/// the PKI bundle from Kubernetes secrets inside it, and writes the client -/// materials (ca.crt, tls.crt, tls.key) to the gateway config directory. -pub async fn extract_and_store_pki( - name: &str, - remote: Option<&RemoteOptions>, - port: Option, -) -> Result<()> { - let docker = match remote { - Some(r) => create_ssh_docker_client(r).await?, - None => Docker::connect_with_local_defaults().into_diagnostic()?, - }; - let cname = docker::find_gateway_container(&docker, port).await?; - let bundle = load_existing_pki_bundle(&docker, &cname, constants::KUBECONFIG_PATH) - .await - .map_err(|e| miette::miette!("Failed to extract TLS certificates: {e}"))?; - store_pki_bundle(name, &bundle)?; - Ok(()) -} - -pub async fn ensure_gateway_image( - version: &str, - registry_username: Option<&str>, - registry_token: Option<&str>, -) -> Result { - let docker = Docker::connect_with_local_defaults().into_diagnostic()?; - let image_ref = format!("{}:{version}", image::DEFAULT_GATEWAY_IMAGE); - ensure_image(&docker, &image_ref, registry_username, registry_token).await?; - Ok(image_ref) -} - -/// Fetch logs from the gateway Docker container. -/// -/// Connects to Docker (local or remote), retrieves logs from -/// `openshell-cluster-{name}`, and writes them to the provided writer. -/// -/// When `follow` is true, streams logs in real-time (blocks until cancelled). -/// When `lines` is `Some(n)`, returns the last `n` lines; when `None`, -/// returns all available logs. 
-pub async fn gateway_container_logs( - remote: Option<&RemoteOptions>, - name: &str, - lines: Option, - follow: bool, - mut writer: W, -) -> Result<()> { - use bollard::container::LogOutput; - use bollard::query_parameters::LogsOptionsBuilder; - use futures::StreamExt; - use miette::WrapErr; - - let docker = match remote { - Some(remote_opts) => create_ssh_docker_client(remote_opts).await?, - None => Docker::connect_with_local_defaults().into_diagnostic()?, - }; - - let container = container_name(name); - - let tail_value = match (follow, lines) { - (true, _) => "0".to_string(), - (false, Some(n)) => n.to_string(), - (false, None) => "all".to_string(), - }; - - let options = LogsOptionsBuilder::new() - .follow(follow) - .stdout(true) - .stderr(true) - .tail(&tail_value) - .timestamps(true) - .build(); - - let mut stream = docker.logs(&container, Some(options)); - - while let Some(item) = stream.next().await { - match item { - Ok(log) => { - let text = match log { - LogOutput::StdOut { message } - | LogOutput::StdErr { message } - | LogOutput::Console { message } => { - String::from_utf8_lossy(&message).to_string() - } - LogOutput::StdIn { .. } => continue, - }; - writer - .write_all(text.as_bytes()) - .into_diagnostic() - .wrap_err("failed to write log output")?; - } - Err(err) => { - return Err(miette::miette!("error reading container logs: {err}")); - } - } - } - - Ok(()) -} - -/// Fetch the last `n` lines of container logs for a local gateway as a `String`. -/// -/// This is a convenience wrapper for diagnostic call sites (e.g. failure -/// diagnosis in the CLI) that do not hold a Docker client handle. -/// -/// Returns an empty string on any Docker/connection error so callers don't -/// need to worry about error handling. 
-pub async fn fetch_gateway_logs(name: &str, n: usize) -> String { - let Ok(docker) = Docker::connect_with_local_defaults() else { - return String::new(); - }; - let container = container_name(name); - fetch_recent_logs(&docker, &container, n).await -} - -fn default_gateway_image_ref() -> String { - if let Ok(image) = std::env::var("OPENSHELL_CLUSTER_IMAGE") - && !image.trim().is_empty() - { - return image; - } - format!( - "{}:{}", - image::DEFAULT_GATEWAY_IMAGE, - image::DEFAULT_IMAGE_TAG - ) -} - -/// Create the three TLS K8s secrets required by the `OpenShell` server and sandbox pods. -/// -/// Secrets are created via `kubectl` exec'd inside the cluster container: -/// - `openshell-server-tls` (kubernetes.io/tls): server cert + key -/// - `openshell-server-client-ca` (Opaque): CA cert for verifying client certs -/// - `openshell-client-tls` (Opaque): client cert + key + CA cert (shared by CLI & sandboxes) -async fn create_k8s_tls_secrets( - docker: &Docker, - name: &str, - bundle: &pki::PkiBundle, -) -> Result<()> { - use base64::Engine; - use base64::engine::general_purpose::STANDARD; - use miette::WrapErr; - - let cname = container_name(name); - let kubeconfig = constants::KUBECONFIG_PATH; - - // Helper: run kubectl apply -f - with a JSON secret manifest. - let apply_secret = |manifest: String| { - let docker = docker.clone(); - let cname = cname.clone(); - async move { - let (output, exit_code) = exec_capture_with_exit( - &docker, - &cname, - vec![ - "sh".to_string(), - "-c".to_string(), - format!( - "KUBECONFIG={kubeconfig} kubectl apply -f - <<'ENDOFMANIFEST'\n{manifest}\nENDOFMANIFEST" - ), - ], - ) - .await?; - if exit_code != 0 { - return Err(miette::miette!( - "kubectl apply failed (exit {exit_code}): {output}" - )); - } - Ok(()) - } - }; - - // 1. 
openshell-server-tls (kubernetes.io/tls) - let server_tls_manifest = serde_json::json!({ - "apiVersion": "v1", - "kind": "Secret", - "metadata": { - "name": SERVER_TLS_SECRET_NAME, - "namespace": "openshell" - }, - "type": "kubernetes.io/tls", - "data": { - "tls.crt": STANDARD.encode(&bundle.server_cert_pem), - "tls.key": STANDARD.encode(&bundle.server_key_pem) - } - }); - apply_secret(server_tls_manifest.to_string()) - .await - .wrap_err("failed to create openshell-server-tls secret")?; - - // 2. openshell-server-client-ca (Opaque) - let client_ca_manifest = serde_json::json!({ - "apiVersion": "v1", - "kind": "Secret", - "metadata": { - "name": SERVER_CLIENT_CA_SECRET_NAME, - "namespace": "openshell" - }, - "type": "Opaque", - "data": { - "ca.crt": STANDARD.encode(&bundle.ca_cert_pem) - } - }); - apply_secret(client_ca_manifest.to_string()) - .await - .wrap_err("failed to create openshell-server-client-ca secret")?; - - // 3. openshell-client-tls (Opaque) — shared by CLI and sandbox pods - let client_tls_manifest = serde_json::json!({ - "apiVersion": "v1", - "kind": "Secret", - "metadata": { - "name": CLIENT_TLS_SECRET_NAME, - "namespace": "openshell" - }, - "type": "Opaque", - "data": { - "tls.crt": STANDARD.encode(&bundle.client_cert_pem), - "tls.key": STANDARD.encode(&bundle.client_key_pem), - "ca.crt": STANDARD.encode(&bundle.ca_cert_pem) - } - }); - apply_secret(client_tls_manifest.to_string()) - .await - .wrap_err("failed to create openshell-client-tls secret")?; - - Ok(()) -} - -/// Reconcile gateway TLS secrets: reuse existing PKI if valid, generate new if needed. -/// -/// Returns `(bundle, rotated)` where `rotated` is true if new PKI was generated -/// and applied to the gateway (meaning the server needs a restart to pick it up). 
-async fn reconcile_pki( - docker: &Docker, - name: &str, - extra_sans: &[String], - log: &F, -) -> Result<(pki::PkiBundle, bool)> -where - F: Fn(String) + Sync, -{ - use miette::WrapErr; - - let cname = container_name(name); - let kubeconfig = constants::KUBECONFIG_PATH; - - // Wait for the k3s API server and openshell namespace before attempting - // to read secrets. Without this, kubectl fails transiently on resume - // (k3s hasn't booted yet), the code assumes secrets are gone, and - // regenerates PKI unnecessarily — triggering a server rollout restart - // and TLS errors for in-flight connections. - log("[progress] Waiting for openshell namespace".to_string()); - wait_for_namespace(docker, &cname, kubeconfig, "openshell").await?; - - // Try to load existing secrets. - match load_existing_pki_bundle(docker, &cname, kubeconfig).await { - Ok(bundle) => { - log("[progress] Reusing existing TLS certificates".to_string()); - return Ok((bundle, false)); - } - Err(reason) => { - log(format!( - "[progress] Cannot reuse existing TLS secrets ({reason}) — generating new PKI" - )); - } - } - - // Generate fresh PKI and apply to cluster. - log("[progress] Generating TLS certificates".to_string()); - let bundle = generate_pki(extra_sans)?; - log("[progress] Applying TLS secrets to gateway".to_string()); - create_k8s_tls_secrets(docker, name, &bundle) - .await - .wrap_err("failed to apply new TLS secrets")?; - - Ok((bundle, true)) -} - -/// Reconcile the SSH handshake HMAC secret as a Kubernetes Secret. -/// -/// If the secret already exists in the cluster, this is a no-op. Otherwise a -/// fresh 32-byte hex secret is generated and applied. Because the secret lives -/// in etcd (backed by the persistent Docker volume), it survives container -/// restarts without regeneration — existing sandbox SSH sessions remain valid. 
-async fn reconcile_ssh_handshake_secret(docker: &Docker, name: &str, log: &F) -> Result<()> -where - F: Fn(String) + Sync, -{ - use miette::WrapErr; - - let cname = container_name(name); - let kubeconfig = constants::KUBECONFIG_PATH; - - // Check if the secret already exists. - let (output, exit_code) = exec_capture_with_exit( - docker, - &cname, - vec![ - "sh".to_string(), - "-c".to_string(), - format!( - "KUBECONFIG={kubeconfig} kubectl -n openshell get secret {SSH_HANDSHAKE_SECRET_NAME} -o jsonpath='{{.data.secret}}' 2>/dev/null" - ), - ], - ) - .await?; - - if exit_code == 0 && !output.trim().is_empty() { - tracing::debug!( - "existing SSH handshake secret found ({} bytes encoded)", - output.trim().len() - ); - log("[progress] Reusing existing SSH handshake secret".to_string()); - return Ok(()); - } - - // Generate a new 32-byte hex secret and create the K8s secret. - log("[progress] Generating SSH handshake secret".to_string()); - let (output, exit_code) = exec_capture_with_exit( - docker, - &cname, - vec![ - "sh".to_string(), - "-c".to_string(), - format!( - "SECRET=$(head -c 32 /dev/urandom | od -A n -t x1 | tr -d ' \\n') && \ - KUBECONFIG={kubeconfig} kubectl -n openshell create secret generic {SSH_HANDSHAKE_SECRET_NAME} \ - --from-literal=secret=$SECRET --dry-run=client -o yaml | \ - KUBECONFIG={kubeconfig} kubectl apply -f -" - ), - ], - ) - .await?; - - if exit_code != 0 { - return Err(miette::miette!( - "failed to create SSH handshake secret (exit {exit_code}): {output}" - )) - .wrap_err("failed to apply SSH handshake secret"); - } - - Ok(()) -} - -/// Load existing TLS secrets from the cluster and reconstruct a [`PkiBundle`]. -/// -/// Returns an error string describing why secrets couldn't be loaded (for logging). 
-async fn load_existing_pki_bundle( - docker: &Docker, - container_name: &str, - kubeconfig: &str, -) -> std::result::Result { - use base64::Engine; - use base64::engine::general_purpose::STANDARD; - - // Helper to read a specific key from a K8s secret. - let read_secret_key = |secret: &str, key: &str| { - let docker = docker.clone(); - let container_name = container_name.to_string(); - let secret = secret.to_string(); - let key = key.to_string(); - async move { - let jsonpath = format!("{{.data.{}}}", key.replace('.', "\\.")); - let cmd = format!( - "KUBECONFIG={kubeconfig} kubectl get secret {secret} -n openshell -o jsonpath='{jsonpath}' 2>/dev/null" - ); - let (output, exit_code) = exec_capture_with_exit( - &docker, - &container_name, - vec!["sh".to_string(), "-c".to_string(), cmd], - ) - .await - .map_err(|e| format!("exec failed: {e}"))?; - - if exit_code != 0 || output.trim().is_empty() { - return Err(format!("secret {secret} key {key} not found or empty")); - } - - let decoded = STANDARD - .decode(output.trim()) - .map_err(|e| format!("base64 decode failed for {secret}/{key}: {e}"))?; - String::from_utf8(decoded).map_err(|e| format!("non-UTF8 data in {secret}/{key}: {e}")) - } - }; - - // Read required fields concurrently to reduce bootstrap latency. - let (server_cert, server_key, ca_cert, client_cert, client_key, client_ca) = tokio::try_join!( - read_secret_key(SERVER_TLS_SECRET_NAME, "tls.crt"), - read_secret_key(SERVER_TLS_SECRET_NAME, "tls.key"), - read_secret_key(SERVER_CLIENT_CA_SECRET_NAME, "ca.crt"), - read_secret_key(CLIENT_TLS_SECRET_NAME, "tls.crt"), - read_secret_key(CLIENT_TLS_SECRET_NAME, "tls.key"), - // Also read ca.crt from client-tls for completeness check. - read_secret_key(CLIENT_TLS_SECRET_NAME, "ca.crt"), - )?; - - // Validate that all PEM data contains expected markers. 
- for (label, data) in [ - ("server cert", &server_cert), - ("server key", &server_key), - ("CA cert", &ca_cert), - ("client cert", &client_cert), - ("client key", &client_key), - ("client CA", &client_ca), - ] { - if !data.contains("-----BEGIN ") { - return Err(format!("{label} does not contain valid PEM data")); - } - } - - Ok(pki::PkiBundle { - ca_cert_pem: ca_cert, - ca_key_pem: String::new(), // CA key is not stored in cluster secrets - server_cert_pem: server_cert, - server_key_pem: server_key, - client_cert_pem: client_cert, - client_key_pem: client_key, - }) -} - -/// Wait for a K8s namespace to exist inside the cluster container. -/// -/// The Helm controller creates the `openshell` namespace when it processes -/// the `HelmChart` manifest, but there's a race between kubeconfig being ready -/// and the namespace being created. We poll briefly. -/// Check whether DNS resolution is working inside the container. -/// -/// Probes the configured `REGISTRY_HOST` (falling back to `ghcr.io`) since -/// that is the primary registry the cluster needs to reach for image pulls. -/// -/// Returns `Ok(true)` if DNS is functional, `Ok(false)` if the probe ran but -/// resolution failed, and `Err` if the exec itself failed. -async fn probe_container_dns(docker: &Docker, container_name: &str) -> Result { - // The probe must handle IP-literal registry hosts (e.g. 127.0.0.1:5000) - // which don't need DNS resolution. Strip the port suffix since nslookup - // doesn't understand host:port, and skip the probe entirely for IP - // literals. 
- let (output, exit_code) = exec_capture_with_exit( - docker, - container_name, - vec![ - "sh".to_string(), - "-c".to_string(), - concat!( - "host=\"${REGISTRY_HOST:-ghcr.io}\"; ", - "host=\"${host%%:*}\"; ", - "echo \"$host\" | grep -qE '^[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+$' && { echo DNS_OK; exit 0; }; ", - "echo \"$host\" | grep -qE '^\\[?[0-9a-fA-F:]+\\]?$' && { echo DNS_OK; exit 0; }; ", - "nslookup \"$host\" >/dev/null 2>&1 && echo DNS_OK || echo DNS_FAIL", - ) - .to_string(), - ], - ) - .await?; - Ok(exit_code == 0 && output.contains("DNS_OK")) -} - -async fn wait_for_namespace( - docker: &Docker, - container_name: &str, - kubeconfig: &str, - namespace: &str, -) -> Result<()> { - use miette::WrapErr; - - // Shared CPU runners can take several minutes to cold-start k3s, apply - // bundled manifests, and let the k3s Helm controller create the namespace. - let attempts = 150; - let max_backoff = std::time::Duration::from_secs(2); - let mut backoff = std::time::Duration::from_millis(200); - - // Track consecutive DNS failures. We start probing early (iteration 3, - // giving k3s a few seconds to boot) and probe every 3 iterations after - // that. Two consecutive failures are enough to abort — the nslookup - // timeout already provides a built-in retry window. 
- let dns_probe_start = 3; // skip the first few iterations while k3s boots - let dns_probe_interval = 3; // probe every N iterations after start - let dns_failure_threshold: u32 = 2; // consecutive probe failures to abort - let mut dns_consecutive_failures: u32 = 0; - - for attempt in 0..attempts { - // --- Periodic DNS health probe --- - if attempt >= dns_probe_start && (attempt - dns_probe_start) % dns_probe_interval == 0 { - match probe_container_dns(docker, container_name).await { - Ok(true) => { - dns_consecutive_failures = 0; - } - Ok(false) => { - dns_consecutive_failures += 1; - if dns_consecutive_failures >= dns_failure_threshold { - let logs = fetch_recent_logs(docker, container_name, 40).await; - return Err(miette::miette!( - "dial tcp: lookup registry: Try again\n\ - DNS resolution is failing inside the gateway container. \ - The cluster cannot pull images or create the '{namespace}' namespace \ - until DNS is fixed.\n{logs}" - )) - .wrap_err("K8s namespace not ready"); - } - } - Err(_) => { - // Exec failed — container may be restarting; don't count - // as a DNS failure. 
- } - } - } - - let exec_result = exec_capture_with_exit( - docker, - container_name, - vec![ - "sh".to_string(), - "-c".to_string(), - format!("KUBECONFIG={kubeconfig} kubectl get namespace {namespace} -o name 2>&1"), - ], - ) - .await; - - let (output, exit_code) = match exec_result { - Ok(result) => result, - Err(err) => { - if let Err(status_err) = - docker::check_container_running(docker, container_name).await - { - let logs = fetch_recent_logs(docker, container_name, 40).await; - return Err(miette::miette!( - "gateway container is not running while waiting for namespace '{namespace}': {status_err}\n{logs}" - )) - .wrap_err("K8s namespace not ready"); - } - - if attempt + 1 == attempts { - let logs = fetch_recent_logs(docker, container_name, 40).await; - return Err(miette::miette!( - "exec failed on final attempt while waiting for namespace '{namespace}': {err}\n{logs}" - )) - .wrap_err("K8s namespace not ready"); - } - tokio::time::sleep(backoff).await; - backoff = std::cmp::min(backoff.saturating_mul(2), max_backoff); - continue; - } - }; - - if exit_code == 0 && output.contains(namespace) { - return Ok(()); - } - - if attempt + 1 == attempts { - let logs = fetch_recent_logs(docker, container_name, 40).await; - return Err(miette::miette!( - "timed out waiting for namespace '{namespace}' to exist: {output}\n{logs}" - )) - .wrap_err("K8s namespace not ready"); - } - - tokio::time::sleep(backoff).await; - backoff = std::cmp::min(backoff.saturating_mul(2), max_backoff); - } - - unreachable!() -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn load_existing_pki_bundle_validates_pem_markers() { - // The PEM validation in load_existing_pki_bundle checks for "-----BEGIN " - // markers. This test verifies that generate_pki produces bundles that - // would pass that check. 
- let bundle = generate_pki(&[]).expect("generate_pki failed"); - for (label, pem) in [ - ("ca_cert", &bundle.ca_cert_pem), - ("server_cert", &bundle.server_cert_pem), - ("server_key", &bundle.server_key_pem), - ("client_cert", &bundle.client_cert_pem), - ("client_key", &bundle.client_key_pem), - ] { - assert!( - pem.contains("-----BEGIN "), - "{label} should contain PEM marker" - ); - } - } - - #[test] - fn apply_oidc_gateway_metadata_sets_explicit_values() { - let mut metadata = GatewayMetadata::default(); - apply_oidc_gateway_metadata( - &mut metadata, - false, - None, - Some("http://issuer.test/realm"), - "openshell-cli", - "openshell-api", - ); - - assert_eq!(metadata.auth_mode.as_deref(), Some("oidc")); - assert_eq!( - metadata.oidc_issuer.as_deref(), - Some("http://issuer.test/realm") - ); - assert_eq!(metadata.oidc_client_id.as_deref(), Some("openshell-cli")); - assert_eq!(metadata.oidc_audience.as_deref(), Some("openshell-api")); - } - - #[test] - fn apply_oidc_gateway_metadata_preserves_existing_oidc_on_resume() { - let mut metadata = GatewayMetadata::default(); - let existing = GatewayMetadata { - auth_mode: Some("oidc".to_string()), - oidc_issuer: Some("http://issuer.test/realm".to_string()), - oidc_client_id: Some("openshell-cli".to_string()), - oidc_audience: Some("openshell-api".to_string()), - oidc_scopes: Some("sandbox:read".to_string()), - ..GatewayMetadata::default() - }; - - apply_oidc_gateway_metadata( - &mut metadata, - true, - Some(&existing), - None, - "ignored-client", - "ignored-audience", - ); - - assert_eq!(metadata.auth_mode.as_deref(), Some("oidc")); - assert_eq!( - metadata.oidc_issuer.as_deref(), - Some("http://issuer.test/realm") - ); - assert_eq!(metadata.oidc_client_id.as_deref(), Some("openshell-cli")); - assert_eq!(metadata.oidc_audience.as_deref(), Some("openshell-api")); - assert_eq!(metadata.oidc_scopes.as_deref(), Some("sandbox:read")); - } - - #[test] - fn apply_oidc_gateway_metadata_does_not_preserve_without_resume() { - 
let mut metadata = GatewayMetadata::default(); - let existing = GatewayMetadata { - auth_mode: Some("oidc".to_string()), - oidc_issuer: Some("http://issuer.test/realm".to_string()), - oidc_client_id: Some("openshell-cli".to_string()), - oidc_audience: Some("openshell-api".to_string()), - ..GatewayMetadata::default() - }; - - apply_oidc_gateway_metadata( - &mut metadata, - false, - Some(&existing), - None, - "ignored-client", - "ignored-audience", - ); - - assert!(metadata.auth_mode.is_none()); - assert!(metadata.oidc_issuer.is_none()); - assert!(metadata.oidc_client_id.is_none()); - assert!(metadata.oidc_audience.is_none()); - } -} diff --git a/crates/openshell-bootstrap/src/metadata.rs b/crates/openshell-bootstrap/src/metadata.rs index b1bb36351..abe51335e 100644 --- a/crates/openshell-bootstrap/src/metadata.rs +++ b/crates/openshell-bootstrap/src/metadata.rs @@ -1,14 +1,13 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -use crate::RemoteOptions; use crate::paths::{active_gateway_path, gateways_dir, last_sandbox_path}; use miette::{IntoDiagnostic, Result, WrapErr}; use openshell_core::paths::ensure_parent_dir_restricted; use serde::{Deserialize, Serialize}; use std::path::PathBuf; -/// Gateway metadata stored alongside deployment info. +/// Gateway metadata stored for CLI endpoint resolution and authentication. #[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct GatewayMetadata { /// The gateway name. @@ -69,137 +68,6 @@ pub struct GatewayMetadata { /// Local VM driver state directory for standalone VM gateways. #[serde(default, skip_serializing_if = "Option::is_none")] pub vm_driver_state_dir: Option, - - /// Whether the CLI manages this gateway's full lifecycle (deploy, - /// stop, destroy). - /// - /// - `Some(true)` — deployed via `gateway start`; destroy/stop operate on - /// the underlying container or VM. 
- /// - `Some(false)` — registered via `gateway add`; destroy/stop only remove - /// the local registration metadata. - /// - `None` — legacy metadata written before this field existed; the CLI - /// falls back to the previous heuristic (`is_remote`). - #[serde(default, skip_serializing_if = "Option::is_none")] - pub client_lifecycle_managed: Option, -} - -impl GatewayMetadata { - /// Extract the host portion from the stored `gateway_endpoint` URL. - /// - /// Returns `None` if the endpoint is malformed or uses a default loopback - /// address (`127.0.0.1`, `localhost`, `::1`) — those are never meaningful - /// as a `--gateway-host` override. - pub fn gateway_host(&self) -> Option<&str> { - // Endpoint format: "https://host:port" or "http://host:port" - let after_scheme = self - .gateway_endpoint - .strip_prefix("https://") - .or_else(|| self.gateway_endpoint.strip_prefix("http://"))?; - // Strip port suffix (":8082") - let host = after_scheme - .rsplit_once(':') - .map_or(after_scheme, |(h, _)| h); - if host.is_empty() - || host == "127.0.0.1" - || host == "localhost" - || host == "::1" - || host == "[::1]" - { - return None; - } - Some(host) - } -} - -pub fn create_gateway_metadata( - name: &str, - remote: Option<&RemoteOptions>, - port: u16, -) -> GatewayMetadata { - create_gateway_metadata_with_host(name, remote, port, None, false) -} - -/// Create gateway metadata, optionally overriding the gateway host. -/// -/// When `gateway_host` is `Some`, that value is used as the host portion of -/// `gateway_endpoint` instead of the default (`127.0.0.1` for local gateways, -/// or the resolved SSH host for remote gateways). -/// -/// When `disable_tls` is `true`, the gateway endpoint uses the `http://` -/// scheme instead of `https://`. This must match the server configuration -/// so that the CLI connects with the correct protocol. 
-pub fn create_gateway_metadata_with_host( - name: &str, - remote: Option<&RemoteOptions>, - port: u16, - gateway_host: Option<&str>, - disable_tls: bool, -) -> GatewayMetadata { - let scheme = if disable_tls { "http" } else { "https" }; - - let (gateway_endpoint, is_remote, remote_host, resolved_host) = remote.map_or_else( - || { - let host = gateway_host.map_or_else( - || local_gateway_host().unwrap_or_else(|| "127.0.0.1".to_string()), - String::from, - ); - (format!("{scheme}://{host}:{port}"), false, None, None) - }, - |opts| { - // Extract the host portion from the SSH destination, then resolve it - // via `ssh -G` to get the actual hostname/IP (handles SSH config aliases). - let ssh_host = extract_host_from_ssh_destination(&opts.destination); - let resolved = resolve_ssh_hostname(&ssh_host); - let host = gateway_host.unwrap_or(&resolved); - let endpoint = format!("{scheme}://{host}:{port}"); - ( - endpoint, - true, - Some(opts.destination.clone()), - Some(resolved), - ) - }, - ); - - GatewayMetadata { - name: name.to_string(), - gateway_endpoint, - is_remote, - gateway_port: port, - remote_host, - resolved_host, - auth_mode: disable_tls.then(|| "plaintext".to_string()), - client_lifecycle_managed: Some(true), - ..Default::default() - } -} - -pub fn local_gateway_host() -> Option { - std::env::var("DOCKER_HOST") - .ok() - .and_then(|value| local_gateway_host_from_docker_host(&value)) -} - -pub fn local_gateway_host_from_docker_host(docker_host: &str) -> Option { - let target = docker_host.strip_prefix("tcp://")?; - let authority = target.split('/').next()?; - if authority.is_empty() { - return None; - } - - let host = authority - .strip_prefix('[') - .map_or_else( - || authority.split(':').next().unwrap_or(""), - |rest| rest.split(']').next().unwrap_or(""), - ) - .trim(); - - if host.is_empty() || host == "localhost" || host == "127.0.0.1" || host == "::1" { - return None; - } - - Some(host.to_string()) } fn stored_metadata_path(name: &str) -> Result { @@ 
-435,57 +303,6 @@ mod tests { ); } - #[test] - fn local_gateway_metadata() { - let meta = create_gateway_metadata("test", None, 8080); - assert_eq!(meta.name, "test"); - assert_eq!(meta.gateway_endpoint, "https://127.0.0.1:8080"); - assert_eq!(meta.gateway_port, 8080); - assert!(!meta.is_remote); - assert!(meta.remote_host.is_none()); - assert!(meta.resolved_host.is_none()); - } - - #[test] - fn local_gateway_metadata_custom_port() { - let meta = create_gateway_metadata("test", None, 9090); - assert_eq!(meta.gateway_endpoint, "https://127.0.0.1:9090"); - assert_eq!(meta.gateway_port, 9090); - } - - #[test] - fn local_gateway_host_from_docker_host_tcp_service_name() { - let host = local_gateway_host_from_docker_host("tcp://docker:2375"); - assert_eq!(host.as_deref(), Some("docker")); - } - - #[test] - fn local_gateway_host_from_docker_host_tcp_loopback() { - let host = local_gateway_host_from_docker_host("tcp://127.0.0.1:2375"); - assert!(host.is_none()); - } - - #[test] - fn local_gateway_host_from_docker_host_unix_socket() { - let host = local_gateway_host_from_docker_host("unix:///var/run/docker.sock"); - assert!(host.is_none()); - } - - #[test] - fn remote_gateway_metadata_has_resolved_host() { - let opts = RemoteOptions::new("user@10.0.0.5"); - let meta = create_gateway_metadata("test", Some(&opts), 8080); - assert!(meta.is_remote); - assert_eq!(meta.remote_host.as_deref(), Some("user@10.0.0.5")); - // When the host is a plain IP, ssh -G should resolve it to itself - assert!(meta.resolved_host.is_some()); - assert_eq!( - meta.gateway_endpoint, - format!("https://{}:8080", meta.resolved_host.as_ref().unwrap()) - ); - assert_eq!(meta.gateway_port, 8080); - } - #[test] fn metadata_roundtrip() { let meta = GatewayMetadata { @@ -519,155 +336,6 @@ mod tests { assert!(parsed.resolved_host.is_none()); } - #[test] - fn metadata_deserialize_without_client_lifecycle_managed_field() { - // Legacy metadata files won't have the client_lifecycle_managed field. 
- // Ensure backwards compatibility: defaults to None. - let json = r#"{ - "name": "test", - "gateway_endpoint": "https://127.0.0.1:8080", - "is_remote": false, - "gateway_port": 8080 - }"#; - let parsed: GatewayMetadata = serde_json::from_str(json).unwrap(); - assert_eq!(parsed.client_lifecycle_managed, None); - } - - #[test] - fn metadata_roundtrip_with_client_lifecycle_managed_field() { - let meta = GatewayMetadata { - name: "test".to_string(), - gateway_endpoint: "https://127.0.0.1:8080".to_string(), - gateway_port: 8080, - client_lifecycle_managed: Some(false), - ..Default::default() - }; - let json = serde_json::to_string(&meta).unwrap(); - assert!(json.contains(r#""client_lifecycle_managed":false"#)); - let parsed: GatewayMetadata = serde_json::from_str(&json).unwrap(); - assert_eq!(parsed.client_lifecycle_managed, Some(false)); - } - - #[test] - fn metadata_omits_client_lifecycle_managed_when_none() { - let meta = GatewayMetadata { - name: "test".to_string(), - gateway_endpoint: "https://127.0.0.1:8080".to_string(), - gateway_port: 8080, - ..Default::default() - }; - let json = serde_json::to_string(&meta).unwrap(); - assert!(!json.contains("client_lifecycle_managed")); - } - - #[test] - fn create_gateway_metadata_sets_client_lifecycle_managed_true() { - let meta = create_gateway_metadata("test", None, 8080); - assert_eq!(meta.client_lifecycle_managed, Some(true)); - } - - #[test] - fn local_gateway_metadata_with_gateway_host_override() { - let meta = create_gateway_metadata_with_host( - "test", - None, - 8080, - Some("host.docker.internal"), - false, - ); - assert_eq!(meta.name, "test"); - assert_eq!(meta.gateway_endpoint, "https://host.docker.internal:8080"); - assert_eq!(meta.gateway_port, 8080); - assert!(!meta.is_remote); - assert!(meta.remote_host.is_none()); - assert!(meta.resolved_host.is_none()); - } - - #[test] - fn local_gateway_metadata_with_no_gateway_host_override() { - // When gateway_host is None, behaviour matches create_gateway_metadata. 
- let meta = create_gateway_metadata_with_host("test", None, 8080, None, false); - assert_eq!(meta.gateway_endpoint, "https://127.0.0.1:8080"); - } - - #[test] - fn local_gateway_metadata_with_tls_disabled() { - let meta = create_gateway_metadata_with_host("test", None, 8080, None, true); - assert_eq!(meta.gateway_endpoint, "http://127.0.0.1:8080"); - assert_eq!(meta.auth_mode.as_deref(), Some("plaintext")); - } - - #[test] - fn local_gateway_metadata_with_tls_disabled_and_gateway_host() { - let meta = create_gateway_metadata_with_host( - "test", - None, - 8080, - Some("host.docker.internal"), - true, - ); - assert_eq!(meta.gateway_endpoint, "http://host.docker.internal:8080"); - assert_eq!(meta.auth_mode.as_deref(), Some("plaintext")); - } - - // ── GatewayMetadata::gateway_host() ────────────────────────────── - - #[test] - fn gateway_host_returns_custom_host() { - let meta = - create_gateway_metadata_with_host("t", None, 8082, Some("host.docker.internal"), false); - assert_eq!(meta.gateway_host(), Some("host.docker.internal")); - } - - #[test] - fn gateway_host_returns_none_for_loopback() { - let meta = create_gateway_metadata("t", None, 8080); - // Default endpoint is https://127.0.0.1:8080 - assert_eq!(meta.gateway_host(), None); - } - - #[test] - fn gateway_host_returns_none_for_localhost() { - let meta = GatewayMetadata { - name: "t".into(), - gateway_endpoint: "https://localhost:8080".into(), - gateway_port: 8080, - ..Default::default() - }; - assert_eq!(meta.gateway_host(), None); - } - - #[test] - fn gateway_host_returns_ip_for_remote() { - let meta = GatewayMetadata { - name: "t".into(), - gateway_endpoint: "https://10.0.0.5:8080".into(), - is_remote: true, - gateway_port: 8080, - remote_host: Some("user@10.0.0.5".into()), - resolved_host: Some("10.0.0.5".into()), - ..Default::default() - }; - assert_eq!(meta.gateway_host(), Some("10.0.0.5")); - } - - #[test] - fn gateway_host_handles_http_scheme() { - let meta = - create_gateway_metadata_with_host("t", 
None, 8080, Some("host.docker.internal"), true); - assert_eq!(meta.gateway_host(), Some("host.docker.internal")); - } - - #[test] - fn remote_gateway_metadata_with_tls_disabled() { - let opts = RemoteOptions::new("user@10.0.0.5"); - let meta = create_gateway_metadata_with_host("test", Some(&opts), 8080, None, true); - assert!(meta.is_remote); - assert!(meta.gateway_endpoint.starts_with("http://")); - assert!(!meta.gateway_endpoint.starts_with("https://")); - assert_eq!(meta.auth_mode.as_deref(), Some("plaintext")); - } - // ── last-sandbox persistence ────────────────────────────────────── /// Helper: hold the shared XDG test lock, set `XDG_CONFIG_HOME` to a diff --git a/crates/openshell-bootstrap/src/push.rs b/crates/openshell-bootstrap/src/push.rs deleted file mode 100644 index 336d46c3e..000000000 --- a/crates/openshell-bootstrap/src/push.rs +++ /dev/null @@ -1,236 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! Push locally-built images into a k3s gateway's containerd runtime. -//! -//! This module implements the "push" path for local development: images are -//! exported from the local Docker daemon (equivalent to `docker save`), -//! uploaded into the gateway container as a tar file via the Docker -//! `put_archive` API, and then imported into containerd via `ctr images import`. -//! -//! To avoid unbounded memory usage with large images, the export is streamed -//! to a temporary file on disk, then streamed back through a tar wrapper into -//! the Docker upload API. Peak memory usage is `O(chunk_size)` regardless of -//! image size. -//! -//! The standalone `ctr` binary is used (not `k3s ctr` which may not work in -//! all k3s versions) with the k3s containerd socket. The default containerd -//! namespace in k3s is already `k8s.io`, which is what kubelet uses. 
- -use std::pin::Pin; - -use bollard::Docker; -use bollard::query_parameters::UploadToContainerOptionsBuilder; -use bytes::Bytes; -use futures::{Stream, StreamExt}; -use miette::{IntoDiagnostic, Result, WrapErr}; -use tokio::io::{AsyncReadExt, AsyncWriteExt}; - -use crate::runtime::exec_capture_with_exit; - -/// Containerd socket path inside a k3s container. -const CONTAINERD_SOCK: &str = "/run/k3s/containerd/containerd.sock"; - -/// Path inside the container where the image tar is staged. -const IMPORT_TAR_PATH: &str = "/tmp/openshell-images.tar"; - -/// Size of chunks read from the temp file during streaming upload (8 MiB). -const UPLOAD_CHUNK_SIZE: usize = 8 * 1024 * 1024; - -/// Report export progress every N bytes (100 MiB). -const PROGRESS_INTERVAL_BYTES: u64 = 100 * 1024 * 1024; - -/// Push a list of images from the local Docker daemon into a k3s gateway's -/// containerd runtime. -/// -/// All images are exported as a single tar (shared layers are deduplicated), -/// streamed to a temporary file, then uploaded to the container filesystem -/// and imported into containerd. Memory usage is bounded to `O(chunk_size)` -/// regardless of image size. -pub async fn push_local_images( - local_docker: &Docker, - gateway_docker: &Docker, - container_name: &str, - images: &[&str], - on_log: &mut impl FnMut(String), -) -> Result<()> { - if images.is_empty() { - return Ok(()); - } - - // 1. Export all images from the local Docker daemon to a temp file. - let (tmp_file, file_size) = export_to_tempfile(local_docker, images, on_log).await?; - on_log(format!( - "[progress] Exported {} MiB", - file_size / (1024 * 1024) - )); - - // 2. Stream the image tar wrapped in an outer tar archive into the - // container filesystem via the Docker put_archive API. 
- let parent_dir = IMPORT_TAR_PATH.rsplit_once('/').map_or("/", |(dir, _)| dir); - let options = UploadToContainerOptionsBuilder::default() - .path(parent_dir) - .build(); - - let upload_stream = streaming_tar_upload(IMPORT_TAR_PATH, tmp_file, file_size); - gateway_docker - .upload_to_container( - container_name, - Some(options), - bollard::body_try_stream(upload_stream), - ) - .await - .into_diagnostic() - .wrap_err("failed to upload image tar into container")?; - on_log("[progress] Uploaded to gateway".to_string()); - - // 3. Import the tar into containerd via ctr. - let (output, exit_code) = exec_capture_with_exit( - gateway_docker, - container_name, - vec![ - "ctr".to_string(), - "-a".to_string(), - CONTAINERD_SOCK.to_string(), - "-n".to_string(), - "k8s.io".to_string(), - "images".to_string(), - "import".to_string(), - IMPORT_TAR_PATH.to_string(), - ], - ) - .await?; - - if exit_code != 0 { - return Err(miette::miette!( - "ctr images import exited with code {exit_code}\n{output}" - )); - } - - // 4. Clean up the staged tar file. - let _ = exec_capture_with_exit( - gateway_docker, - container_name, - vec![ - "rm".to_string(), - "-f".to_string(), - IMPORT_TAR_PATH.to_string(), - ], - ) - .await; - - Ok(()) -} - -/// Stream the Docker image export directly to a temporary file. -/// -/// Returns the temp file handle and the total number of bytes written. -/// Memory usage is `O(chunk_size)` — only one chunk is held at a time. -/// Progress is reported every [`PROGRESS_INTERVAL_BYTES`] bytes. -async fn export_to_tempfile( - docker: &Docker, - images: &[&str], - on_log: &mut impl FnMut(String), -) -> Result<(tempfile::NamedTempFile, u64)> { - let tmp = tempfile::NamedTempFile::new() - .into_diagnostic() - .wrap_err("failed to create temp file for image export")?; - - // Open a second handle for async writing; the NamedTempFile retains - // ownership and ensures cleanup on drop. 
- let std_file = tmp - .reopen() - .into_diagnostic() - .wrap_err("failed to reopen temp file for writing")?; - let mut async_file = tokio::fs::File::from_std(std_file); - - let mut stream = docker.export_images(images); - let mut total_bytes: u64 = 0; - let mut last_reported: u64 = 0; - - while let Some(chunk) = stream.next().await { - let bytes = chunk - .into_diagnostic() - .wrap_err("failed to read image export stream")?; - async_file - .write_all(&bytes) - .await - .into_diagnostic() - .wrap_err("failed to write image data to temp file")?; - total_bytes += bytes.len() as u64; - - // Report progress periodically. - if total_bytes >= last_reported + PROGRESS_INTERVAL_BYTES { - let mb = total_bytes / (1024 * 1024); - on_log(format!("[progress] Exported {mb} MiB")); - last_reported = total_bytes; - } - } - - async_file - .flush() - .await - .into_diagnostic() - .wrap_err("failed to flush temp file")?; - - Ok((tmp, total_bytes)) -} - -/// Create a stream that yields an outer tar archive containing the image tar -/// as a single entry, reading the image data from the temp file in chunks. -/// -/// The Docker `put_archive` API expects a tar that is extracted at a target -/// directory. We construct a tar with one entry whose name is the basename -/// of `file_path`. The stream yields: -/// 1. A 512-byte GNU tar header -/// 2. The file content in [`UPLOAD_CHUNK_SIZE`] chunks -/// 3. Padding to a 512-byte boundary + two 512-byte zero EOF blocks -/// -/// Memory usage is O([`UPLOAD_CHUNK_SIZE`]) regardless of file size. -fn streaming_tar_upload( - file_path: &str, - tmp_file: tempfile::NamedTempFile, - file_size: u64, -) -> Pin> + Send>> { - let file_name = file_path - .rsplit('/') - .next() - .unwrap_or(file_path) - .to_string(); - - Box::pin(async_stream::try_stream! { - // 1. Build and yield the tar header. 
- let mut header = tar::Header::new_gnu(); - header.set_path(&file_name)?; - header.set_size(file_size); - header.set_mode(0o644); - header.set_cksum(); - yield Bytes::copy_from_slice(header.as_bytes()); - - // 2. Stream the temp file content in chunks. - let std_file = tmp_file.reopen()?; - let mut async_file = tokio::fs::File::from_std(std_file); - let mut buf = vec![0u8; UPLOAD_CHUNK_SIZE]; - loop { - let n = async_file.read(&mut buf).await?; - if n == 0 { - break; - } - yield Bytes::copy_from_slice(&buf[..n]); - } - - // 3. Yield tar padding and EOF blocks. - // Tar entries must be padded to a 512-byte boundary, followed by - // two 512-byte zero blocks to signal end-of-archive. - let padding_len = if file_size.is_multiple_of(512) { - 0 - } else { - 512 - (file_size % 512) as usize - }; - let footer = vec![0u8; padding_len + 1024]; - yield Bytes::from(footer); - - // The NamedTempFile is dropped here, cleaning up the temp file. - drop(tmp_file); - }) -} diff --git a/crates/openshell-bootstrap/src/runtime.rs b/crates/openshell-bootstrap/src/runtime.rs deleted file mode 100644 index 5bbfb5b6c..000000000 --- a/crates/openshell-bootstrap/src/runtime.rs +++ /dev/null @@ -1,690 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -use crate::constants::{KUBECONFIG_PATH, container_name, node_name}; -use bollard::Docker; -use bollard::container::LogOutput; -use bollard::exec::CreateExecOptions; -use bollard::models::HealthStatusEnum; -use bollard::query_parameters::{InspectContainerOptions, LogsOptionsBuilder}; -use futures::StreamExt; -use miette::{IntoDiagnostic, Result}; -use std::collections::VecDeque; -use std::time::Duration; -use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender, unbounded_channel}; - -/// Log markers emitted by the entrypoint and health-check scripts when DNS -/// resolution fails inside the container. 
Detecting these early lets us -/// short-circuit the 6-minute polling loop and surface a clear diagnosis. -const DNS_FAILURE_MARKERS: &[&str] = &["DNS_PROBE_FAILED", "HEALTHCHECK_DNS_FAILURE"]; - -/// Log marker emitted by the health-check script when a Kubernetes node is -/// under resource pressure (`DiskPressure`, `MemoryPressure`, `PIDPressure`). -/// When a node has pressure conditions the kubelet evicts pods and rejects -/// new scheduling, so the cluster will never become healthy on its own. -const NODE_PRESSURE_MARKER: &str = "HEALTHCHECK_NODE_PRESSURE"; - -/// Log marker emitted by the health-check script when the sandbox supervisor -/// binary (`/opt/openshell/bin/openshell-sandbox`) is missing from the node -/// filesystem. Without this binary, every sandbox pod will crash immediately -/// with "no such file or directory". This is a permanent error that requires -/// rebuilding or updating the cluster image. -const MISSING_SUPERVISOR_MARKER: &str = "HEALTHCHECK_MISSING_SUPERVISOR"; - -/// Number of consecutive polling iterations that must observe DNS failure -/// markers before we treat the failure as persistent and abort. A small -/// grace period avoids false positives on transient hiccups during startup. -const DNS_FAILURE_GRACE_ITERATIONS: u32 = 5; - -/// Number of consecutive polling iterations that must observe node pressure -/// markers before aborting. Slightly longer grace period than DNS since -/// transient pressure can occur during image extraction on startup. 
-const NODE_PRESSURE_GRACE_ITERATIONS: u32 = 8; - -pub async fn wait_for_gateway_ready(docker: &Docker, name: &str, mut on_log: F) -> Result<()> -where - F: FnMut(String) + Send, -{ - let container_name = container_name(name); - let (log_tx, mut log_rx) = unbounded_channel(); - let log_docker = docker.clone(); - let log_container_name = container_name.clone(); - let log_task = tokio::spawn(async move { - stream_container_logs(&log_docker, &log_container_name, &log_tx).await; - }); - - let mut recent_logs = VecDeque::with_capacity(15); - let attempts = 180; - let mut result = None; - let mut dns_failure_seen_count: u32 = 0; - let mut node_pressure_seen_count: u32 = 0; - - for attempt in 0..attempts { - drain_logs(&mut log_rx, &mut recent_logs, &mut on_log); - - // -- Early DNS failure detection --------------------------------- - // Check recent logs for DNS failure markers emitted by the - // entrypoint or health-check scripts. If the marker persists for - // several consecutive iterations the DNS proxy is broken and - // waiting longer won't help. - let dns_failing = recent_logs - .iter() - .any(|line| DNS_FAILURE_MARKERS.iter().any(|m| line.contains(m))); - if dns_failing { - dns_failure_seen_count += 1; - if dns_failure_seen_count >= DNS_FAILURE_GRACE_ITERATIONS { - result = Some(Err(miette::miette!( - "dial tcp: lookup registry: Try again\n\ - DNS resolution is failing inside the gateway container.\n{}", - format_recent_logs(&recent_logs) - ))); - break; - } - } else { - dns_failure_seen_count = 0; - } - - // -- Early node pressure detection ------------------------------- - // Check for DiskPressure / MemoryPressure / PIDPressure markers - // emitted by the health-check script. Under pressure the kubelet - // evicts pods and blocks new scheduling, so waiting won't help. 
- let pressure_lines: Vec<&str> = recent_logs - .iter() - .filter(|line| line.contains(NODE_PRESSURE_MARKER)) - .map(String::as_str) - .collect(); - if pressure_lines.is_empty() { - node_pressure_seen_count = 0; - } else { - node_pressure_seen_count += 1; - if node_pressure_seen_count >= NODE_PRESSURE_GRACE_ITERATIONS { - // Extract the specific pressure type(s) from the marker lines - let conditions: Vec = pressure_lines - .iter() - .filter_map(|line| { - line.find(NODE_PRESSURE_MARKER) - .map(|pos| &line[pos + NODE_PRESSURE_MARKER.len()..]) - .map(|rest| rest.trim_start_matches(':').trim().to_string()) - }) - .filter(|s| !s.is_empty()) - .collect(); - let condition_list = if conditions.is_empty() { - "unknown pressure condition".to_string() - } else { - conditions.join(", ") - }; - result = Some(Err(miette::miette!( - "HEALTHCHECK_NODE_PRESSURE: {condition_list}\n\ - The cluster node is under resource pressure. \ - The kubelet is evicting pods and rejecting new scheduling.\n{}", - format_recent_logs(&recent_logs) - ))); - break; - } - } - - // -- Missing supervisor binary detection ---------------------------- - // The health-check script verifies that /opt/openshell/bin/openshell-sandbox - // exists on the node filesystem. If missing, every sandbox pod will crash. - // This is a permanent error — fail immediately with actionable guidance. - if recent_logs - .iter() - .any(|line| line.contains(MISSING_SUPERVISOR_MARKER)) - { - result = Some(Err(miette::miette!( - "The sandbox supervisor binary is missing from the cluster image.\n\ - The file /opt/openshell/bin/openshell-sandbox was not found in the gateway \ - container. Without it, sandbox pods cannot start.\n\n\ - This usually means the cluster image was built or published without the \ - staged prebuilt openshell-sandbox binary.\n\n\ - To fix:\n \ - 1. Rebuild the cluster image: mise run docker:build:cluster\n \ - 2. Or update to a cluster image that includes the supervisor binary\n \ - 3. 
Then recreate the gateway: openshell gateway destroy && openshell gateway start\n\n{}", - format_recent_logs(&recent_logs) - ))); - break; - } - - let inspect = docker - .inspect_container(&container_name, None::) - .await - .into_diagnostic()?; - - // Check if the container has exited before checking health - let running = inspect - .state - .as_ref() - .and_then(|s| s.running) - .unwrap_or(false); - if !running { - drain_logs(&mut log_rx, &mut recent_logs, &mut on_log); - let exit_code = inspect - .state - .as_ref() - .and_then(|s| s.exit_code) - .unwrap_or(-1); - let error_msg = inspect - .state - .as_ref() - .and_then(|s| s.error.as_deref()) - .unwrap_or(""); - let mut detail = - format!("gateway container exited unexpectedly (exit_code={exit_code})"); - if !error_msg.is_empty() { - use std::fmt::Write; - let _ = write!(detail, ", error={error_msg}"); - } - result = Some(Err(miette::miette!( - "{detail}\n{}", - format_recent_logs(&recent_logs) - ))); - break; - } - - let status = inspect - .state - .and_then(|state| state.health) - .and_then(|health| health.status); - - match status { - Some(HealthStatusEnum::HEALTHY) => { - result = Some(Ok(())); - break; - } - Some(HealthStatusEnum::UNHEALTHY) if attempt + 1 == attempts => { - result = Some(Err(miette::miette!( - "gateway health check reported unhealthy\n{}", - format_recent_logs(&recent_logs) - ))); - break; - } - Some(HealthStatusEnum::NONE | HealthStatusEnum::EMPTY) | None if attempt == 0 => { - result = Some(Err(miette::miette!( - "gateway container does not expose a health check\n{}", - format_recent_logs(&recent_logs) - ))); - break; - } - _ => {} - } - - tokio::time::sleep(Duration::from_secs(2)).await; - } - - if result.is_none() { - drain_logs(&mut log_rx, &mut recent_logs, &mut on_log); - result = Some(Err(miette::miette!( - "timed out waiting for gateway health check\n{}", - format_recent_logs(&recent_logs) - ))); - } - - log_task.abort(); - - result.unwrap_or_else(|| Err(miette::miette!("gateway 
health status unavailable"))) -} - -async fn stream_container_logs( - docker: &Docker, - container_name: &str, - tx: &UnboundedSender, -) { - let options = LogsOptionsBuilder::new() - .follow(true) - .stdout(true) - .stderr(true) - .tail("0") - .build(); - let mut stream = docker.logs(container_name, Some(options)); - - let mut stdout_partial = String::new(); - let mut stderr_partial = String::new(); - let mut console_partial = String::new(); - while let Some(item) = stream.next().await { - match item { - Ok(log) => match log { - LogOutput::StdOut { message } => { - append_log_chunk(tx, &mut stdout_partial, &String::from_utf8_lossy(&message)); - } - LogOutput::StdErr { message } => { - append_log_chunk(tx, &mut stderr_partial, &String::from_utf8_lossy(&message)); - } - LogOutput::Console { message } => { - append_log_chunk(tx, &mut console_partial, &String::from_utf8_lossy(&message)); - } - LogOutput::StdIn { .. } => {} - }, - Err(err) => { - let _ = tx.send(format!("[log stream error] {err}")); - return; - } - } - } - - flush_partial(tx, &mut stdout_partial); - flush_partial(tx, &mut stderr_partial); - flush_partial(tx, &mut console_partial); -} - -fn append_log_chunk(tx: &UnboundedSender, partial: &mut String, chunk: &str) { - partial.push_str(chunk); - while let Some(pos) = partial.find('\n') { - let line = partial[..pos].trim_end_matches('\r').to_string(); - if !line.is_empty() { - let _ = tx.send(line); - } - partial.drain(..=pos); - } -} - -fn flush_partial(tx: &UnboundedSender, partial: &mut String) { - let line = partial.trim(); - if !line.is_empty() { - let _ = tx.send(line.to_string()); - } - partial.clear(); -} - -fn drain_logs( - rx: &mut UnboundedReceiver, - recent_logs: &mut VecDeque, - on_log: &mut F, -) where - F: FnMut(String), -{ - while let Ok(line) = rx.try_recv() { - if recent_logs.len() == 15 { - recent_logs.pop_front(); - } - recent_logs.push_back(line.clone()); - on_log(line); - } -} - -fn format_recent_logs(recent_logs: &VecDeque) -> String 
{ - if recent_logs.is_empty() { - return "container logs: none received".to_string(); - } - - let mut rendered = String::from("container logs:"); - for line in recent_logs { - rendered.push('\n'); - rendered.push_str(" "); - rendered.push_str(line); - } - rendered -} - -/// Fetch the last `n` lines of container logs (non-streaming, for error context). -pub async fn fetch_recent_logs(docker: &Docker, container_name: &str, n: usize) -> String { - let options = LogsOptionsBuilder::new() - .follow(false) - .stdout(true) - .stderr(true) - .tail(&n.to_string()) - .build(); - let mut stream = docker.logs(container_name, Some(options)); - - let mut lines = Vec::new(); - while let Some(item) = stream.next().await { - match item { - Ok(log) => { - let text = match log { - LogOutput::StdOut { message } - | LogOutput::StdErr { message } - | LogOutput::Console { message } => { - String::from_utf8_lossy(&message).to_string() - } - LogOutput::StdIn { .. } => continue, - }; - for line in text.lines() { - let trimmed = line.trim(); - if !trimmed.is_empty() { - lines.push(trimmed.to_string()); - } - } - } - Err(_) => break, - } - } - - if lines.is_empty() { - return "container logs: none available".to_string(); - } - - let mut rendered = String::from("container logs:"); - for line in &lines { - rendered.push('\n'); - rendered.push_str(" "); - rendered.push_str(line); - } - rendered -} - -/// Remove stale k3s nodes and their orphaned pods from a resumed cluster. -/// -/// When a cluster container is recreated but the volume is reused, k3s registers -/// a new node (using the container ID as the hostname) while old node entries -/// persist in etcd. Pods scheduled on those stale `NotReady` nodes will never run, -/// causing health checks to fail. -/// -/// This function retries with backoff until `kubectl` becomes available (k3s may -/// still be initialising), then: -/// 1. Deletes all `NotReady` nodes so k3s stops tracking them. -/// 2. 
Force-deletes any pods stuck in `Terminating` so `StatefulSets` and -/// Deployments can reschedule replacements on the current (Ready) node. -/// -/// Returns the number of stale nodes removed. -pub async fn clean_stale_nodes(docker: &Docker, name: &str) -> Result { - // Retry until kubectl is responsive. k3s can take 10-20 s to start the - // API server after a container restart, so we allow up to ~45 s. - const MAX_ATTEMPTS: u32 = 15; - const RETRY_DELAY: Duration = Duration::from_secs(3); - - let container_name = container_name(name); - let mut stale_nodes: Vec = Vec::new(); - - // Determine the current node name. With the deterministic `--node-name` - // entrypoint change the k3s node is `openshell-{gateway}`. However, older - // cluster images (built before that change) still use the container hostname - // (= Docker container ID) as the node name. We must handle both: - // - // 1. If the expected deterministic name appears in the node list, use it. - // 2. Otherwise fall back to the container hostname (old behaviour). - // - // This ensures backward compatibility during upgrades where the bootstrap - // CLI is newer than the cluster image. - let deterministic_node = node_name(name); - - for attempt in 1..=MAX_ATTEMPTS { - let (output, exit_code) = exec_capture_with_exit( - docker, - &container_name, - vec![ - "sh".to_string(), - "-c".to_string(), - format!( - "KUBECONFIG={KUBECONFIG_PATH} kubectl get nodes \ - --no-headers -o custom-columns=NAME:.metadata.name \ - 2>/dev/null" - ), - ], - ) - .await?; - - if exit_code == 0 { - let all_nodes: Vec<&str> = output - .lines() - .map(str::trim) - .filter(|l| !l.is_empty()) - .collect(); - - // Pick the current node identity: prefer the deterministic name, - // fall back to the container hostname for older cluster images. - let current_node = if all_nodes.contains(&deterministic_node.as_str()) { - deterministic_node.clone() - } else { - // Older cluster image without --node-name: read hostname. 
- let (hostname_out, _) = - exec_capture_with_exit(docker, &container_name, vec!["hostname".to_string()]) - .await?; - hostname_out.trim().to_string() - }; - - stale_nodes = all_nodes - .into_iter() - .filter(|n| *n != current_node) - .map(ToString::to_string) - .collect(); - break; - } - - if attempt < MAX_ATTEMPTS { - tracing::debug!( - "kubectl not ready yet (attempt {attempt}/{MAX_ATTEMPTS}), retrying in {}s", - RETRY_DELAY.as_secs() - ); - tokio::time::sleep(RETRY_DELAY).await; - } - } - - if stale_nodes.is_empty() { - return Ok(0); - } - - let node_list = stale_nodes.join(" "); - let count = stale_nodes.len(); - tracing::info!("removing {} stale node(s): {}", count, node_list); - - // Step 1: delete the stale node objects. - let (_output, exit_code) = exec_capture_with_exit( - docker, - &container_name, - vec![ - "sh".to_string(), - "-c".to_string(), - format!( - "KUBECONFIG={KUBECONFIG_PATH} kubectl delete node {node_list} --ignore-not-found" - ), - ], - ) - .await?; - - if exit_code != 0 { - tracing::warn!("failed to delete stale nodes (exit code {exit_code})"); - } - - // Step 2: force-delete pods stuck in Terminating. After the stale node is - // removed, pods that were scheduled on it transition to Terminating but - // will never complete graceful shutdown (the node is gone). StatefulSets - // will not create a replacement until the old pod is fully deleted. 
- let (_output, exit_code) = exec_capture_with_exit( - docker, - &container_name, - vec![ - "sh".to_string(), - "-c".to_string(), - format!( - "KUBECONFIG={KUBECONFIG_PATH} kubectl get pods --all-namespaces \ - --field-selector=status.phase=Running -o name 2>/dev/null; \ - for pod_line in $(KUBECONFIG={KUBECONFIG_PATH} kubectl get pods --all-namespaces \ - --no-headers 2>/dev/null | awk '$4 == \"Terminating\" {{print $1\"/\"$2}}'); do \ - ns=${{pod_line%%/*}}; pod=${{pod_line#*/}}; \ - KUBECONFIG={KUBECONFIG_PATH} kubectl delete pod \"$pod\" -n \"$ns\" \ - --force --grace-period=0 --ignore-not-found 2>/dev/null; \ - done" - ), - ], - ) - .await?; - - if exit_code != 0 { - tracing::debug!( - "force-delete of terminating pods returned exit code {exit_code} (non-fatal)" - ); - } - - // Step 3: delete PersistentVolumeClaims in the openshell namespace whose - // backing PV has node affinity for a stale node. local-path-provisioner - // creates PVs tied to the original node; when the node changes, the PV is - // unschedulable and the `StatefulSet` pod stays Pending. Deleting the PVC - // (and its PV) lets the provisioner create a fresh one on the current node. 
- let (_output, exit_code) = exec_capture_with_exit( - docker, - &container_name, - vec![ - "sh".to_string(), - "-c".to_string(), - format!( - r#"KUBECONFIG={KUBECONFIG_PATH}; export KUBECONFIG; \ - CURRENT_NODE=$(kubectl get nodes --no-headers -o custom-columns=NAME:.metadata.name 2>/dev/null | head -1); \ - [ -z "$CURRENT_NODE" ] && exit 0; \ - for pv in $(kubectl get pv -o jsonpath='{{.items[*].metadata.name}}' 2>/dev/null); do \ - NODE=$(kubectl get pv "$pv" -o jsonpath='{{.spec.nodeAffinity.required.nodeSelectorTerms[0].matchExpressions[0].values[0]}}' 2>/dev/null); \ - [ "$NODE" = "$CURRENT_NODE" ] && continue; \ - NS=$(kubectl get pv "$pv" -o jsonpath='{{.spec.claimRef.namespace}}' 2>/dev/null); \ - PVC=$(kubectl get pv "$pv" -o jsonpath='{{.spec.claimRef.name}}' 2>/dev/null); \ - [ -n "$PVC" ] && kubectl delete pvc "$PVC" -n "$NS" --ignore-not-found 2>/dev/null; \ - kubectl delete pv "$pv" --ignore-not-found 2>/dev/null; \ - done"# - ), - ], - ) - .await?; - - if exit_code != 0 { - tracing::debug!("PV/PVC cleanup returned exit code {exit_code} (non-fatal)"); - } - - Ok(count) -} - -/// Restart the openshell workload so pods pick up updated images or secrets. -/// -/// Probes for a `StatefulSet` first, then falls back to a `Deployment`, matching -/// the same detection pattern used by `cluster-deploy-fast.sh`. -pub async fn restart_openshell_deployment(docker: &Docker, name: &str) -> Result<()> { - let cname = container_name(name); - - // Detect which workload kind exists in the cluster. 
- let workload_kind = detect_openshell_workload_kind(docker, &cname).await?; - let workload_ref = format!("{workload_kind}/openshell"); - - let (restart_output, restart_exit) = exec_capture_with_exit( - docker, - &cname, - vec![ - "sh".to_string(), - "-c".to_string(), - format!( - "KUBECONFIG={KUBECONFIG_PATH} kubectl rollout restart {workload_ref} -n openshell" - ), - ], - ) - .await?; - if restart_exit != 0 { - return Err(miette::miette!( - "failed to restart openshell {workload_ref} (exit code {restart_exit})\n{restart_output}" - )); - } - - let (status_output, status_exit) = exec_capture_with_exit( - docker, - &cname, - vec![ - "sh".to_string(), - "-c".to_string(), - format!( - "KUBECONFIG={KUBECONFIG_PATH} kubectl rollout status {workload_ref} -n openshell --timeout=180s" - ), - ], - ) - .await?; - if status_exit != 0 { - return Err(miette::miette!( - "openshell rollout status failed for {workload_ref} (exit code {status_exit})\n{status_output}" - )); - } - - Ok(()) -} - -/// Check whether an openshell workload exists in the cluster (`StatefulSet` or `Deployment`). -pub async fn openshell_workload_exists(docker: &Docker, name: &str) -> Result { - let cname = container_name(name); - match detect_openshell_workload_kind(docker, &cname).await { - Ok(_) => Ok(true), - Err(_) => Ok(false), - } -} - -/// Detect whether openshell is deployed as a `StatefulSet` or `Deployment`. -/// Returns "statefulset" or "deployment". 
-async fn detect_openshell_workload_kind(docker: &Docker, container_name: &str) -> Result { - // Check StatefulSet first (primary workload type for fresh deploys) - let (_, ss_exit) = exec_capture_with_exit( - docker, - container_name, - vec![ - "sh".to_string(), - "-c".to_string(), - format!( - "KUBECONFIG={KUBECONFIG_PATH} kubectl get statefulset/openshell -n openshell -o name 2>/dev/null" - ), - ], - ) - .await?; - if ss_exit == 0 { - return Ok("statefulset".to_string()); - } - - // Fall back to Deployment - let (_, dep_exit) = exec_capture_with_exit( - docker, - container_name, - vec![ - "sh".to_string(), - "-c".to_string(), - format!( - "KUBECONFIG={KUBECONFIG_PATH} kubectl get deployment/openshell -n openshell -o name 2>/dev/null" - ), - ], - ) - .await?; - if dep_exit == 0 { - return Ok("deployment".to_string()); - } - - Err(miette::miette!( - "no openshell workload (statefulset or deployment) found in namespace 'openshell'" - )) -} - -pub async fn exec_capture_with_exit( - docker: &Docker, - container_name: &str, - cmd: Vec, -) -> Result<(String, i64)> { - let exec = docker - .create_exec( - container_name, - CreateExecOptions { - attach_stdout: Some(true), - attach_stderr: Some(true), - cmd: Some(cmd), - ..Default::default() - }, - ) - .await - .into_diagnostic()? - .id; - - let start = docker.start_exec(&exec, None).await.into_diagnostic()?; - let mut buffer = String::new(); - if let bollard::exec::StartExecResults::Attached { mut output, .. } = start { - while let Some(item) = output.next().await { - let log = item.into_diagnostic()?; - match log { - LogOutput::StdOut { message } - | LogOutput::StdErr { message } - | LogOutput::Console { message } => { - buffer.push_str(&String::from_utf8_lossy(&message)); - } - LogOutput::StdIn { .. 
} => {} - } - } - } - - let mut exit_code = None; - for _ in 0..20 { - let inspect = docker.inspect_exec(&exec).await.into_diagnostic()?; - if let Some(code) = inspect.exit_code { - exit_code = Some(code); - break; - } - tokio::time::sleep(Duration::from_millis(200)).await; - } - - Ok((buffer, exit_code.unwrap_or(1))) -} diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs deleted file mode 100644 index fcb7744ab..000000000 --- a/crates/openshell-cli/src/bootstrap.rs +++ /dev/null @@ -1,367 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! Auto-bootstrap helpers for sandbox creation. -//! -//! When `sandbox create` cannot reach a gateway, these helpers determine whether -//! to attempt gateway bootstrap and execute the local or remote bootstrap flow. -//! Bootstrap proceeds automatically unless the user opts out with `--no-bootstrap`. - -use std::time::Duration; - -use crate::tls::{TlsOptions, grpc_client}; -use miette::Result; -use owo_colors::OwoColorize; - -use crate::run::{deploy_gateway_with_panel, print_deploy_summary}; - -/// Default gateway name used during auto-bootstrap. -const DEFAULT_GATEWAY_NAME: &str = "openshell"; - -/// Determines if a gRPC connection error indicates the gateway is unreachable -/// and bootstrap should be offered. -/// -/// Returns `true` for connectivity errors (connection refused, timeout, DNS failure) -/// and for missing default TLS materials (which implies no gateway has been deployed). -/// -/// Returns `false` for explicit TLS configuration errors, auth failures, and other -/// non-connectivity issues. -pub fn should_attempt_bootstrap(error: &miette::Report, tls: &TlsOptions) -> bool { - // If TLS paths were explicitly provided (e.g. in tests) and they failed, - // that's a configuration error, not a missing-gateway situation. 
- if tls.has_any() { - return is_connectivity_error(error); - } - - // With no explicit TLS options, missing default cert files strongly implies - // no gateway has been bootstrapped yet. - let msg = format!("{error:?}"); - if is_missing_tls_material(&msg) { - return true; - } - - is_connectivity_error(error) -} - -/// Check if the error message indicates missing TLS material files at default paths. -fn is_missing_tls_material(msg: &str) -> bool { - let lower = msg.to_lowercase(); - // require_tls_materials fails with "failed to read TLS ..." when cert files are absent - (lower.contains("failed to read tls") || lower.contains("tls ca is required")) - && (lower.contains("no such file") - || lower.contains("not found") - || lower.contains("is required")) -} - -/// Check if the error represents a network connectivity failure. -fn is_connectivity_error(error: &miette::Report) -> bool { - let msg = format!("{error:?}"); - let lower = msg.to_lowercase(); - - // Connection-level failures - let connectivity_patterns = [ - "connection refused", - "connect error", - "tcp connect", - "dns error", - "name resolution", - "no route to host", - "network unreachable", - "connection reset", - "broken pipe", - "connection timed out", - "operation timed out", - ]; - - // TLS/auth errors that should NOT trigger bootstrap - let non_connectivity_patterns = [ - "certificate", - "handshake", - "ssl", - "tls error", - "authorization", - "authentication", - "permission denied", - "forbidden", - "unauthorized", - ]; - - // If any non-connectivity pattern matches, don't offer bootstrap - if non_connectivity_patterns.iter().any(|p| lower.contains(p)) { - return false; - } - - // Check for connectivity patterns - connectivity_patterns.iter().any(|p| lower.contains(p)) -} - -/// Decide whether gateway bootstrap should proceed. -/// -/// When `override_value` is `Some(false)` (from `--no-bootstrap`), returns -/// `false` to skip bootstrap. 
Otherwise returns `true` — a gateway is created -/// automatically without prompting the user. -pub fn confirm_bootstrap(override_value: Option) -> Result { - if override_value == Some(false) { - return Ok(false); - } - Ok(true) -} - -/// Resolve the gateway name for bootstrap. -/// -/// Respects `$OPENSHELL_GATEWAY` if set, otherwise falls back to the default. -fn resolve_bootstrap_name() -> String { - std::env::var("OPENSHELL_GATEWAY") - .ok() - .filter(|v| !v.trim().is_empty()) - .unwrap_or_else(|| DEFAULT_GATEWAY_NAME.to_string()) -} - -/// Bootstrap a local gateway and return refreshed TLS options that pick up the -/// newly-written mTLS certificates, along with the gateway name used. -pub async fn run_bootstrap( - remote: Option<&str>, - ssh_key: Option<&str>, - gpu: bool, -) -> Result<(TlsOptions, String, String)> { - let gateway_name = resolve_bootstrap_name(); - let location = if remote.is_some() { "remote" } else { "local" }; - - eprintln!(); - eprintln!( - "{} No gateway found — starting one automatically.", - "ℹ".cyan().bold() - ); - eprintln!(); - eprintln!(" The Gateway provides a secure control plane for OpenShell. It streamlines"); - eprintln!(" access for humans and agents alike — handles sandbox orchestration, and"); - eprintln!(" enables secure, concurrent agent workflows."); - eprintln!(); - eprintln!( - " Manage it later with: {} or {}", - "openshell status".bold(), - "openshell gateway stop".bold(), - ); - eprintln!(); - - // Build deploy options. The deploy flow auto-resumes from existing state - // (preserving sandboxes and secrets) when it finds an existing gateway. - // If the initial attempt fails, fall back to a full recreate. 
- let build_options = |recreate: bool| { - let mut opts = openshell_bootstrap::DeployOptions::new(&gateway_name) - .with_recreate(recreate) - .with_gpu(if gpu { - vec!["auto".to_string()] - } else { - vec![] - }); - if let Some(dest) = remote { - let mut remote_opts = openshell_bootstrap::RemoteOptions::new(dest); - if let Some(key) = ssh_key { - remote_opts = remote_opts.with_ssh_key(key); - } - opts = opts.with_remote(remote_opts); - } - // Read registry credentials from environment for the auto-bootstrap path. - // The explicit `--registry-username` / `--registry-token` flags are only - // on `gateway start`; when bootstrapping via `sandbox create`, the env - // vars are the mechanism. - if let Ok(username) = std::env::var("OPENSHELL_REGISTRY_USERNAME") - && !username.trim().is_empty() - { - opts = opts.with_registry_username(username); - } - if let Ok(token) = std::env::var("OPENSHELL_REGISTRY_TOKEN") - && !token.trim().is_empty() - { - opts = opts.with_registry_token(token); - } - // Read gateway host override from environment. Needed whenever the - // client cannot reach the Docker host at 127.0.0.1 — CI containers, - // WSL, remote Docker hosts, etc. The explicit `--gateway-host` flag - // is only on `gateway start`; this env var covers the auto-bootstrap - // path triggered by `sandbox create`. - if let Ok(host) = std::env::var("OPENSHELL_GATEWAY_HOST") - && !host.trim().is_empty() - { - opts = opts.with_gateway_host(host); - } - opts - }; - - // Deploy the gateway. The deploy flow auto-resumes from existing state - // when it finds one. If that fails, fall back to a full recreate. - let handle = match Box::pin(deploy_gateway_with_panel( - build_options(false), - &gateway_name, - location, - )) - .await - { - Ok(handle) => handle, - Err(resume_err) => { - tracing::warn!("auto-bootstrap resume failed, falling back to recreate: {resume_err}"); - Box::pin(deploy_gateway_with_panel( - build_options(true), - &gateway_name, - location, - )) - .await? 
- } - }; - let server = handle.gateway_endpoint().to_string(); - - print_deploy_summary(&gateway_name, &handle); - - // Auto-activate the bootstrapped gateway. - if let Err(err) = openshell_bootstrap::save_active_gateway(&gateway_name) { - tracing::debug!("failed to set active gateway after bootstrap: {err}"); - } - - // Build fresh TLS options that resolve the newly-written mTLS certs from - // the default XDG path for this gateway, using the gateway name directly. - let tls = TlsOptions::default() - .with_gateway_name(&gateway_name) - .with_default_paths(&server); - - // Wait for the gateway gRPC endpoint to accept connections before - // handing back to the caller. The Docker health check may pass before - // the gRPC listener is fully ready, so retry with backoff. - wait_for_grpc_ready(&server, &tls).await?; - - Ok((tls, server, gateway_name)) -} - -/// Retry connecting to the gateway gRPC endpoint until it succeeds or a -/// timeout is reached. Uses exponential backoff starting at 500 ms, doubling -/// up to 4 s, with a total deadline of 90 s. -/// -/// The generous timeout accounts for gateway resume scenarios where stale k3s -/// nodes must be cleaned up and workload pods rescheduled before the gRPC -/// endpoint becomes available. 
-pub(crate) async fn wait_for_grpc_ready(server: &str, tls: &TlsOptions) -> Result<()> { - const MAX_WAIT: Duration = Duration::from_secs(90); - const INITIAL_BACKOFF: Duration = Duration::from_millis(500); - - let start = std::time::Instant::now(); - let mut backoff = INITIAL_BACKOFF; - let mut last_err = None; - - while start.elapsed() < MAX_WAIT { - match grpc_client(server, tls).await { - Ok(_client) => return Ok(()), - Err(err) => { - tracing::debug!( - elapsed = ?start.elapsed(), - "gateway not yet accepting connections: {err:#}" - ); - last_err = Some(err); - } - } - tokio::time::sleep(backoff).await; - backoff = (backoff * 2).min(Duration::from_secs(4)); - } - - Err(last_err - .unwrap_or_else(|| miette::miette!("timed out waiting for gateway")) - .wrap_err("gateway deployed but not accepting connections after 90 s")) -} - -#[cfg(test)] -mod tests { - use super::*; - - // -- should_attempt_bootstrap / is_connectivity_error tests -- - - fn report(msg: &str) -> miette::Report { - miette::miette!("{}", msg) - } - - #[test] - fn connection_refused_triggers_bootstrap() { - let err = report("tcp connect error: Connection refused (os error 111)"); - assert!(should_attempt_bootstrap(&err, &TlsOptions::default())); - } - - #[test] - fn dns_error_triggers_bootstrap() { - let err = report("dns error: failed to lookup address information"); - assert!(should_attempt_bootstrap(&err, &TlsOptions::default())); - } - - #[test] - fn timeout_triggers_bootstrap() { - let err = report("operation timed out"); - assert!(should_attempt_bootstrap(&err, &TlsOptions::default())); - } - - #[test] - fn no_route_triggers_bootstrap() { - let err = report("connect error: No route to host"); - assert!(should_attempt_bootstrap(&err, &TlsOptions::default())); - } - - #[test] - fn network_unreachable_triggers_bootstrap() { - let err = report("connect error: Network unreachable"); - assert!(should_attempt_bootstrap(&err, &TlsOptions::default())); - } - - #[test] - fn 
missing_default_tls_files_triggers_bootstrap() { - let err = report( - "failed to read TLS CA from /home/user/.config/openshell/clusters/openshell/mtls/ca.crt: No such file or directory", - ); - assert!(should_attempt_bootstrap(&err, &TlsOptions::default())); - } - - #[test] - fn tls_ca_required_triggers_bootstrap() { - let err = report("TLS CA is required for https endpoints"); - assert!(should_attempt_bootstrap(&err, &TlsOptions::default())); - } - - #[test] - fn certificate_error_does_not_trigger() { - let err = report("tls handshake error: certificate verify failed"); - assert!(!should_attempt_bootstrap(&err, &TlsOptions::default())); - } - - #[test] - fn auth_error_does_not_trigger() { - let err = report("authorization failed: permission denied"); - assert!(!should_attempt_bootstrap(&err, &TlsOptions::default())); - } - - #[test] - fn generic_error_does_not_trigger() { - let err = report("sandbox missing from response"); - assert!(!should_attempt_bootstrap(&err, &TlsOptions::default())); - } - - #[test] - fn explicit_tls_with_missing_files_does_not_trigger() { - // When the user explicitly provided TLS paths and they failed to read, - // that's a config error, not a missing cluster. - let tls = TlsOptions::new( - Some("/explicit/path/ca.crt".into()), - Some("/explicit/path/tls.crt".into()), - Some("/explicit/path/tls.key".into()), - ); - let err = - report("failed to read TLS CA from /explicit/path/ca.crt: No such file or directory"); - assert!(!should_attempt_bootstrap(&err, &tls)); - } - - #[test] - fn explicit_tls_with_connection_refused_triggers() { - // Even with explicit TLS, a connectivity error should still trigger bootstrap. 
- let tls = TlsOptions::new( - Some("/path/ca.crt".into()), - Some("/path/tls.crt".into()), - Some("/path/tls.key".into()), - ); - let err = report("tcp connect error: Connection refused"); - assert!(should_attempt_bootstrap(&err, &tls)); - } -} diff --git a/crates/openshell-cli/src/completers.rs b/crates/openshell-cli/src/completers.rs index d8ba3ff93..d5d9a0a88 100644 --- a/crates/openshell-cli/src/completers.rs +++ b/crates/openshell-cli/src/completers.rs @@ -178,7 +178,6 @@ mod tests { gateway_endpoint: "https://alpha.example.com".to_string(), is_remote: true, auth_mode: Some("cloudflare_jwt".to_string()), - client_lifecycle_managed: Some(false), ..Default::default() }, ) diff --git a/crates/openshell-cli/src/doctor_llm_prompt.md b/crates/openshell-cli/src/doctor_llm_prompt.md deleted file mode 100644 index fbb26a565..000000000 --- a/crates/openshell-cli/src/doctor_llm_prompt.md +++ /dev/null @@ -1,355 +0,0 @@ - - - -# Debug OpenShell Gateway - -You are diagnosing an OpenShell gateway cluster. Use **only** `openshell` CLI commands (`openshell status`, `openshell doctor logs`, `openshell doctor exec`) to inspect and fix the cluster. Do **not** use raw `docker`, `ssh`, or `kubectl` commands directly — always go through the `openshell doctor` interface. The CLI auto-resolves local vs remote gateways, so the same commands work everywhere. Run diagnostics automatically through the steps below in order. Stop and report findings as soon as a root cause is identified. - -## Tools Available - -All diagnostics go through three `openshell` commands. 
They auto-resolve local vs remote gateways — the same commands work for both: - -```bash -# Quick connectivity check (run this first) -openshell status - -# Fetch container logs -openshell doctor logs --lines 100 -openshell doctor logs --tail # stream live - -# Run any command inside the gateway container (KUBECONFIG is pre-configured) -openshell doctor exec -- kubectl get pods -A -openshell doctor exec -- kubectl -n openshell logs statefulset/openshell --tail=100 -openshell doctor exec -- cat /etc/rancher/k3s/registries.yaml -openshell doctor exec -- df -h / -openshell doctor exec -- free -h -openshell doctor exec -- sh # interactive shell -``` - -## Overview - -`openshell gateway start` creates a Docker container running k3s with the OpenShell server deployed via Helm. The deployment stages, in order, are: - -1. **Pre-deploy check**: `openshell gateway start` in interactive mode prompts to **reuse** (keep volume, clean stale nodes) or **recreate** (destroy everything, fresh start). `mise run cluster` always recreates before deploy. -2. Ensure cluster image is available (local build or remote pull) -3. Create Docker network (`openshell-cluster`) and volume (`openshell-cluster-{name}`) -4. Create and start a privileged Docker container (`openshell-cluster-{name}`) -5. Wait for k3s to generate kubeconfig (up to 60s) -6. **Clean stale nodes**: Remove any `NotReady` k3s nodes left over from previous container instances that reused the same persistent volume -7. **Prepare local images** (if `OPENSHELL_PUSH_IMAGES` is set): In `internal` registry mode, bootstrap waits for the in-cluster registry and pushes tagged images there. In `external` mode, bootstrap uses legacy `ctr -n k8s.io images import` push-mode behavior. -8. **Reconcile TLS PKI**: Load existing TLS secrets from the cluster; if missing, incomplete, or malformed, generate fresh PKI (CA + server + client certs). Apply secrets to cluster. 
If rotation happened and the OpenShell workload is already running, rollout restart and wait for completion (failed rollout aborts deploy). -9. **Store CLI mTLS credentials**: Persist client cert/key/CA locally for CLI authentication. -10. Wait for cluster health checks to pass (up to 6 min): - - k3s API server readiness (`/readyz`) - - `openshell` statefulset ready in `openshell` namespace - - TLS secrets `openshell-server-tls` and `openshell-client-tls` exist in `openshell` namespace - -For local deploys, metadata endpoint selection depends on Docker connectivity: - -- default local Docker socket (`unix:///var/run/docker.sock`): `https://127.0.0.1:{port}` (default port 8080) -- TCP Docker daemon (`DOCKER_HOST=tcp://:`): `https://:{port}` for non-loopback hosts - -The host port is configurable via `--port` on `openshell gateway start` (default 8080) and is stored in `ClusterMetadata.gateway_port`. - -The TCP host is also added as an extra gateway TLS SAN so mTLS hostname validation succeeds. - -The default cluster name is `openshell`. The container is `openshell-cluster-{name}`. - -## Workflow - -### Determine Context - -Before running commands, establish: - -1. **Cluster name**: Default is `openshell`, giving container name `openshell-cluster-openshell` -2. **Remote or local**: The `openshell doctor` commands auto-resolve this from gateway metadata — no special flags needed for the active gateway -3. **Config directory**: `~/.config/openshell/gateways/{name}/` - -### Step 0: Quick Connectivity Check - -Run `openshell status` first. 
This immediately reveals: - -- Which gateway and endpoint the CLI is targeting -- Whether the CLI can reach the server (mTLS handshake success/failure) -- The server version if connected - -Common errors at this stage: - -- **`tls handshake eof`**: The server isn't running or mTLS credentials are missing/mismatched -- **`connection refused`**: The container isn't running or port mapping is broken -- **`No gateway configured`**: No gateway has been deployed yet - -### Step 1: Check Container Logs - -Get recent container logs to identify startup failures: - -```bash -openshell doctor logs --lines 100 -``` - -Look for: - -- DNS resolution failures in the entrypoint script -- k3s startup errors (certificate issues, port binding failures) -- Manifest copy errors from `/opt/openshell/manifests/` -- `iptables` or `cgroup` errors (privilege/capability issues) - -### Step 2: Check k3s Cluster Health - -Verify k3s itself is functional: - -```bash -# API server readiness -openshell doctor exec -- kubectl get --raw="/readyz" - -# Node status -openshell doctor exec -- kubectl get nodes -o wide - -# All pods -openshell doctor exec -- kubectl get pods -A -o wide -``` - -If `/readyz` fails, k3s is still starting or has crashed. Check container logs (Step 1). - -If pods are in `CrashLoopBackOff`, `ImagePullBackOff`, or `Pending`, investigate those pods specifically. - -Also check for node pressure conditions that cause the kubelet to evict pods and reject scheduling: - -```bash -# Check node conditions (DiskPressure, MemoryPressure, PIDPressure) -openshell doctor exec -- kubectl get nodes -o jsonpath="{range .items[*]}{.metadata.name}{range .status.conditions[*]} {.type}={.status}{end}{\"\n\"}{end}" - -# Check disk usage inside the container -openshell doctor exec -- df -h / - -# Check memory usage -openshell doctor exec -- free -h -``` - -If any pressure condition is `True`, pods will be evicted and new ones rejected. 
The bootstrap detects `HEALTHCHECK_NODE_PRESSURE` markers from the health-check script and aborts early with a clear diagnosis. To fix: free disk/memory on the host, then recreate the gateway. - -### Step 3: Check OpenShell Server StatefulSet - -The OpenShell server is deployed via a HelmChart CR as a StatefulSet named `openshell` in the `openshell` namespace. Check its status: - -```bash -# StatefulSet status -openshell doctor exec -- kubectl -n openshell get statefulset/openshell -o wide - -# OpenShell pod logs -openshell doctor exec -- kubectl -n openshell logs statefulset/openshell --tail=100 - -# Describe statefulset for events -openshell doctor exec -- kubectl -n openshell describe statefulset/openshell - -# Helm install job logs (the job that installs the OpenShell chart) -openshell doctor exec -- kubectl -n kube-system logs -l job-name=helm-install-openshell --tail=200 -``` - -Common issues: - -- **Replicas 0/0**: The StatefulSet has been scaled to zero — no pods are running. This can happen after a failed deploy, manual scale-down, or Helm values misconfiguration. Fix: `openshell doctor exec -- kubectl -n openshell scale statefulset openshell --replicas=1` -- **ImagePullBackOff**: The component image failed to pull. In `internal` mode, verify internal registry readiness and pushed image tags (Step 5). In `external` mode, check `/etc/rancher/k3s/registries.yaml` credentials/endpoints and DNS (Step 8). Default external registry is `ghcr.io/nvidia/openshell/` (public, no auth required). If using a private registry, ensure `--registry-username` and `--registry-token` (or `OPENSHELL_REGISTRY_USERNAME`/`OPENSHELL_REGISTRY_TOKEN`) were provided during deploy. -- **CrashLoopBackOff**: The server is crashing. Check pod logs for the actual error. -- **Pending**: Insufficient resources or scheduling constraints. 
- -### Step 4: Check Networking - -The OpenShell server is exposed via a NodePort service on port `30051`: - -```bash -# Service status -openshell doctor exec -- kubectl -n openshell get service/openshell -``` - -Expected port: `30051/tcp` (mapped to configurable host port, default 8080; set via `--port` on deploy). - -### Step 5: Check Image Availability - -Component images (server, sandbox) can reach kubelet via two paths: - -**Local/external pull mode** (default local via `mise run cluster`): Local images are tagged to the configured local registry base (default `127.0.0.1:5000/openshell/*`), pushed to that registry, and pulled by k3s via `registries.yaml` mirror endpoint (typically `host.docker.internal:5000`). The `cluster` task pushes prebuilt local tags (`openshell/*:dev`, falling back to `localhost:5000/openshell/*:dev` or `127.0.0.1:5000/openshell/*:dev`). - -```bash -# Verify image refs currently used by openshell deployment -openshell doctor exec -- kubectl -n openshell get statefulset openshell -o jsonpath="{.spec.template.spec.containers[*].image}" - -# Verify registry mirror/auth endpoint configuration -openshell doctor exec -- cat /etc/rancher/k3s/registries.yaml -``` - -**Legacy push mode**: Images are imported into the k3s containerd `k8s.io` namespace. - -```bash -# Check if images were imported into containerd (k3s default namespace is k8s.io) -openshell doctor exec -- ctr -a /run/k3s/containerd/containerd.sock images ls | grep openshell -``` - -**External pull mode** (remote deploy, or local with `OPENSHELL_REGISTRY_HOST`/`IMAGE_REPO_BASE` pointing at a non-local registry): Images are pulled from an external registry at runtime. The entrypoint generates `/etc/rancher/k3s/registries.yaml`. 
- -```bash -# Verify registries.yaml exists and has credentials -openshell doctor exec -- cat /etc/rancher/k3s/registries.yaml - -# Test pulling an image manually from inside the cluster -openshell doctor exec -- crictl pull ghcr.io/nvidia/openshell/gateway:latest -``` - -If `registries.yaml` is missing or has wrong values, verify env wiring (`OPENSHELL_REGISTRY_HOST`, `OPENSHELL_REGISTRY_INSECURE`, username/password for authenticated registries). - -### Step 6: Check mTLS / PKI - -TLS certificates are generated by the `openshell-bootstrap` crate (using `rcgen`) and stored as K8s secrets before the Helm release installs. There is no PKI job or cert-manager — certificates are applied directly via `kubectl apply`. - -```bash -# Check if the three TLS secrets exist -openshell doctor exec -- kubectl -n openshell get secret openshell-server-tls openshell-server-client-ca openshell-client-tls - -# Inspect server cert expiry (if openssl is available in the container) -openshell doctor exec -- sh -c 'kubectl -n openshell get secret openshell-server-tls -o jsonpath="{.data.tls\.crt}" | base64 -d | openssl x509 -noout -dates 2>/dev/null || echo "openssl not available"' - -# Check if CLI-side mTLS files exist locally -ls -la ~/.config/openshell/gateways//mtls/ -``` - -On redeploy, bootstrap reuses existing secrets if they are valid PEM. If secrets are missing or malformed, fresh PKI is generated and the OpenShell workload is automatically restarted. If the rollout restart fails after rotation, the deploy aborts and CLI-side certs are not updated. Certificates use rcgen defaults (effectively never expire). 
- -If the local mTLS files are missing but the secrets exist in the cluster, you can extract them manually: - -```bash -mkdir -p ~/.config/openshell/gateways//mtls -openshell doctor exec -- kubectl -n openshell get secret openshell-client-tls -o jsonpath='{.data.ca\.crt}' | base64 -d > ~/.config/openshell/gateways//mtls/ca.crt -openshell doctor exec -- kubectl -n openshell get secret openshell-client-tls -o jsonpath='{.data.tls\.crt}' | base64 -d > ~/.config/openshell/gateways//mtls/tls.crt -openshell doctor exec -- kubectl -n openshell get secret openshell-client-tls -o jsonpath='{.data.tls\.key}' | base64 -d > ~/.config/openshell/gateways//mtls/tls.key -``` - -Common mTLS issues: - -- **Secrets missing**: The `openshell` namespace may not have been created yet (Helm controller race). Bootstrap waits up to about 5 minutes for the namespace. -- **mTLS mismatch after manual secret deletion**: Delete all three secrets and redeploy — bootstrap will regenerate and restart the workload. -- **CLI can't connect after redeploy**: Check that `~/.config/openshell/gateways//mtls/` contains `ca.crt`, `tls.crt`, `tls.key` and that they were updated at deploy time. -- **Local mTLS files missing**: The gateway was deployed but CLI credentials weren't persisted (e.g., interrupted deploy). Extract from the cluster secret as shown above. 
- -### Step 7: Check Kubernetes Events - -Events catch scheduling failures, image pull errors, and resource issues: - -```bash -openshell doctor exec -- kubectl get events -A --sort-by=.lastTimestamp | tail -n 50 -``` - -Look for: - -- `FailedScheduling` — resource constraints -- `ImagePullBackOff` / `ErrImagePull` — registry auth failure or DNS issue (check `/etc/rancher/k3s/registries.yaml`) -- `CrashLoopBackOff` — application crashes -- `OOMKilled` — memory limits too low -- `FailedMount` — volume issues - -### Step 8: Check DNS Resolution - -DNS misconfiguration is a common root cause, especially on remote/Linux hosts: - -```bash -# Check the resolv.conf k3s is using -openshell doctor exec -- cat /etc/rancher/k3s/resolv.conf - -# Test DNS resolution from inside the container -openshell doctor exec -- sh -c 'nslookup google.com || wget -q -O /dev/null http://google.com && echo "network ok" || echo "network unreachable"' -``` - -Check the entrypoint's DNS decision in the container logs: - -```bash -openshell doctor logs --lines 20 -``` - -The entrypoint script selects DNS resolvers in this priority: - -1. Viable nameservers from `/etc/resolv.conf` (not loopback/link-local) -2. Docker `ExtServers` from `/etc/resolv.conf` comments -3. Host gateway IP (Docker Desktop only, `192.168.*`) -4. Fallback to `8.8.8.8` / `8.8.4.4` - -If DNS is broken, all image pulls from the distribution registry will fail, as will pods that need external network access. 
- -## Common Failure Patterns - -| Symptom | Likely Cause | Fix | -|---------|-------------|-----| -| `tls handshake eof` from `openshell status` | Server not running or mTLS credentials missing/mismatched | Check StatefulSet replicas (Step 3) and mTLS files (Step 6) | -| StatefulSet `0/0` replicas | StatefulSet scaled to zero (failed deploy, manual scale-down, or Helm misconfiguration) | `openshell doctor exec -- kubectl -n openshell scale statefulset openshell --replicas=1` | -| Local mTLS files missing | Deploy was interrupted before credentials were persisted | Extract from cluster secret `openshell-client-tls` (Step 6) | -| Container not found | Image not built | `mise run docker:build:cluster` (local) or re-deploy (remote) | -| Container exited, OOMKilled | Insufficient memory | Increase host memory or reduce workload | -| Container exited, non-zero exit | k3s crash, port conflict, privilege issue | Check `openshell doctor logs` for details | -| `/readyz` fails | k3s still starting or crashed | Wait longer or check container logs for k3s errors | -| OpenShell pods `Pending` | Insufficient CPU/memory for scheduling, or PVC not bound | `openshell doctor exec -- kubectl describe pod -n openshell` and `openshell doctor exec -- kubectl get pvc -n openshell` | -| OpenShell pods `CrashLoopBackOff` | Server application error | `openshell doctor exec -- kubectl -n openshell logs statefulset/openshell` | -| OpenShell pods `ImagePullBackOff` (push mode) | Images not imported or wrong containerd namespace | `openshell doctor exec -- ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images ls` (Step 5) | -| OpenShell pods `ImagePullBackOff` (pull mode) | Registry auth or DNS issue | `openshell doctor exec -- cat /etc/rancher/k3s/registries.yaml` and DNS (Step 8) | -| Image import fails | Corrupt tar stream or containerd not ready | Retry after k3s is fully started; check container logs | -| Push mode images not found by kubelet | Imported into wrong containerd 
namespace | Must use `k3s ctr -n k8s.io images import`, not `k3s ctr images import` | -| mTLS secrets missing | Bootstrap couldn't apply secrets (namespace not ready) | Check deploy logs and verify `openshell` namespace exists (Step 6) | -| mTLS mismatch after redeploy | PKI rotated but workload not restarted, or rollout failed | Check that all three TLS secrets exist and that the openshell pod restarted after cert rotation (Step 6) | -| Helm install job failed | Chart values error or dependency issue | `openshell doctor exec -- kubectl -n kube-system logs -l job-name=helm-install-openshell` | -| Architecture mismatch (remote) | Built on arm64, deploying to amd64 | Cross-build the image for the target architecture | -| Port conflict | Another service on the configured gateway host port (default 8080) | Stop conflicting service or use `--port` on `openshell gateway start` to pick a different host port | -| gRPC connect refused to `127.0.0.1:443` in CI | Docker daemon is remote (`DOCKER_HOST=tcp://...`) but metadata still points to loopback | Verify metadata endpoint host matches `DOCKER_HOST` and includes non-loopback host | -| DNS failures inside container | Entrypoint DNS detection failed | `openshell doctor exec -- cat /etc/rancher/k3s/resolv.conf` and `openshell doctor logs --lines 20` | -| Node DiskPressure / MemoryPressure / PIDPressure | Insufficient disk, memory, or PIDs on host | Free disk (`docker system prune -a --volumes`), increase memory, or expand host resources | -| Pods evicted with "The node had condition: [DiskPressure]" | Host disk full, kubelet evicting pods | Free disk space on host, then `openshell gateway destroy && openshell gateway start` | -| `metrics-server` errors in logs | Normal k3s noise, not the root cause | These errors are benign — look for the actual failing health check component | -| Stale NotReady nodes from previous deploys | Volume reused across container recreations | Deploy flow auto-cleans stale nodes; if it still fails, 
manually delete NotReady nodes or choose "Recreate" when prompted | -| gRPC `UNIMPLEMENTED` for newer RPCs in push mode | Helm values still point at older pulled images instead of the pushed refs | Verify rendered `openshell-helmchart.yaml` uses the expected push refs (`server`, `sandbox`, `pki-job`) and not `:latest` | - -## Full Diagnostic Dump - -Run all diagnostics at once for a comprehensive report: - -```bash -echo "=== Connectivity Check ===" -openshell status - -echo "=== Container Logs (last 50 lines) ===" -openshell doctor logs --lines 50 - -echo "=== k3s Readiness ===" -openshell doctor exec -- kubectl get --raw='/readyz' - -echo "=== Nodes ===" -openshell doctor exec -- kubectl get nodes -o wide - -echo "=== Node Conditions ===" -openshell doctor exec -- kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{range .status.conditions[*]} {.type}={.status}{end}{"\n"}{end}' - -echo "=== Disk Usage ===" -openshell doctor exec -- df -h / - -echo "=== All Pods ===" -openshell doctor exec -- kubectl get pods -A -o wide - -echo "=== Failing Pods ===" -openshell doctor exec -- kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded - -echo "=== OpenShell StatefulSet ===" -openshell doctor exec -- kubectl -n openshell get statefulset/openshell -o wide - -echo "=== OpenShell Service ===" -openshell doctor exec -- kubectl -n openshell get service/openshell - -echo "=== TLS Secrets ===" -openshell doctor exec -- kubectl -n openshell get secret openshell-server-tls openshell-server-client-ca openshell-client-tls - -echo "=== Recent Events ===" -openshell doctor exec -- kubectl get events -A --sort-by=.lastTimestamp | tail -n 50 - -echo "=== Helm Install OpenShell Logs ===" -openshell doctor exec -- kubectl -n kube-system logs -l job-name=helm-install-openshell --tail=100 - -echo "=== Registry Configuration ===" -openshell doctor exec -- cat /etc/rancher/k3s/registries.yaml - -echo "=== DNS Configuration ===" -openshell doctor exec 
-- cat /etc/rancher/k3s/resolv.conf -``` diff --git a/crates/openshell-cli/src/lib.rs b/crates/openshell-cli/src/lib.rs index d518557b7..84a87acd2 100644 --- a/crates/openshell-cli/src/lib.rs +++ b/crates/openshell-cli/src/lib.rs @@ -9,7 +9,6 @@ pub(crate) static TEST_ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); pub mod auth; -pub mod bootstrap; pub mod completers; pub mod edge_tunnel; pub mod oidc_auth; diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 295ce98a2..d6dd11a98 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -89,14 +89,14 @@ fn resolve_gateway( miette::miette!( "No active gateway.\n\ Set one with: openshell gateway select \n\ - Or deploy a new gateway: openshell gateway start" + Or register one with: openshell gateway add " ) })?; let metadata = load_gateway_metadata(&name).map_err(|_| { miette::miette!( "Unknown gateway '{name}'.\n\ - Deploy it first: openshell gateway start --name {name}\n\ + Register it first: openshell gateway add --name {name}\n\ Or list available gateways: openshell gateway select" ) })?; @@ -107,10 +107,6 @@ fn resolve_gateway( }) } -/// Resolve only the gateway name (without requiring metadata to exist). -/// -/// Used by gateway commands that operate on a gateway by name but may not need -/// the gateway endpoint (e.g., `gateway start` creates the gateway). 
fn resolve_gateway_name(gateway_flag: &Option) -> Option { gateway_flag .clone() @@ -207,7 +203,7 @@ const HELP_TEMPLATE: &str = "\ provider: Manage provider configuration \x1b[1mGATEWAY COMMANDS\x1b[0m - gateway: Manage the gateway lifecycle + gateway: Manage gateway registrations status: Show gateway status and information inference: Manage inference configuration doctor: Diagnose gateway issues @@ -223,7 +219,7 @@ const HELP_TEMPLATE: &str = "\ \x1b[1mEXAMPLES\x1b[0m $ openshell sandbox create - $ openshell gateway start + $ openshell gateway add http://127.0.0.1:8080 --local $ openshell logs my-sandbox \x1b[1mLEARN MORE\x1b[0m @@ -317,11 +313,10 @@ const GATEWAY_EXAMPLES: &str = "\x1b[1mALIAS\x1b[0m gw \x1b[1mEXAMPLES\x1b[0m - $ openshell gateway start - $ openshell gateway start --name my-gateway --port 9090 - $ openshell gateway stop + $ openshell gateway add http://127.0.0.1:8080 --local $ openshell gateway select my-gateway $ openshell gateway info + $ openshell gateway remove my-gateway "; const INFERENCE_EXAMPLES: &str = "\x1b[1mEXAMPLES\x1b[0m @@ -335,18 +330,6 @@ const DOCTOR_HELP: &str = "\x1b[1mALIAS\x1b[0m \x1b[1mEXAMPLES\x1b[0m $ openshell doctor check - $ openshell doctor logs --lines 100 - $ openshell doctor exec -- kubectl get pods -A - $ openshell doctor llm.txt - -\x1b[1mAI AGENT USAGE\x1b[0m - If you are a coding agent (LLM) diagnosing a gateway issue, run: - - openshell doctor llm.txt - - This prints a detailed diagnostic prompt with step-by-step instructions - for debugging gateway clusters using `openshell doctor logs` and - `openshell doctor exec`. "; /// `OpenShell` CLI - agent execution and management. @@ -483,7 +466,7 @@ enum Commands { // =================================================================== // GATEWAY COMMANDS // =================================================================== - /// Manage the gateway lifecycle. + /// Manage gateway registrations. 
#[command(alias = "gw", after_help = GATEWAY_EXAMPLES, help_template = SUBCOMMAND_HELP_TEMPLATE)] Gateway { #[command(subcommand)] @@ -506,9 +489,7 @@ enum Commands { // =================================================================== /// Diagnose gateway issues. /// - /// Inspect logs, run commands inside the gateway container, and get - /// AI-assisted debugging guidance. If you are a coding agent, run - /// `openshell doctor llm.txt` for a full diagnostic prompt. + /// Check local prerequisites for gateway development. #[command(visible_alias = "dr", hide = true, after_help = DOCTOR_HELP, help_template = SUBCOMMAND_HELP_TEMPLATE)] Doctor { #[command(subcommand)] @@ -786,172 +767,20 @@ enum ProviderCommands { #[derive(Subcommand, Debug)] enum GatewayCommands { - /// Deploy/start the gateway. - #[command(help_template = LEAF_HELP_TEMPLATE, next_help_heading = "FLAGS")] - Start { - /// Gateway name. - #[arg(long, default_value = "openshell", env = "OPENSHELL_GATEWAY")] - name: String, - - /// SSH destination for remote deployment (e.g., user@hostname). - #[arg(long)] - remote: Option, - - /// Path to SSH private key for remote deployment. - #[arg(long, value_hint = ValueHint::FilePath)] - ssh_key: Option, - - /// Host port to map to the gateway (default: 8080). - #[arg(long, default_value_t = openshell_bootstrap::DEFAULT_GATEWAY_PORT)] - port: u16, - - /// Override the gateway host written into cluster metadata. - /// - /// By default, local clusters advertise 127.0.0.1. Set this when - /// the client cannot reach the Docker host at 127.0.0.1 — for - /// example in CI containers, WSL, or when Docker runs on a - /// remote host. Common values: `host.docker.internal`, a LAN IP, - /// or a hostname. - #[arg(long)] - gateway_host: Option, - - /// Destroy and recreate the gateway from scratch if one already exists. - /// - /// Without this flag, an interactive prompt asks whether to recreate; - /// in non-interactive mode the existing gateway is reused silently. 
- #[arg(long)] - recreate: bool, - - /// Listen on plaintext HTTP instead of mTLS. - /// - /// Use when the gateway sits behind a reverse proxy (e.g., Cloudflare - /// Tunnel) that terminates TLS at the edge. - #[arg(long)] - plaintext: bool, - - /// Disable gateway authentication (mTLS client certificate requirement). - /// - /// The server still listens on TLS, but clients are not required to - /// present a certificate. Use when a reverse proxy (e.g., Cloudflare - /// Tunnel) terminates TLS and cannot forward client certs. - /// Ignored when --plaintext is set. - #[arg(long)] - disable_gateway_auth: bool, - - /// Username for authenticating with the container image registry. - /// - /// Defaults to `__token__` when `--registry-token` is set (the - /// standard convention for GHCR PAT-based auth). Only needed for - /// private registries — public GHCR repos pull without auth. - #[arg(long, env = "OPENSHELL_REGISTRY_USERNAME")] - registry_username: Option, - - /// Authentication token for pulling container images from the registry. - /// - /// For GHCR, this is a GitHub personal access token (PAT) with - /// `read:packages` scope. Only needed for private registries — - /// public GHCR repos pull without auth. Used to pull the cluster - /// bootstrap image and passed into the k3s cluster so it can pull - /// server, sandbox, and community images at runtime. - #[arg(long, env = "OPENSHELL_REGISTRY_TOKEN")] - registry_token: Option, - - /// Enable NVIDIA GPU passthrough. - /// - /// Passes all host GPUs into the cluster container and deploys the - /// NVIDIA k8s-device-plugin so Kubernetes workloads can request - /// `nvidia.com/gpu` resources. Requires NVIDIA drivers and the - /// NVIDIA Container Toolkit on the host. - /// - /// When enabled, `OpenShell` auto-selects CDI when the Docker daemon has - /// CDI enabled and falls back to Docker's NVIDIA GPU request path - /// (`--gpus all`) otherwise. 
- #[arg(long)] - gpu: bool, - - /// OIDC issuer URL for JWT-based authentication. - /// When set, the K3s server will validate Bearer tokens against this issuer. - #[arg(long)] - oidc_issuer: Option, - - /// OIDC audience for the API resource server. - #[arg(long, default_value = "openshell-cli", requires = "oidc_issuer")] - oidc_audience: String, - - /// OIDC client ID stored in gateway metadata for CLI login. - #[arg(long, default_value = "openshell-cli", requires = "oidc_issuer")] - oidc_client_id: String, - - /// Dot-separated path to the roles array in the JWT claims. - #[arg(long, requires = "oidc_issuer")] - oidc_roles_claim: Option, - - /// Role name that grants admin access. - #[arg(long, requires = "oidc_issuer")] - oidc_admin_role: Option, - - /// Role name that grants standard user access. - #[arg(long, requires = "oidc_issuer")] - oidc_user_role: Option, - - /// Space-separated `OAuth2` scopes to request during OIDC login. - #[arg(long, requires = "oidc_issuer")] - oidc_scopes: Option, - - /// Dot-separated path to the scopes value in the JWT claims. - /// When set, the server enforces scope-based permissions on top of roles. - #[arg(long, requires = "oidc_issuer")] - oidc_scopes_claim: Option, - }, - - /// Stop the gateway (preserves state). - #[command(help_template = LEAF_HELP_TEMPLATE, next_help_heading = "FLAGS")] - Stop { - /// Gateway name (defaults to active gateway). - #[arg(long, env = "OPENSHELL_GATEWAY", add = ArgValueCompleter::new(completers::complete_gateway_names))] - name: Option, - - /// Override SSH destination (auto-resolved from gateway metadata). - #[arg(long)] - remote: Option, - - /// Path to SSH private key for remote gateway. - #[arg(long, value_hint = ValueHint::FilePath)] - ssh_key: Option, - }, - - /// Destroy the gateway and its state. - #[command(help_template = LEAF_HELP_TEMPLATE, next_help_heading = "FLAGS")] - Destroy { - /// Gateway name (defaults to active gateway). 
- #[arg(long, env = "OPENSHELL_GATEWAY", add = ArgValueCompleter::new(completers::complete_gateway_names))] - name: Option, - - /// Override SSH destination (auto-resolved from gateway metadata). - #[arg(long)] - remote: Option, - - /// Path to SSH private key for remote gateway. - #[arg(long, value_hint = ValueHint::FilePath)] - ssh_key: Option, - }, - /// Add an existing gateway. /// /// Registers a gateway endpoint so it appears in `openshell gateway select`. /// /// An `http://...` endpoint is treated as a direct plaintext gateway and - /// skips both mTLS certificate extraction and browser authentication. + /// skips both mTLS client certificate lookup and browser authentication. /// /// Without extra flags, an `https://...` endpoint is treated as an /// edge-authenticated (cloud) gateway and a browser is opened for /// authentication. /// - /// Pass `--remote ` to register a remote mTLS gateway whose - /// Docker daemon is reachable over SSH. Pass `--local` to register a - /// local mTLS gateway running in Docker on this machine. In both cases - /// the CLI extracts mTLS certificates from the running container - /// automatically. + /// Pass `--remote ` to register a remote mTLS gateway. Pass + /// `--local` to register a local mTLS gateway. In both cases, mTLS + /// certificates must already exist in the gateway config directory. /// /// An `ssh://` endpoint (e.g., `ssh://user@host:8080`) is shorthand /// for `--remote user@host` with the endpoint derived from the URL. @@ -970,10 +799,6 @@ enum GatewayCommands { #[arg(long, conflicts_with = "local")] remote: Option, - /// SSH private key for the remote host (used with `--remote` or `ssh://`). - #[arg(long, value_hint = ValueHint::FilePath)] - ssh_key: Option, - /// Register a local mTLS gateway running in Docker on this machine. /// With `http://...`, stores a local plaintext registration instead. 
#[arg(long, conflicts_with = "remote")] @@ -1000,6 +825,17 @@ enum GatewayCommands { oidc_scopes: Option, }, + /// Remove a local gateway registration. + /// + /// This only removes CLI metadata and stored auth tokens. It does not stop + /// or destroy the gateway service. + #[command(help_template = LEAF_HELP_TEMPLATE, next_help_heading = "FLAGS")] + Remove { + /// Gateway name (defaults to active gateway). + #[arg(add = ArgValueCompleter::new(completers::complete_gateway_names))] + name: Option, + }, + /// Authenticate with an edge-authenticated or OIDC gateway. /// /// Opens a browser for the edge proxy's login flow and stores the @@ -1033,7 +869,7 @@ enum GatewayCommands { name: Option, }, - /// Show gateway deployment details. + /// Show gateway registration details. #[command(help_template = LEAF_HELP_TEMPLATE, next_help_heading = "FLAGS")] Info { /// Gateway name (defaults to active gateway). @@ -1121,76 +957,10 @@ enum InferenceCommands { #[derive(Subcommand, Debug)] enum DoctorCommands { - /// Fetch logs from the gateway container. - #[command(help_template = LEAF_HELP_TEMPLATE, next_help_heading = "FLAGS")] - Logs { - /// Gateway name (defaults to active gateway). - #[arg(long, env = "OPENSHELL_GATEWAY")] - name: Option, - - /// Number of log lines to return (default: all). - #[arg(short, long)] - lines: Option, - - /// Stream live logs (follow mode). - #[arg(long)] - tail: bool, - - /// Override SSH destination for remote gateways. - #[arg(long)] - remote: Option, - - /// Path to SSH private key for remote gateways. - #[arg(long, value_hint = ValueHint::FilePath)] - ssh_key: Option, - }, - - /// Run a command inside the gateway container. - /// - /// Launches an interactive `docker exec` session in the gateway's k3s - /// container with KUBECONFIG pre-configured. When the gateway is remote, - /// the session is tunnelled over SSH automatically. 
- /// - /// Examples: - /// openshell doctor exec -- kubectl get pods -A - /// openshell doctor exec -- k9s - /// openshell doctor exec -- sh - #[command(help_template = LEAF_HELP_TEMPLATE, next_help_heading = "FLAGS")] - Exec { - /// Gateway name (defaults to active gateway). - #[arg(long, env = "OPENSHELL_GATEWAY")] - name: Option, - - /// Override SSH destination for remote gateways. - #[arg(long)] - remote: Option, - - /// Path to SSH private key for remote gateways. - #[arg(long, value_hint = ValueHint::FilePath)] - ssh_key: Option, - - /// Command and arguments to run inside the container. - #[arg(trailing_var_arg = true, required = true)] - command: Vec, - }, - - /// Print a diagnostic prompt for AI-assisted gateway debugging. - /// - /// Outputs a system prompt that a coding agent can use to autonomously - /// diagnose gateway issues using `openshell doctor logs` and - /// `openshell doctor exec`. - /// - /// Examples: - /// openshell doctor llm.txt - /// openshell doctor llm.txt | pbcopy - #[command(name = "llm.txt", help_template = LEAF_HELP_TEMPLATE)] - LlmTxt, - /// Validate system prerequisites for running a gateway. /// /// Checks that a Docker-compatible runtime is installed, running, and - /// reachable. Reports version info and socket path. Use this to verify - /// your environment before running `openshell gateway start`. + /// reachable. Reports version info and socket path. /// /// Examples: /// openshell doctor check @@ -1215,8 +985,8 @@ enum SandboxCommands { /// `ghcr.io/nvidia/openshell-community/sandboxes/:latest` /// (override the prefix with `OPENSHELL_COMMUNITY_REGISTRY`). /// - /// When given a Dockerfile or directory, the image is built and pushed - /// into the cluster automatically before creating the sandbox. + /// When given a Dockerfile or directory, the image is built into the + /// local Docker daemon before creating the sandbox. 
#[arg(long, value_hint = ValueHint::AnyPath)] from: Option, @@ -1248,12 +1018,8 @@ enum SandboxCommands { editor: Option, /// Request GPU resources for the sandbox. - /// - /// When no gateway is running, auto-bootstrap starts a GPU-enabled - /// gateway using the same automatic injection selection as - /// `openshell gateway start --gpu`. GPU intent is also inferred - /// automatically for known GPU-designated image names such as - /// `nvidia-gpu`. + /// GPU intent is also inferred automatically for known GPU-designated + /// image names such as `nvidia-gpu`. #[arg(long)] gpu: bool, @@ -1262,16 +1028,6 @@ enum SandboxCommands { #[arg(long, requires = "gpu")] gpu_device: Option, - /// SSH destination for remote bootstrap (e.g., user@hostname). - /// Only used when no cluster exists yet; ignored if a cluster is - /// already active. - #[arg(long, help_heading = "BOOTSTRAP FLAGS")] - remote: Option, - - /// Path to SSH private key for remote bootstrap. - #[arg(long, value_hint = ValueHint::FilePath, help_heading = "BOOTSTRAP FLAGS")] - ssh_key: Option, - /// Provider names to attach to this sandbox. #[arg(long = "provider")] providers: Vec, @@ -1297,19 +1053,6 @@ enum SandboxCommands { #[arg(long, overrides_with = "tty")] no_tty: bool, - /// Auto-bootstrap a gateway if none is available (this is the default). - #[arg( - long, - overrides_with = "no_bootstrap", - help_heading = "BOOTSTRAP FLAGS", - hide = true - )] - bootstrap: bool, - - /// Never bootstrap a gateway automatically; error if none is available. - #[arg(long, overrides_with = "bootstrap", help_heading = "BOOTSTRAP FLAGS")] - no_bootstrap: bool, - /// Auto-create missing providers from local credentials. 
/// /// Without this flag, an interactive prompt asks per-provider; @@ -1847,80 +1590,10 @@ async fn main() -> Result<()> { Some(Commands::Gateway { command: Some(command), }) => match command { - GatewayCommands::Start { - name, - remote, - ssh_key, - port, - gateway_host, - recreate, - plaintext, - disable_gateway_auth, - registry_username, - registry_token, - gpu, - oidc_issuer, - oidc_audience, - oidc_client_id, - oidc_roles_claim, - oidc_admin_role, - oidc_user_role, - oidc_scopes, - oidc_scopes_claim, - } => { - let gpu = if gpu { - vec!["auto".to_string()] - } else { - vec![] - }; - Box::pin(run::gateway_admin_deploy( - &name, - remote.as_deref(), - ssh_key.as_deref(), - port, - gateway_host.as_deref(), - recreate, - plaintext, - disable_gateway_auth, - registry_username.as_deref(), - registry_token.as_deref(), - gpu, - oidc_issuer.as_deref(), - &oidc_audience, - &oidc_client_id, - oidc_roles_claim.as_deref(), - oidc_admin_role.as_deref(), - oidc_user_role.as_deref(), - oidc_scopes.as_deref(), - oidc_scopes_claim.as_deref(), - )) - .await?; - } - GatewayCommands::Stop { - name, - remote, - ssh_key, - } => { - let name = name - .or_else(|| resolve_gateway_name(&cli.gateway)) - .unwrap_or_else(|| "openshell".to_string()); - run::gateway_admin_stop(&name, remote.as_deref(), ssh_key.as_deref()).await?; - } - GatewayCommands::Destroy { - name, - remote, - ssh_key, - } => { - let name = name - .or_else(|| resolve_gateway_name(&cli.gateway)) - .unwrap_or_else(|| "openshell".to_string()); - run::gateway_admin_destroy(&name, remote.as_deref(), ssh_key.as_deref()).await?; - } GatewayCommands::Add { endpoint, name, remote, - ssh_key, local, oidc_issuer, oidc_client_id, @@ -1931,7 +1604,6 @@ async fn main() -> Result<()> { &endpoint, name.as_deref(), remote.as_deref(), - ssh_key.as_deref(), local, oidc_issuer.as_deref(), &oidc_client_id, @@ -1940,6 +1612,18 @@ async fn main() -> Result<()> { ) .await?; } + GatewayCommands::Remove { name } => { + let name = name + 
.or_else(|| resolve_gateway_name(&cli.gateway)) + .ok_or_else(|| { + miette::miette!( + "No active gateway.\n\ + Specify a gateway name: openshell gateway remove \n\ + Or list available gateways: openshell gateway select" + ) + })?; + run::gateway_remove(&name)?; + } GatewayCommands::Login { name } => { let name = name .or_else(|| resolve_gateway_name(&cli.gateway)) @@ -1984,34 +1668,8 @@ async fn main() -> Result<()> { Some(Commands::Doctor { command: Some(command), }) => match command { - DoctorCommands::Logs { - name, - lines, - tail, - remote, - ssh_key, - } => { - let name = name - .or_else(|| resolve_gateway_name(&cli.gateway)) - .unwrap_or_else(|| "openshell".to_string()); - run::doctor_logs(&name, lines, tail, remote.as_deref(), ssh_key.as_deref()).await?; - } - DoctorCommands::Exec { - name, - remote, - ssh_key, - command, - } => { - let name = name - .or_else(|| resolve_gateway_name(&cli.gateway)) - .unwrap_or_else(|| "openshell".to_string()); - run::doctor_exec(&name, remote.as_deref(), ssh_key.as_deref(), &command)?; - } - DoctorCommands::LlmTxt => { - run::doctor_llm()?; - } DoctorCommands::Check => { - run::doctor_check().await?; + run::doctor_check()?; } }, Some(Commands::Doctor { command: None }) => { @@ -2036,8 +1694,8 @@ async fn main() -> Result<()> { println!(" {} No gateway configured.", "Status:".dimmed()); println!(); println!( - "Deploy a gateway with: {}", - "openshell gateway start".dimmed() + "Register a gateway with: {}", + "openshell gateway add ".dimmed() ); } } @@ -2472,15 +2130,11 @@ async fn main() -> Result<()> { editor, gpu, gpu_device, - remote, - ssh_key, providers, policy, forward, tty, no_tty, - bootstrap, - no_bootstrap, auto_providers, no_auto_providers, labels, @@ -2495,16 +2149,6 @@ async fn main() -> Result<()> { None // auto-detect }; - // Resolve --bootstrap / --no-bootstrap into an Option. - // Bootstrap is the default; --no-bootstrap opts out. 
- let bootstrap_override = if no_bootstrap { - Some(false) - } else if bootstrap { - Some(true) - } else { - None // auto-bootstrap (default) - }; - // Resolve --auto-providers / --no-auto-providers. let auto_providers_override = if no_auto_providers { Some(false) @@ -2539,73 +2183,30 @@ async fn main() -> Result<()> { .transpose()?; let keep = keep || !no_keep || editor.is_some() || forward.is_some(); - // For `sandbox create`, a missing cluster is not fatal — the - // bootstrap flow inside `sandbox_create` can deploy one. - match resolve_gateway(&cli.gateway, &cli.gateway_endpoint) { - Ok(ctx) => { - if remote.is_some() { - eprintln!( - "{} --remote ignored: gateway '{}' is already active. \ - To redeploy, use: openshell gateway start", - "!".yellow(), - ctx.name, - ); - return Ok(()); - } - let endpoint = &ctx.endpoint; - let mut tls = tls.with_gateway_name(&ctx.name); - apply_auth(&mut tls, &ctx.name); - // The user already has a configured gateway. Disable - // auto-bootstrap in the retry path so we don't - // silently replace their selected gateway with a new - // "openshell" gateway if the connection fails. - Box::pin(run::sandbox_create( - endpoint, - name.as_deref(), - from.as_deref(), - &ctx.name, - upload_spec.as_ref(), - keep, - gpu, - gpu_device.as_deref(), - editor, - remote.as_deref(), - ssh_key.as_deref(), - &providers, - policy.as_deref(), - forward, - &command, - tty_override, - Some(false), - auto_providers_override, - &labels_map, - &tls, - )) - .await?; - } - Err(_) => { - // No gateway configured — go straight to bootstrap. 
- Box::pin(run::sandbox_create_with_bootstrap( - name.as_deref(), - from.as_deref(), - upload_spec.as_ref(), - keep, - gpu, - gpu_device.as_deref(), - editor, - remote.as_deref(), - ssh_key.as_deref(), - &providers, - policy.as_deref(), - forward, - &command, - tty_override, - bootstrap_override, - auto_providers_override, - )) - .await?; - } - } + let ctx = resolve_gateway(&cli.gateway, &cli.gateway_endpoint)?; + let endpoint = &ctx.endpoint; + let mut tls = tls.with_gateway_name(&ctx.name); + apply_auth(&mut tls, &ctx.name); + Box::pin(run::sandbox_create( + endpoint, + name.as_deref(), + from.as_deref(), + &ctx.name, + upload_spec.as_ref(), + keep, + gpu, + gpu_device.as_deref(), + editor, + &providers, + policy.as_deref(), + forward, + &command, + tty_override, + auto_providers_override, + &labels_map, + &tls, + )) + .await?; } SandboxCommands::Upload { name, @@ -2858,7 +2459,7 @@ async fn main() -> Result<()> { let meta = load_gateway_metadata(&g).map_err(|_| { miette::miette!( "Unknown gateway '{g}'.\n\ - Deploy it first: openshell gateway start --name {g}\n\ + Register it first: openshell gateway add --name {g}\n\ Or list available gateways: openshell gateway select" ) })?; @@ -3002,7 +2603,6 @@ mod tests { gateway_endpoint: endpoint.to_string(), is_remote: true, auth_mode: Some("cloudflare_jwt".to_string()), - client_lifecycle_managed: Some(false), ..Default::default() } } @@ -3093,16 +2693,6 @@ mod tests { fs::create_dir(temp.path().join("ctx")).expect("failed to create context directory"); let cases: Vec<(Vec<&str>, usize, &str)> = vec![ - ( - vec!["openshell", "gateway", "start", "--ssh-key", "id"], - 4, - "id_rsa", - ), - ( - vec!["openshell", "sandbox", "create", "--ssh-key", "id"], - 4, - "id_rsa", - ), ( vec!["openshell", "sandbox", "upload", "demo", "Do"], 4, diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 6e6c695e1..a5bb2a153 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ 
-18,10 +18,10 @@ use hyper_util::{client::legacy::Client, rt::TokioExecutor}; use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use miette::{IntoDiagnostic, Result, WrapErr, miette}; use openshell_bootstrap::{ - DeployOptions, GatewayMetadata, RemoteOptions, clear_active_gateway, - clear_last_sandbox_if_matches, container_name, extract_host_from_ssh_destination, - get_gateway_metadata, list_gateways, load_active_gateway, remove_gateway_metadata, - resolve_ssh_hostname, save_active_gateway, save_last_sandbox, store_gateway_metadata, + GatewayMetadata, clear_active_gateway, clear_last_sandbox_if_matches, + extract_host_from_ssh_destination, get_gateway_metadata, list_gateways, load_active_gateway, + remove_gateway_metadata, resolve_ssh_hostname, save_active_gateway, save_last_sandbox, + store_gateway_metadata, }; use openshell_core::proto::ProviderProfileCategory; use openshell_core::proto::{ @@ -42,7 +42,7 @@ use openshell_providers::{ ProviderRegistry, detect_provider_from_command, normalize_provider_type, }; use owo_colors::OwoColorize; -use std::collections::{HashMap, HashSet, VecDeque}; +use std::collections::{HashMap, HashSet}; use std::io::{IsTerminal, Read, Write}; use std::path::{Path, PathBuf}; use std::process::Command; @@ -428,274 +428,6 @@ fn print_sandbox_header(sandbox: &Sandbox, display: Option<&ProvisioningDisplay> } } -const CLUSTER_DEPLOY_LOG_LINES: usize = 15; - -/// Return the current terminal width, falling back to 80 columns. -fn term_width() -> usize { - crossterm::terminal::size().map_or(80, |(w, _)| w as usize) -} - -/// Build a horizontal rule of `─` characters with an optional centered label. 
-fn horizontal_rule(label: Option<&str>, width: usize) -> String { - match label { - Some(text) => { - let text_with_pad = format!(" {text} "); - let text_len = text_with_pad.len(); - if width <= text_len { - return text_with_pad; - } - let remaining = width - text_len; - let left = remaining / 2; - let right = remaining - left; - format!("{}{}{}", "─".repeat(left), text_with_pad, "─".repeat(right)) - } - None => "─".repeat(width), - } -} - -/// Truncate a string to fit within the given column width. -/// -/// If the string is longer than `max_width`, it is cut and an ellipsis (`…`) -/// is appended so the total visible width equals `max_width`. -fn truncate_to_width(s: &str, max_width: usize) -> String { - if max_width == 0 { - return String::new(); - } - // Fast path: ASCII-only check via byte length (covers the vast majority of log lines). - if s.len() <= max_width { - return s.to_string(); - } - // The string is longer than the budget. We need to truncate. - // Walk by chars to handle multi-byte UTF-8 correctly. - let mut end = 0; - for (count, (idx, ch)) in s.char_indices().enumerate() { - if count + 1 > max_width.saturating_sub(1) { - break; - } - end = idx + ch.len_utf8(); - } - format!("{}…", &s[..end]) -} - -struct GatewayDeployLogPanel { - mp: MultiProgress, - status: String, - progress: Option, - current_step: Option, - spinner: ProgressBar, - /// Blank line below the spinner so progress doesn't sit flush against the - /// bottom of the terminal. 
- spacer: ProgressBar, - completed_steps: Vec, - top_border: Option, - log_lines: Vec, - bottom_border: Option, - buffer: VecDeque, -} - -impl GatewayDeployLogPanel { - fn new(_name: &str, _location: &str) -> Self { - let mp = MultiProgress::new(); - - let spinner = mp.add(ProgressBar::new_spinner()); - spinner.set_style( - ProgressStyle::with_template("{spinner:.cyan} {msg}") - .unwrap_or_else(|_| ProgressStyle::default_spinner()), - ); - spinner.enable_steady_tick(Duration::from_millis(120)); - - // Keep a blank line below the spinner so it doesn't sit flush - // against the bottom of the terminal. - let spacer = mp.add(ProgressBar::new(0)); - spacer.set_style( - ProgressStyle::with_template("{msg}").unwrap_or_else(|_| ProgressStyle::default_bar()), - ); - spacer.set_message(""); - - let panel = Self { - mp, - status: "Starting".to_string(), - progress: None, - current_step: None, - spinner, - spacer, - completed_steps: Vec::new(), - top_border: None, - log_lines: Vec::with_capacity(CLUSTER_DEPLOY_LOG_LINES), - bottom_border: None, - buffer: VecDeque::with_capacity(CLUSTER_DEPLOY_LOG_LINES), - }; - panel.update_spinner_message(); - panel - } - - fn push_log(&mut self, line: String) { - let line = line.trim().to_string(); - if line.is_empty() { - return; - } - - if let Some(status) = line.strip_prefix("[status] ") { - self.handle_status(status.to_string()); - return; - } - - if let Some(detail) = line.strip_prefix("[progress] ") { - self.handle_progress(detail.to_string()); - return; - } - - self.ensure_log_panel(); - - if self.buffer.len() == CLUSTER_DEPLOY_LOG_LINES { - self.buffer.pop_front(); - } - self.buffer.push_back(line); - self.render(); - } - - fn handle_status(&mut self, status: String) { - if is_progress_status(&status) { - self.handle_progress(status); - return; - } - - if let Some(previous_step) = self.current_step.replace(status.clone()) { - self.push_completed_step(&previous_step, true); - } - - self.status = status; - self.progress = None; - 
self.update_spinner_message(); - } - - fn handle_progress(&mut self, detail: String) { - self.progress = Some(detail); - self.update_spinner_message(); - } - - fn ensure_log_panel(&mut self) { - if self.top_border.is_some() { - return; - } - - let line_style = - ProgressStyle::with_template("{msg}").unwrap_or_else(|_| ProgressStyle::default_bar()); - - let width = term_width(); - - let top_border = self.mp.add(ProgressBar::new(0)); - top_border.set_style(line_style.clone()); - top_border.set_message( - horizontal_rule(Some("Gateway Logs"), width) - .cyan() - .to_string(), - ); - - for _ in 0..CLUSTER_DEPLOY_LOG_LINES { - let line = self.mp.add(ProgressBar::new(0)); - line.set_style(line_style.clone()); - line.set_message(String::new()); - self.log_lines.push(line); - } - - let bottom_border = self.mp.add(ProgressBar::new(0)); - bottom_border.set_style(line_style); - bottom_border.set_message(horizontal_rule(None, width).cyan().to_string()); - - self.top_border = Some(top_border); - self.bottom_border = Some(bottom_border); - } - - fn push_completed_step(&mut self, step: &str, success: bool) { - if step.is_empty() { - return; - } - - let symbol = if success { - "✓".green().bold().to_string() - } else { - "x".red().bold().to_string() - }; - - let line_style = - ProgressStyle::with_template("{msg}").unwrap_or_else(|_| ProgressStyle::default_bar()); - let bar = self.mp.insert_before(&self.spinner, ProgressBar::new(0)); - bar.set_style(line_style); - bar.set_message(format!("{symbol} {step}")); - self.completed_steps.push(bar); - } - - fn update_spinner_message(&self) { - let msg = self.progress.as_ref().map_or_else( - || self.status.clone(), - |detail| format!("{} ({})", self.status, detail.dimmed()), - ); - self.spinner.set_message(msg); - } - - fn finish_success(&mut self) { - if let Some(step) = self.current_step.take() { - self.push_completed_step(&step, true); - } - // Keep completed step checkmarks visible, clear the log panel. 
- for bar in &self.completed_steps { - bar.finish(); - } - self.clear_log_panel(); - self.spinner.finish_and_clear(); - self.spacer.finish_and_clear(); - } - - fn finish_failure(&mut self) { - if let Some(step) = self.current_step.take() { - self.push_completed_step(&step, false); - } - // On failure, preserve everything (including logs) for debugging. - for bar in &self.completed_steps { - bar.finish(); - } - if let Some(top_border) = &self.top_border { - top_border.finish(); - } - for bar in &self.log_lines { - bar.finish(); - } - if let Some(bottom_border) = &self.bottom_border { - bottom_border.finish(); - } - self.spinner.finish_and_clear(); - self.spacer.finish_and_clear(); - } - - /// Clear the container log panel from the terminal output. - fn clear_log_panel(&self) { - if let Some(top_border) = &self.top_border { - top_border.finish_and_clear(); - } - for bar in &self.log_lines { - bar.finish_and_clear(); - } - if let Some(bottom_border) = &self.bottom_border { - bottom_border.finish_and_clear(); - } - } - - fn render(&self) { - let width = term_width(); - for (idx, bar) in self.log_lines.iter().enumerate() { - let line = self.buffer.get(idx).map(String::as_str).unwrap_or_default(); - bar.set_message(truncate_to_width(line, width)); - } - } -} - -fn is_progress_status(status: &str) -> bool { - status.starts_with("Exported ") - || status.starts_with("Downloading:") - || status.starts_with("Extracting:") -} - /// Show gateway status. 
#[allow(clippy::branches_sharing_code)] pub async fn gateway_status(gateway_name: &str, server: &str, tls: &TlsOptions) -> Result<()> { @@ -759,7 +491,7 @@ pub fn gateway_use(name: &str) -> Result<()> { get_gateway_metadata(name).ok_or_else(|| { miette::miette!( "No gateway metadata found for '{name}'.\n\ - Deploy a gateway first with: openshell gateway start --name {name}\n\ + Register it first with: openshell gateway add --name {name}\n\ Or list available gateways: openshell gateway select" ) })?; @@ -936,7 +668,6 @@ fn plaintext_gateway_metadata( remote_host, resolved_host, auth_mode: Some("plaintext".to_string()), - client_lifecycle_managed: Some(false), ..Default::default() } } @@ -985,15 +716,15 @@ where /// Register an existing gateway. /// /// An `http://...` endpoint is registered as a direct plaintext gateway with -/// no mTLS extraction or browser authentication. +/// no mTLS certificate lookup or browser authentication. /// /// Without extra flags, an `https://...` endpoint is treated as an /// edge-authenticated (cloud) gateway and a browser is opened for /// authentication. /// /// Pass `remote` (SSH destination) to register a remote mTLS gateway, or -/// `local = true` for a local mTLS gateway. In both cases the CLI extracts -/// mTLS certificates from the running container automatically. +/// `local = true` for a local mTLS gateway. In both cases mTLS certificates +/// must already exist in the gateway config directory. /// /// An `ssh://` endpoint (e.g., `ssh://user@host:8080`) is shorthand for /// `--remote user@host` with the gateway endpoint derived from the URL. @@ -1002,7 +733,6 @@ pub async fn gateway_add( endpoint: &str, name: Option<&str>, remote: Option<&str>, - ssh_key: Option<&str>, local: bool, oidc_issuer: Option<&str>, oidc_client_id: &str, @@ -1042,7 +772,7 @@ pub async fn gateway_add( }; // Resolve the SSH host alias (e.g. 
~/.ssh/config HostName) so the // endpoint uses the actual hostname/IP that matches the TLS certificate - // SANs — consistent with the `gateway start` path. + // SANs. let resolved = resolve_ssh_hostname(host); let https_endpoint = format!("https://{resolved}:{port}"); @@ -1058,16 +788,9 @@ pub async fn gateway_add( }; let remote = remote.as_deref(); - // Validate --ssh-key requires a remote gateway context. - if ssh_key.is_some() && remote.is_none() { - return Err(miette::miette!( - "--ssh-key requires --remote or an ssh:// endpoint" - )); - } - // Derive a gateway name from the hostname when none is provided. // Loopback endpoints use the canonical "openshell" name, matching the - // convention in init-pki.sh, default_tls_dir, and bootstrap. + // convention in init-pki.sh and default_tls_dir. let derived_name; let name = if let Some(n) = name { n @@ -1087,7 +810,7 @@ pub async fn gateway_add( if get_gateway_metadata(name).is_some() { return Err(miette::miette!( "Gateway '{}' already exists.\n\ - Remove it first with: openshell gateway destroy --name {}\n\ + Remove it first with: openshell gateway remove {}\n\ Or choose a different name with: --name ", name, name, @@ -1097,15 +820,6 @@ pub async fn gateway_add( // OIDC takes precedence over plaintext/mTLS/edge detection — the user // explicitly opted in with --oidc-issuer regardless of scheme. if let Some(issuer) = oidc_issuer { - // When --local is combined with --oidc-issuer, extract mTLS certs - // from the running container so the CLI can establish a TLS - // connection while using OIDC for application-level auth. 
- if local { - let endpoint_port = url::Url::parse(&endpoint).ok().and_then(|u| u.port()); - eprintln!("• Extracting TLS certificates from gateway container..."); - openshell_bootstrap::extract_and_store_pki(name, None, endpoint_port).await?; - } - let metadata = GatewayMetadata { name: name.to_string(), gateway_endpoint: endpoint.clone(), @@ -1128,9 +842,6 @@ pub async fn gateway_add( ); eprintln!(" {} {}", "Endpoint:".dimmed(), endpoint); eprintln!(" {} oidc", "Auth:".dimmed()); - if local { - eprintln!("{} TLS certificates extracted", "✓".green().bold()); - } eprintln!(); // Check for client_credentials env var (CI mode). @@ -1235,31 +946,15 @@ pub async fn gateway_add( if remote.is_some() || local { // mTLS gateway (remote or local). - let remote_opts = remote.map(|dest| { - let mut opts = RemoteOptions::new(dest); - if let Some(key) = ssh_key { - opts = opts.with_ssh_key(key); - } - opts - }); - - // Extract certs BEFORE storing metadata — if this fails the gateway - // is not registered. Pass the endpoint port so the container can be - // identified by its host port binding when multiple gateways run on - // the same Docker host. - // - // Skip extraction when client certs are already on disk (e.g., - // RPM/systemd deployments where init-pki.sh pre-provisions them - // before the gateway starts). 
let certs_on_disk = mtls_certs_exist_for_endpoint(name, &endpoint); - - if certs_on_disk { - eprintln!("• TLS certificates already present, skipping extraction"); - } else { - let endpoint_port = url::Url::parse(&endpoint).ok().and_then(|u| u.port()); - eprintln!("• Extracting TLS certificates from gateway container..."); - openshell_bootstrap::extract_and_store_pki(name, remote_opts.as_ref(), endpoint_port) - .await?; + if !certs_on_disk { + return Err(miette::miette!( + "mTLS certificates for gateway '{name}' were not found.\n\ + Expected them under the default gateway config directory.\n\ + Start the gateway service first so it provisions client TLS material, \ + then retry: openshell gateway add {endpoint}{}", + if local { " --local" } else { "" }, + )); } @@ -1276,7 +971,6 @@ pub async fn gateway_add( let (remote_host, resolved_host) = remote.map_or((None, None), |dest| { @@ -1276,7 +971,6 @@ pub async fn gateway_add( remote_host, resolved_host, auth_mode: Some("mtls".to_string()), - client_lifecycle_managed: Some(false), ..Default::default() }; @@ -1306,15 +1000,7 @@ pub async fn gateway_add( "Type:".dimmed(), if local { "local" } else { "remote" }, ); - eprintln!( - "{} TLS certificates {}", - "✓".green().bold(), - if certs_on_disk { - "already present" - } else { - "extracted" - } - ); + eprintln!("{} TLS certificates present", "✓".green().bold()); } else { // Cloud (edge-authenticated) gateway. let metadata = GatewayMetadata { - name: name.to_string(), + gateway_endpoint: endpoint.clone(), + is_remote: true, + auth_mode: Some("cloudflare_jwt".to_string()), - client_lifecycle_managed: Some(false), ..Default::default() }; @@ -1455,7 +1140,7 @@ pub fn gateway_logout(name: &str) -> Result<()> { Ok(()) } -/// List all provisioned gateways. +/// List all registered gateways. 
pub fn gateway_list(gateway_flag: &Option) -> Result<()> { let gateways = list_gateways()?; let active = gateway_flag.clone().or_else(load_active_gateway); @@ -1464,8 +1149,8 @@ pub fn gateway_list(gateway_flag: &Option) -> Result<()> { println!("No gateways found."); println!(); println!( - "Deploy a gateway with: {}", - "openshell gateway start".dimmed() + "Register a gateway with: {}", + "openshell gateway add ".dimmed() ); return Ok(()); } @@ -1562,355 +1247,12 @@ async fn http_health_check(server: &str, tls: &TlsOptions) -> Result Result { - let interactive = std::io::stderr().is_terminal(); - - if interactive { - let panel = std::sync::Arc::new(std::sync::Mutex::new(GatewayDeployLogPanel::new( - name, location, - ))); - let panel_clone = std::sync::Arc::clone(&panel); - let result = openshell_bootstrap::deploy_gateway_with_logs(options, move |line| { - if let Ok(mut p) = panel_clone.lock() { - p.push_log(line); - } - }) - .await; - - let mut panel = std::sync::Arc::try_unwrap(panel) - .ok() - .expect("panel arc should have single owner after deploy") - .into_inner() - .expect("panel mutex should not be poisoned"); - match result { - Ok(handle) => { - panel.finish_success(); - Ok(handle) - } - Err(err) => { - panel.finish_failure(); - eprintln!( - "{} {} {name}", - "x".red().bold(), - "Gateway failed:".red().bold(), - ); - // Fetch container logs for pattern-based diagnosis - let container_logs = openshell_bootstrap::fetch_gateway_logs(name, 80).await; - let logs_opt = if container_logs.is_empty() { - None - } else { - Some(container_logs.as_str()) - }; - // Try to diagnose the failure and provide guidance - let err_str = format!("{err:?}"); - let diagnosis = - openshell_bootstrap::errors::diagnose_failure(name, &err_str, logs_opt) - .unwrap_or_else(|| { - openshell_bootstrap::errors::generic_failure_diagnosis(name) - }); - print_failure_diagnosis(&diagnosis); - Err(err) - } - } - } else { - eprintln!("Deploying {location} gateway {name}..."); - let result = 
openshell_bootstrap::deploy_gateway_with_logs(options, |line| { - if let Some(status) = line.strip_prefix("[status] ") { - eprintln!(" {status}"); - } else if line.strip_prefix("[progress] ").is_some() { - // Sub-step progress: skip in non-interactive mode - } else { - eprintln!(" {line}"); - } - }) - .await; - match result { - Ok(handle) => { - eprintln!("Gateway {name} ready."); - Ok(handle) - } - Err(err) => { - eprintln!( - "{} {} {name}", - "x".red().bold(), - "Gateway failed:".red().bold(), - ); - // Fetch container logs for pattern-based diagnosis - let container_logs = openshell_bootstrap::fetch_gateway_logs(name, 80).await; - let logs_opt = if container_logs.is_empty() { - None - } else { - Some(container_logs.as_str()) - }; - let err_str = format!("{err:?}"); - let diagnosis = - openshell_bootstrap::errors::diagnose_failure(name, &err_str, logs_opt) - .unwrap_or_else(|| { - openshell_bootstrap::errors::generic_failure_diagnosis(name) - }); - print_failure_diagnosis(&diagnosis); - Err(err) - } - } - } -} - -/// Print post-deploy summary showing the gateway name and endpoint. -pub(crate) fn print_deploy_summary(name: &str, handle: &openshell_bootstrap::GatewayHandle) { - eprintln!(); - eprintln!("{} Gateway ready", "✓".green().bold()); - eprintln!(); - eprintln!(" {} {name}", "Name:".bold()); - eprintln!(" {} {}", "Endpoint:".bold(), handle.gateway_endpoint()); - eprintln!(); -} - -/// Print a user-friendly failure diagnosis with recovery steps. -fn print_failure_diagnosis(diagnosis: &openshell_bootstrap::errors::GatewayFailureDiagnosis) { - eprintln!(); - eprintln!("{}", diagnosis.summary.yellow().bold()); - eprintln!(); - eprintln!(" {}", diagnosis.explanation); - eprintln!(); - - if !diagnosis.recovery_steps.is_empty() { - eprintln!(" {}:", "To fix".bold()); - for (i, step) in diagnosis.recovery_steps.iter().enumerate() { - eprintln!(); - eprintln!(" {}. 
{}", i + 1, step.description); - if let Some(cmd) = &step.command { - eprintln!(); - eprintln!(" {}", cmd.cyan()); - } - } - eprintln!(); - } -} - -/// Provision or start a gateway (local or remote). -#[allow(clippy::too_many_arguments)] // user-facing CLI command -pub async fn gateway_admin_deploy( - name: &str, - remote: Option<&str>, - ssh_key: Option<&str>, - port: u16, - gateway_host: Option<&str>, - recreate: bool, - disable_tls: bool, - disable_gateway_auth: bool, - registry_username: Option<&str>, - registry_token: Option<&str>, - gpu: Vec, - oidc_issuer: Option<&str>, - oidc_audience: &str, - oidc_client_id: &str, - oidc_roles_claim: Option<&str>, - oidc_admin_role: Option<&str>, - oidc_user_role: Option<&str>, - oidc_scopes: Option<&str>, - oidc_scopes_claim: Option<&str>, -) -> Result<()> { - let location = if remote.is_some() { "remote" } else { "local" }; - - // Build remote options once so we can reuse them for the existence check - // and the deploy options. - let remote_opts = remote.map(|dest| { - let mut opts = RemoteOptions::new(dest); - if let Some(key) = ssh_key { - opts = opts.with_ssh_key(key); - } - opts - }); - - // If the gateway is already running and we're not recreating, short-circuit. - if !recreate - && let Some(existing) = - openshell_bootstrap::check_existing_deployment(name, remote_opts.as_ref()).await? - && existing.container_running - { - eprintln!( - "{} Gateway '{name}' is already running.", - "✓".green().bold() - ); - return Ok(()); - } - - // When resuming an existing gateway (not recreating), prefer the port - // and gateway host from stored metadata over the CLI defaults. The user - // may have originally bootstrapped on a non-default port (e.g. `--port - // 8082`) or with `--gateway-host host.docker.internal`, and a bare - // `gateway start` without those flags should honour the original values. 
- let stored_metadata = if recreate { - None - } else { - openshell_bootstrap::load_gateway_metadata(name).ok() - }; - let effective_port = stored_metadata - .as_ref() - .filter(|m| m.gateway_port > 0) - .map_or(port, |m| m.gateway_port); - let effective_gateway_host: Option = gateway_host.map(String::from).or_else(|| { - stored_metadata - .as_ref() - .and_then(|m| m.gateway_host().map(String::from)) - }); - - let mut options = DeployOptions::new(name) - .with_port(effective_port) - .with_disable_tls(disable_tls) - .with_disable_gateway_auth(disable_gateway_auth) - .with_gpu(gpu) - .with_recreate(recreate); - if let Some(opts) = remote_opts { - options = options.with_remote(opts); - } - if let Some(host) = effective_gateway_host { - options = options.with_gateway_host(host); - } - if let Some(username) = registry_username { - options = options.with_registry_username(username); - } - if let Some(token) = registry_token { - options = options.with_registry_token(token); - } - if let Some(issuer) = oidc_issuer { - options = options.with_oidc_issuer(issuer); - options = options.with_oidc_audience(oidc_audience); - options.oidc_client_id = oidc_client_id.to_string(); - if let Some(claim) = oidc_roles_claim { - options.oidc_roles_claim = Some(claim.to_string()); - } - if let Some(role) = oidc_admin_role { - options.oidc_admin_role = Some(role.to_string()); - } - if let Some(role) = oidc_user_role { - options.oidc_user_role = Some(role.to_string()); - } - if let Some(claim) = oidc_scopes_claim { - options.oidc_scopes_claim = Some(claim.to_string()); - } - } - - let handle = Box::pin(deploy_gateway_with_panel(options, name, location)).await?; - - // Persist oidc_scopes in gateway metadata so `gateway login` can - // request the correct scopes later. 
- if let Some(scopes) = oidc_scopes - && let Ok(mut meta) = openshell_bootstrap::load_gateway_metadata(name) - { - meta.oidc_scopes = Some(scopes.to_string()); - let _ = store_gateway_metadata(name, &meta); - } - - // Wait for the gRPC endpoint to actually accept connections before - // declaring the gateway ready. The Docker health check may pass before - // the gRPC listener inside the pod is fully bound. - let server = handle.gateway_endpoint().to_string(); - let tls = TlsOptions::default() - .with_gateway_name(name) - .with_default_paths(&server); - crate::bootstrap::wait_for_grpc_ready(&server, &tls).await?; - - print_deploy_summary(name, &handle); - - // Auto-activate: set this gateway as the active gateway. - save_active_gateway(name)?; - eprintln!("{} Active gateway set to '{name}'", "✓".green().bold()); - - Ok(()) -} - -/// Resolve the remote SSH destination for a gateway. -/// -/// If `remote_override` is provided, use it. Otherwise, look up the remote -/// host from stored gateway metadata. -enum GatewayControlTarget { - Local, - Remote(String), - ExternalRegistration, -} - -fn resolve_gateway_control_target( - name: &str, - remote_override: Option<&str>, -) -> GatewayControlTarget { - resolve_gateway_control_target_from(get_gateway_metadata(name), remote_override) -} - -fn resolve_gateway_control_target_from( - metadata: Option, - remote_override: Option<&str>, -) -> GatewayControlTarget { - if let Some(r) = remote_override { - return GatewayControlTarget::Remote(r.to_string()); - } - - match metadata { - // Not client-managed (`gateway add`) — the gateway lifecycle is - // managed externally (e.g. systemd, Podman, bare-metal); only - // remove the local registration metadata on destroy/stop. - Some(ref m) if m.client_lifecycle_managed == Some(false) => { - GatewayControlTarget::ExternalRegistration - } - // Remote gateway with SSH destination — managed remote container. 
- Some(ref m) if m.is_remote => m.remote_host.clone().map_or( - GatewayControlTarget::ExternalRegistration, - GatewayControlTarget::Remote, - ), - // Client-managed (`gateway start`) or legacy metadata (no - // `client_lifecycle_managed` field) — treat as a - // locally-managed container. - _ => GatewayControlTarget::Local, - } -} - -fn gateway_control_target_options( - name: &str, - remote_override: Option<&str>, - ssh_key: Option<&str>, -) -> Result> { - match resolve_gateway_control_target(name, remote_override) { - GatewayControlTarget::Local => Ok(None), - GatewayControlTarget::Remote(dest) => { - let mut opts = RemoteOptions::new(&dest); - if let Some(key) = ssh_key { - opts = opts.with_ssh_key(key); - } - Ok(Some(opts)) - } - GatewayControlTarget::ExternalRegistration => Err(miette::miette!( - "Gateway '{name}' is an external registration, not a managed Docker gateway.\n\ - `openshell gateway stop` is only supported for local or SSH-managed gateways." - )), - } -} - fn remove_gateway_registration(name: &str) { if let Err(err) = openshell_bootstrap::edge_token::remove_edge_token(name) { tracing::debug!("failed to remove edge token: {err}"); } - if let Err(err) = remove_gateway_metadata(name) { - tracing::debug!("failed to remove gateway metadata: {err}"); - } - if load_active_gateway().as_deref() == Some(name) - && let Err(err) = clear_active_gateway() - { - tracing::debug!("failed to clear active gateway: {err}"); - } -} - -fn cleanup_gateway_metadata(name: &str) { - if let Err(err) = openshell_bootstrap::edge_token::remove_edge_token(name) { - tracing::debug!("failed to remove edge token: {err}"); + if let Err(err) = openshell_bootstrap::oidc_token::remove_oidc_token(name) { + tracing::debug!("failed to remove oidc token: {err}"); } if let Err(err) = remove_gateway_metadata(name) { tracing::debug!("failed to remove gateway metadata: {err}"); @@ -1922,58 +1264,29 @@ fn cleanup_gateway_metadata(name: &str) { } } -/// Stop a gateway. 
-pub async fn gateway_admin_stop( - name: &str, - remote: Option<&str>, - ssh_key: Option<&str>, -) -> Result<()> { - let remote_opts = gateway_control_target_options(name, remote, ssh_key)?; +/// Remove a local gateway registration without touching the gateway service. +pub fn gateway_remove(name: &str) -> Result<()> { + if get_gateway_metadata(name).is_none() { + return Err(miette::miette!( + "No gateway metadata found for '{name}'.\n\ + List available gateways: openshell gateway select" + )); + } - eprintln!("• Stopping gateway {name}..."); - let handle = openshell_bootstrap::gateway_handle(name, remote_opts.as_ref()).await?; - handle.stop().await?; - eprintln!("{} Gateway {name} stopped.", "✓".green().bold()); + remove_gateway_registration(name); + eprintln!( + "{} Gateway registration '{name}' removed.", + "✓".green().bold() + ); Ok(()) } -/// Destroy a gateway and its state. -pub async fn gateway_admin_destroy( - name: &str, - remote: Option<&str>, - ssh_key: Option<&str>, -) -> Result<()> { - match resolve_gateway_control_target(name, remote) { - GatewayControlTarget::ExternalRegistration => { - eprintln!("• Removing gateway registration {name}..."); - remove_gateway_registration(name); - eprintln!( - "{} Gateway registration {name} removed.", - "✓".green().bold() - ); - Ok(()) - } - GatewayControlTarget::Local | GatewayControlTarget::Remote(_) => { - let remote_opts = gateway_control_target_options(name, remote, ssh_key)?; - - eprintln!("• Destroying gateway {name}..."); - let handle = openshell_bootstrap::gateway_handle(name, remote_opts.as_ref()).await?; - handle.destroy().await?; - - cleanup_gateway_metadata(name); - - eprintln!("{} Gateway {name} destroyed.", "✓".green().bold()); - Ok(()) - } - } -} - -/// Show gateway deployment details. +/// Show gateway registration details. 
pub fn gateway_admin_info(name: &str) -> Result<()> { let metadata = get_gateway_metadata(name).ok_or_else(|| { miette::miette!( "No gateway metadata found for '{name}'.\n\ - Deploy a gateway first with: openshell gateway start --name {name}" + Register it first: openshell gateway add --name {name}" ) })?; @@ -2000,161 +1313,11 @@ pub fn gateway_admin_info(name: &str) -> Result<()> { Ok(()) } -/// Fetch logs from the gateway Docker container. -/// -/// Connects to the Docker daemon (local or remote via SSH) and retrieves -/// logs from the `openshell-cluster-{name}` container. -#[allow(clippy::future_not_send)] // Holds stdout lock; CLI command, never sent across threads. -pub async fn doctor_logs( - name: &str, - lines: Option, - tail: bool, - remote: Option<&str>, - ssh_key: Option<&str>, -) -> Result<()> { - // Build remote options: explicit --remote flag, or auto-resolve from metadata - let remote_opts = remote.map_or_else( - || { - if let Some(metadata) = get_gateway_metadata(name) - && metadata.is_remote - && let Some(ref host) = metadata.remote_host - { - let mut opts = RemoteOptions::new(host.clone()); - if let Some(key) = ssh_key { - opts = opts.with_ssh_key(key); - } - Some(opts) - } else { - None - } - }, - |dest| { - let mut opts = RemoteOptions::new(dest); - if let Some(key) = ssh_key { - opts = opts.with_ssh_key(key); - } - Some(opts) - }, - ); - - let stdout = std::io::stdout().lock(); - openshell_bootstrap::gateway_container_logs(remote_opts.as_ref(), name, lines, tail, stdout) - .await -} - -/// Run a command inside the gateway Docker container. -/// -/// Spawns `docker exec` (or `ssh docker exec` for remote gateways) -/// as a child process with the user's terminal attached, so interactive -/// tools like `k9s` and `kubectl` work natively. 
-pub fn doctor_exec( - name: &str, - remote: Option<&str>, - ssh_key: Option<&str>, - command: &[String], -) -> Result<()> { - validate_gateway_name(name)?; - let container = container_name(name); - let is_tty = std::io::stdin().is_terminal(); - - // Wrap the user command with KUBECONFIG set - let inner_cmd = if command.is_empty() { - "KUBECONFIG=/etc/rancher/k3s/k3s.yaml sh".to_string() - } else { - let escaped: Vec = command.iter().map(|a| shell_escape(a)).collect(); - format!("KUBECONFIG=/etc/rancher/k3s/k3s.yaml {}", escaped.join(" ")) - }; - - // Resolve remote destination: explicit --remote flag, or auto-resolve from metadata - let remote_host = remote.map_or_else( - || { - if let Some(metadata) = get_gateway_metadata(name) - && metadata.is_remote - { - metadata.remote_host - } else { - None - } - }, - |dest| Some(dest.to_string()), - ); - - let mut cmd = if let Some(ref host) = remote_host { - validate_ssh_host(host)?; - - // Remote: ssh docker exec [-it] sh -lc '' - // - // SSH concatenates all arguments after the hostname into a single - // string for the remote shell, so inner_cmd must be escaped twice: - // once for `sh -lc` (already done above) and once for the SSH - // remote shell (done here). 
- let ssh_escaped_cmd = shell_escape(&inner_cmd); - let mut c = Command::new("ssh"); - if let Some(key) = ssh_key { - c.args(["-i", key]); - } - // -t forces TTY allocation over SSH when we have a local TTY - if is_tty { - c.arg("-t"); - } - c.arg(host); - c.arg("docker"); - c.arg("exec"); - if is_tty { - c.args(["-it"]); - } else { - c.arg("-i"); - } - c.args([&container, "sh", "-lc", &ssh_escaped_cmd]); - c - } else { - // Local: docker exec [-it] sh -lc '' - let mut c = Command::new("docker"); - c.arg("exec"); - if is_tty { - c.args(["-it"]); - } else { - c.arg("-i"); - } - c.args([&container, "sh", "-lc", &inner_cmd]); - c - }; - - let status = cmd - .status() - .into_diagnostic() - .wrap_err("failed to execute docker exec")?; - - if !status.success() { - let code = status.code().unwrap_or(1); - std::process::exit(code); - } - - Ok(()) -} - -/// Print the LLM diagnostic prompt to stdout. -/// -/// Outputs a system prompt that a coding agent can use to autonomously -/// diagnose gateway issues using `openshell doctor logs` and -/// `openshell doctor exec`. -pub fn doctor_llm() -> Result<()> { - use std::io::Write; - let stdout = std::io::stdout(); - let mut handle = stdout.lock(); - handle - .write_all(include_bytes!("doctor_llm_prompt.md")) - .into_diagnostic() - .wrap_err("failed to write LLM prompt to stdout")?; - Ok(()) -} - /// Validate system prerequisites for running a gateway. /// /// Checks Docker connectivity and reports the result. Returns exit code 0 /// if all checks pass, 1 otherwise. -#[allow(clippy::future_not_send)] // Holds stdout lock; CLI command, never sent across threads. -pub async fn doctor_check() -> Result<()> { +pub fn doctor_check() -> Result<()> { use std::io::Write; let mut stdout = std::io::stdout().lock(); @@ -2164,142 +1327,32 @@ pub async fn doctor_check() -> Result<()> { write!(stdout, " Docker ............. 
").into_diagnostic()?; stdout.flush().into_diagnostic()?; - match openshell_bootstrap::check_docker_available().await { - Ok(preflight) => { - let version_str = preflight.version.as_deref().unwrap_or("unknown"); - writeln!(stdout, "ok (version {version_str})").into_diagnostic()?; + let output = Command::new("docker") + .args(["info", "--format", "{{.ServerVersion}}"]) + .output() + .into_diagnostic() + .wrap_err("failed to execute docker info")?; - // --- DOCKER_HOST --- - write!(stdout, " DOCKER_HOST ........ ").into_diagnostic()?; - match std::env::var("DOCKER_HOST") { - Ok(val) => writeln!(stdout, "{val}").into_diagnostic()?, - Err(_) => writeln!(stdout, "(not set, using default socket)").into_diagnostic()?, - } + if output.status.success() { + let version = String::from_utf8_lossy(&output.stdout); + let version_str = version.trim(); + writeln!(stdout, "ok (version {version_str})").into_diagnostic()?; - writeln!(stdout, "\nAll checks passed.").into_diagnostic()?; - Ok(()) + // --- DOCKER_HOST --- + write!(stdout, " DOCKER_HOST ........ ").into_diagnostic()?; + match std::env::var("DOCKER_HOST") { + Ok(val) => writeln!(stdout, "{val}").into_diagnostic()?, + Err(_) => writeln!(stdout, "(not set, using default socket)").into_diagnostic()?, } - Err(err) => { - writeln!(stdout, "FAILED").into_diagnostic()?; - writeln!(stdout).into_diagnostic()?; - Err(err) - } - } -} - -/// Shell-escape a single argument for safe inclusion in a `sh -c` string. -fn shell_escape(s: &str) -> String { - if s.is_empty() { - return "''".to_string(); - } - // If the string is clean (alphanumeric, hyphens, underscores, dots, slashes, colons, equals), - // no quoting needed. 
- if s.chars() - .all(|c| c.is_ascii_alphanumeric() || "-_./,:=@".contains(c)) - { - return s.to_string(); - } - // Otherwise, single-quote it (escaping embedded single quotes) - format!("'{}'", s.replace('\'', "'\\''")) -} -/// Validate that a gateway name is safe for use in container/volume/network -/// names and shell commands. Rejects names with characters outside the set -/// `[a-zA-Z0-9._-]`. -fn validate_gateway_name(name: &str) -> Result<()> { - if name.is_empty() { - return Err(miette!("gateway name is empty")); - } - if !name - .bytes() - .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'.' | b'-' | b'_')) - { - return Err(miette!( - "gateway name contains invalid characters (allowed: alphanumeric, '.', '-', '_')" - )); - } - Ok(()) -} - -/// Validate that an SSH host string is a reasonable hostname or IP address. -/// Rejects values with shell metacharacters, spaces, or control characters -/// that could be used for injection via a poisoned metadata.json. -fn validate_ssh_host(host: &str) -> Result<()> { - if host.is_empty() { - return Err(miette!("SSH host is empty")); - } - // Allow: alphanumeric, dots, hyphens, colons (IPv6), square brackets ([::1]), - // and @ (user@host). - if !host - .bytes() - .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'.' | b'-' | b':' | b'[' | b']' | b'@')) - { - return Err(miette!("SSH host contains invalid characters: {host}")); + writeln!(stdout, "\nAll checks passed.").into_diagnostic()?; + return Ok(()); } - Ok(()) -} -/// Create a sandbox when no gateway is configured. -/// -/// Bootstraps a new gateway first, then delegates to [`sandbox_create`]. 
-#[allow(clippy::too_many_arguments)] -pub async fn sandbox_create_with_bootstrap( - name: Option<&str>, - from: Option<&str>, - upload: Option<&(String, Option, bool)>, - keep: bool, - gpu: bool, - gpu_device: Option<&str>, - editor: Option, - remote: Option<&str>, - ssh_key: Option<&str>, - providers: &[String], - policy: Option<&str>, - forward: Option, - command: &[String], - tty_override: Option, - bootstrap_override: Option, - auto_providers_override: Option, -) -> Result<()> { - if !crate::bootstrap::confirm_bootstrap(bootstrap_override)? { - return Err(miette::miette!( - "No active gateway.\n\ - Set one with: openshell gateway select \n\ - Or deploy a new gateway: openshell gateway start" - )); - } - let requested_gpu = gpu || from.is_some_and(source_requests_gpu); - let (tls, server, gateway_name) = Box::pin(crate::bootstrap::run_bootstrap( - remote, - ssh_key, - requested_gpu, - )) - .await?; - // Disable bootstrap inside sandbox_create so that a transient connection - // failure right after deploy does not trigger a second bootstrap attempt. 
- Box::pin(sandbox_create( - &server, - name, - from, - &gateway_name, - upload, - keep, - gpu, - gpu_device, - editor, - remote, - ssh_key, - providers, - policy, - forward, - command, - tty_override, - Some(false), - auto_providers_override, - &HashMap::new(), - &tls, - )) - .await + writeln!(stdout, "FAILED").into_diagnostic()?; + writeln!(stdout).into_diagnostic()?; + let stderr = String::from_utf8_lossy(&output.stderr); + Err(miette::miette!("docker info failed: {}", stderr.trim())) } fn sandbox_should_persist( @@ -2344,14 +1397,11 @@ pub async fn sandbox_create( gpu: bool, gpu_device: Option<&str>, editor: Option, - remote: Option<&str>, - ssh_key: Option<&str>, providers: &[String], policy: Option<&str>, forward: Option, command: &[String], tty_override: Option, - bootstrap_override: Option, auto_providers_override: Option, labels: &HashMap, tls: &TlsOptions, @@ -2368,52 +1418,15 @@ pub async fn sandbox_create( openshell_core::forward::check_port_available(spec)?; } - // Try connecting to the gateway. If the connection fails due to a - // connectivity error and bootstrap is allowed, start a new gateway. - // - // bootstrap_override is Some(false) when: - // - the user passed --no-bootstrap - // - an existing gateway was already resolved (don't replace it) - // - we already bootstrapped once (don't double-bootstrap) - let (mut client, effective_server, effective_tls) = match grpc_client(server, tls).await { - Ok(c) => (c, server.to_string(), tls.clone()), - Err(err) => { - if !crate::bootstrap::should_attempt_bootstrap(&err, tls) { - return Err(err); - } - if !crate::bootstrap::confirm_bootstrap(bootstrap_override)? { - // The gateway is configured but not reachable. Give the user - // actionable recovery steps instead of a raw connection error. 
- eprintln!(); - eprintln!( - "{} Gateway '{}' is not reachable.", - "!".yellow(), - gateway_name, - ); - eprintln!(); - eprintln!(" To destroy and recreate the gateway:"); - eprintln!(); - eprintln!( - " {} && {}", - format!("openshell gateway destroy --name {gateway_name}").cyan(), - "openshell gateway start".cyan(), - ); - eprintln!(); - return Err(err); - } - let requested_gpu = gpu || from.is_some_and(source_requests_gpu); - let (new_tls, new_server, _) = Box::pin(crate::bootstrap::run_bootstrap( - remote, - ssh_key, - requested_gpu, - )) - .await?; - let c = grpc_client(&new_server, &new_tls) - .await - .wrap_err("bootstrap succeeded but failed to connect to gateway")?; - (c, new_server, new_tls) - } - }; + let mut client = grpc_client(server, tls).await.wrap_err_with(|| { + format!( + "failed to connect to gateway '{gateway_name}' at {server}. \ + Start the gateway service with the installed package manager, \ + or register a different endpoint with `openshell gateway add `." + ) + })?; + let effective_server = server.to_string(); + let effective_tls = tls.clone(); // Resolve the --from flag into a container image reference, building from // a Dockerfile first if necessary. @@ -2891,7 +1904,7 @@ pub async fn sandbox_create( enum ResolvedSource { /// A ready-to-use container image reference. Image(String), - /// A Dockerfile that must be built and pushed before creating the sandbox. + /// A Dockerfile that must be built before creating the sandbox. Dockerfile { dockerfile: PathBuf, context: PathBuf, @@ -3006,13 +2019,6 @@ fn value_looks_like_bare_dockerfile_name(value: &str) -> bool { !value.contains('/') && !value.contains(':') && filename_looks_like_dockerfile(Path::new(value)) } -fn source_requests_gpu(source: &str) -> bool { - resolve_from(source).is_ok_and(|resolved| match resolved { - ResolvedSource::Image(image) => image_requests_gpu(&image), - ResolvedSource::Dockerfile { .. 
} => false, - }) -} - fn image_requests_gpu(image: &str) -> bool { let image_name = image .rsplit('/') @@ -3030,12 +2036,11 @@ fn dockerfile_sources_supported_for_gateway(metadata: Option<&GatewayMetadata>) !metadata.is_some_and(|metadata| metadata.is_remote) } -/// Build a Dockerfile and make the resulting image available to the gateway. +/// Build a Dockerfile and return the local Docker tag. /// -/// For local Kubernetes gateways running in Docker, this imports the built image -/// into the gateway runtime and returns the Docker tag. Standalone local -/// gateways use the same Docker daemon that the CLI built into, so the tag is -/// passed through directly and the active compute driver resolves it. +/// Package-managed local gateways use the same Docker daemon that the CLI +/// builds into, so the tag is passed through directly and the active compute +/// driver resolves it. async fn build_from_dockerfile( dockerfile: &Path, context: &Path, @@ -3077,24 +2082,6 @@ async fn build_from_dockerfile( ) .await?; - let existing_gateway = openshell_bootstrap::check_existing_deployment(gateway_name, None) - .await - .wrap_err("failed to inspect local gateway deployment state")?; - let pushed_into_gateway = existing_gateway - .is_some_and(|gateway| gateway.container_exists && gateway.container_running); - if pushed_into_gateway { - openshell_bootstrap::build::push_image_into_gateway(&tag, gateway_name, &mut on_log) - .await?; - eprintln!(); - eprintln!( - "{} Image {} is available in the gateway.", - "✓".green().bold(), - tag.cyan(), - ); - eprintln!(); - return Ok(tag); - } - eprintln!(); eprintln!( "{} Image {} is available in the local Docker daemon for gateway '{}'.", @@ -5983,14 +4970,12 @@ fn format_timestamp_ms(ms: i64) -> String { #[cfg(test)] mod tests { use super::{ - GatewayControlTarget, TlsOptions, dockerfile_sources_supported_for_gateway, - format_gateway_select_header, format_gateway_select_items, gateway_add, gateway_auth_label, - 
gateway_env_override_warning, gateway_select_with, gateway_type_label, git_sync_files, - http_health_check, image_requests_gpu, inferred_provider_type, parse_cli_setting_value, + TlsOptions, dockerfile_sources_supported_for_gateway, format_gateway_select_header, + format_gateway_select_items, gateway_add, gateway_auth_label, gateway_env_override_warning, + gateway_select_with, gateway_type_label, git_sync_files, http_health_check, + image_requests_gpu, inferred_provider_type, parse_cli_setting_value, parse_credential_pairs, plaintext_gateway_is_remote, provisioning_timeout_message, - ready_false_condition_message, resolve_from, resolve_gateway_control_target_from, - sandbox_should_persist, shell_escape, source_requests_gpu, validate_gateway_name, - validate_ssh_host, + ready_false_condition_message, resolve_from, sandbox_should_persist, }; use crate::TEST_ENV_LOCK; use hyper::StatusCode; @@ -6062,7 +5047,6 @@ mod tests { gateway_endpoint: endpoint.to_string(), is_remote: true, auth_mode: Some("cloudflare_jwt".to_string()), - client_lifecycle_managed: Some(false), ..Default::default() } } @@ -6231,12 +5215,6 @@ mod tests { } } - #[test] - fn source_requests_gpu_detects_known_community_gpu_name() { - assert!(source_requests_gpu("nvidia-gpu")); - assert!(!source_requests_gpu("base")); - } - #[test] fn resolve_from_classifies_existing_dockerfile_path() { let temp = tempfile::tempdir().expect("failed to create tempdir"); @@ -6445,67 +5423,6 @@ mod tests { assert_eq!(files, vec!["file.txt", "inner/child.txt"]); } - #[test] - fn resolve_gateway_control_target_marks_edge_registration_unmanaged() { - let metadata = edge_registration("edge-gateway", "https://gw.example.com"); - let target = resolve_gateway_control_target_from(Some(metadata), None); - assert!(matches!(target, GatewayControlTarget::ExternalRegistration)); - } - - #[test] - fn resolve_gateway_control_target_prefers_explicit_remote_override() { - let target = resolve_gateway_control_target_from(None, 
Some("user@host")); - match target { - GatewayControlTarget::Remote(dest) => assert_eq!(dest, "user@host"), - GatewayControlTarget::Local | GatewayControlTarget::ExternalRegistration => { - panic!("expected remote target") - } - } - } - - #[test] - fn resolve_gateway_control_target_non_managed_loopback_is_external() { - // A gateway registered via `gateway add http://localhost:8080` should - // be classified as an external registration, not a local container. - let metadata = GatewayMetadata { - name: "localhost".to_string(), - gateway_endpoint: "http://localhost:8080".to_string(), - auth_mode: Some("plaintext".to_string()), - client_lifecycle_managed: Some(false), - ..Default::default() - }; - let target = resolve_gateway_control_target_from(Some(metadata), None); - assert!(matches!(target, GatewayControlTarget::ExternalRegistration)); - } - - #[test] - fn resolve_gateway_control_target_managed_gateway_is_local() { - // A gateway deployed via `gateway start` should be classified as local. - let metadata = GatewayMetadata { - name: "openshell".to_string(), - gateway_endpoint: "https://127.0.0.1:8080".to_string(), - gateway_port: 8080, - client_lifecycle_managed: Some(true), - ..Default::default() - }; - let target = resolve_gateway_control_target_from(Some(metadata), None); - assert!(matches!(target, GatewayControlTarget::Local)); - } - - #[test] - fn resolve_gateway_control_target_legacy_metadata_defaults_to_local() { - // Legacy metadata without the `client_lifecycle_managed` field - // should preserve the existing behavior: non-remote → Local. 
- let metadata = GatewayMetadata { - name: "openshell".to_string(), - gateway_endpoint: "https://127.0.0.1:8080".to_string(), - gateway_port: 8080, - ..Default::default() - }; - let target = resolve_gateway_control_target_from(Some(metadata), None); - assert!(matches!(target, GatewayControlTarget::Local)); - } - #[test] fn gateway_select_uses_explicit_name_without_prompting() { let tmpdir = tempfile::tempdir().expect("create tmpdir"); @@ -6695,7 +5612,6 @@ mod tests { "http://127.0.0.1:8080", None, None, - None, false, None, "openshell-cli", @@ -6711,7 +5627,6 @@ mod tests { let metadata = load_gateway_metadata("openshell").expect("load stored gateway"); assert_eq!(metadata.auth_mode.as_deref(), Some("plaintext")); assert!(!metadata.is_remote); - assert_eq!(metadata.client_lifecycle_managed, Some(false)); assert_eq!(metadata.gateway_endpoint, "http://127.0.0.1:8080"); assert_eq!(load_active_gateway().as_deref(), Some("openshell")); }); @@ -6728,7 +5643,6 @@ mod tests { "http://gateway.example.com:8080", Some("dev-http"), None, - None, true, None, "openshell-cli", @@ -6742,7 +5656,6 @@ mod tests { let metadata = load_gateway_metadata("dev-http").expect("load stored gateway"); assert_eq!(metadata.auth_mode.as_deref(), Some("plaintext")); assert!(!metadata.is_remote); - assert_eq!(metadata.client_lifecycle_managed, Some(false)); assert_eq!(metadata.gateway_endpoint, "http://gateway.example.com:8080"); assert_eq!(load_active_gateway().as_deref(), Some("dev-http")); }); @@ -6776,58 +5689,4 @@ mod tests { server.join().expect("server thread"); assert_eq!(status, Some(StatusCode::OK)); } - - // ---- SEC-004: validate_gateway_name, validate_ssh_host, shell_escape ---- - - #[test] - fn validate_gateway_name_accepts_valid_names() { - assert!(validate_gateway_name("openshell").is_ok()); - assert!(validate_gateway_name("my-gateway").is_ok()); - assert!(validate_gateway_name("gateway_v2").is_ok()); - assert!(validate_gateway_name("gw.prod").is_ok()); - } - - #[test] - fn 
validate_gateway_name_rejects_invalid_names() { - assert!(validate_gateway_name("").is_err()); - assert!(validate_gateway_name("gw;rm -rf /").is_err()); - assert!(validate_gateway_name("gw name").is_err()); - assert!(validate_gateway_name("gw$(id)").is_err()); - assert!(validate_gateway_name("gw\nmalicious").is_err()); - } - - #[test] - fn validate_ssh_host_accepts_valid_hosts() { - assert!(validate_ssh_host("192.168.1.1").is_ok()); - assert!(validate_ssh_host("example.com").is_ok()); - assert!(validate_ssh_host("user@host.com").is_ok()); - assert!(validate_ssh_host("[::1]").is_ok()); - assert!(validate_ssh_host("2001:db8::1").is_ok()); - } - - #[test] - fn validate_ssh_host_rejects_invalid_hosts() { - assert!(validate_ssh_host("").is_err()); - assert!(validate_ssh_host("host;rm -rf /").is_err()); - assert!(validate_ssh_host("host$(id)").is_err()); - assert!(validate_ssh_host("host name").is_err()); - assert!(validate_ssh_host("host\nmalicious").is_err()); - } - - #[test] - fn shell_escape_double_escape_for_ssh() { - // Simulate the double-escape path for SSH: - // First escape for sh -lc, then escape again for SSH remote shell. 
- let inner_cmd = "KUBECONFIG=/etc/rancher/k3s/k3s.yaml echo 'hello world'"; - let ssh_escaped = shell_escape(inner_cmd); - // The result should be single-quoted (wrapping the entire inner_cmd) - assert!( - ssh_escaped.starts_with('\''), - "should be single-quoted: {ssh_escaped}" - ); - assert!( - ssh_escaped.ends_with('\''), - "should end with single-quote: {ssh_escaped}" - ); - } } diff --git a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs index b65bdd684..7f986ef42 100644 --- a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs +++ b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs @@ -586,15 +586,12 @@ async fn sandbox_create_keeps_command_sessions_by_default() { false, None, None, - None, - None, &[], None, None, &["echo".to_string(), "OK".to_string()], Some(false), Some(false), - Some(false), &HashMap::new(), &tls, ) @@ -628,15 +625,12 @@ async fn sandbox_create_deletes_command_sessions_with_no_keep() { false, None, None, - None, - None, &[], None, None, &["echo".to_string(), "OK".to_string()], Some(false), Some(false), - Some(false), &HashMap::new(), &tls, ) @@ -673,15 +667,12 @@ async fn sandbox_create_deletes_shell_sessions_with_no_keep() { false, None, None, - None, - None, &[], None, None, &[], Some(true), Some(false), - Some(false), &HashMap::new(), &tls, ) @@ -718,15 +709,12 @@ async fn sandbox_create_keeps_sandbox_with_hidden_keep_flag() { false, None, None, - None, - None, &[], None, None, &["echo".to_string(), "OK".to_string()], Some(false), Some(false), - Some(false), &HashMap::new(), &tls, ) @@ -764,15 +752,12 @@ async fn sandbox_create_keeps_sandbox_with_forwarding() { false, None, None, - None, - None, &[], None, Some(openshell_core::forward::ForwardSpec::new(forward_port)), &["echo".to_string(), "OK".to_string()], Some(false), Some(false), - Some(false), &HashMap::new(), &tls, ) diff --git 
a/crates/openshell-sandbox/src/procfs.rs b/crates/openshell-sandbox/src/procfs.rs index 988ee2412..e02e850b5 100644 --- a/crates/openshell-sandbox/src/procfs.rs +++ b/crates/openshell-sandbox/src/procfs.rs @@ -57,8 +57,8 @@ struct DescendantPid { /// ### Unlinked binaries (`(deleted)` suffix) /// /// When a running binary is unlinked from its filesystem path — the common -/// case is a `docker cp` hot-swap of `/opt/openshell/bin/openshell-sandbox` -/// during a `cluster-deploy-fast` dev upgrade — the kernel appends the +/// case is a hot-swap of `/opt/openshell/bin/openshell-sandbox` during a +/// development upgrade — the kernel appends the /// literal string `" (deleted)"` to the `/proc//exe` readlink target. /// The raw tainted path (e.g. `"/opt/openshell/bin/openshell-sandbox (deleted)"`) /// is not a real filesystem path: any downstream `stat()` fails with `ENOENT`. diff --git a/crates/openshell-vm/src/lib.rs b/crates/openshell-vm/src/lib.rs index 2f8280b9c..ba5d64663 100644 --- a/crates/openshell-vm/src/lib.rs +++ b/crates/openshell-vm/src/lib.rs @@ -1727,7 +1727,6 @@ fn bootstrap_gateway(rootfs: &Path, gateway_name: &str, gateway_port: u16) -> Re name: gateway_name.to_string(), gateway_endpoint: format!("https://127.0.0.1:{gateway_port}"), gateway_port, - client_lifecycle_managed: Some(true), ..Default::default() }; diff --git a/deploy/docker/Dockerfile.images b/deploy/docker/Dockerfile.images index 5806f6919..62662fa93 100644 --- a/deploy/docker/Dockerfile.images +++ b/deploy/docker/Dockerfile.images @@ -8,7 +8,6 @@ # Targets: # gateway Final gateway image # supervisor Final supervisor image (Ubuntu base, supervisor binary) -# cluster Final cluster image # # Rust binaries are built natively before the image build and staged at: # deploy/docker/.build/prebuilt-binaries//openshell-{gateway,sandbox} @@ -17,14 +16,6 @@ # binaries inside Docker instead. 
BuildKit only executes the selected binary # staging stage, so missing prebuilt files do not cause a build failure. -# Pin by tag AND manifest-list digest to prevent silent upstream republishes -# from breaking the build. Update both when bumping k3s versions. -# To refresh: docker buildx imagetools inspect rancher/k3s: | head -3 -ARG K3S_VERSION=v1.35.3-k3s1 -ARG K3S_DIGEST=sha256:4607083d3cac07e1ccde7317297271d13ed5f60f35a78f33fcef84858a9f1d69 -ARG K9S_VERSION=v0.50.18 -ARG HELM_VERSION=v3.17.3 -ARG NVIDIA_CONTAINER_TOOLKIT_VERSION=1.18.2-1 # Controls binary source: 0 = prebuilt (release), 1 = compile in Docker (local dev). # Must be declared here (global scope) so it can be used in FROM instructions below. ARG BUILD_FROM_SOURCE=0 @@ -129,103 +120,3 @@ CMD ["--bind-address", "0.0.0.0", "--port", "8080"] # the binary, breaking the Kubernetes init-container path. FROM nvcr.io/nvidia/base/ubuntu:noble-20251013 AS supervisor COPY --from=supervisor-binary /build/out/openshell-sandbox /openshell-sandbox - -# --------------------------------------------------------------------------- -# Cluster asset stages -# --------------------------------------------------------------------------- -FROM rancher/k3s:${K3S_VERSION}@${K3S_DIGEST} AS k3s - -FROM ubuntu:24.04 AS k9s -ARG K9S_VERSION -ARG TARGETARCH -RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certificates && \ - curl -fsSL "https://github.com/derailed/k9s/releases/download/${K9S_VERSION}/k9s_Linux_${TARGETARCH}.tar.gz" \ - | tar xz -C /tmp k9s && \ - chmod +x /tmp/k9s && \ - rm -rf /var/lib/apt/lists/* - -FROM ubuntu:24.04 AS helm -ARG HELM_VERSION -ARG TARGETARCH -RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certificates && \ - curl -fsSL "https://get.helm.sh/helm-${HELM_VERSION}-linux-${TARGETARCH}.tar.gz" \ - | tar xz --strip-components=1 -C /tmp "linux-${TARGETARCH}/helm" && \ - chmod +x /tmp/helm && \ - rm -rf /var/lib/apt/lists/* - -FROM ubuntu:24.04 AS 
nvidia-container-toolkit -ARG NVIDIA_CONTAINER_TOOLKIT_VERSION - -RUN apt-get update && apt-get install -y --no-install-recommends \ - gpg curl ca-certificates && \ - curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ - | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg && \ - curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ - | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ - | tee /etc/apt/sources.list.d/nvidia-container-toolkit.list && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - "nvidia-container-toolkit-base=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" && \ - rm -rf /var/lib/apt/lists/* - -# --------------------------------------------------------------------------- -# Final cluster image -# --------------------------------------------------------------------------- -FROM nvcr.io/nvidia/base/ubuntu:noble-20251013 AS cluster - -RUN apt-get update && apt-get install -y --no-install-recommends \ - ca-certificates \ - iptables \ - mount \ - dnsutils \ - && apt-get install -y --only-upgrade gpgv \ - && rm -rf /var/lib/apt/lists/* - -COPY --from=k3s /bin/ /bin/ -COPY --from=k9s /tmp/k9s /usr/local/bin/k9s -COPY --from=helm /tmp/helm /usr/local/bin/helm -COPY --from=k3s /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/k3s-ca-certificates.crt -COPY --from=k3s /usr/share/zoneinfo/ /usr/share/zoneinfo/ - -ENV PATH="/var/lib/rancher/k3s/data/cni:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/bin/aux" \ - CRI_CONFIG_FILE="/var/lib/rancher/k3s/agent/etc/crictl.yaml" - -COPY --from=nvidia-container-toolkit /usr/bin/nvidia-cdi-hook /usr/bin/ -COPY --from=nvidia-container-toolkit /usr/bin/nvidia-container-runtime /usr/bin/ -COPY --from=nvidia-container-toolkit /usr/bin/nvidia-ctk /usr/bin/ -COPY --from=nvidia-container-toolkit /etc/nvidia-container-runtime /etc/nvidia-container-runtime -COPY 
--from=supervisor-binary /build/out/openshell-sandbox /opt/openshell/bin/openshell-sandbox - -RUN mkdir -p /var/lib/rancher/k3s/server/manifests \ - /var/lib/rancher/k3s/server/static/charts \ - /etc/rancher/k3s \ - /opt/openshell/manifests \ - /opt/openshell/charts \ - /opt/openshell/gpu-manifests \ - /run/flannel - -COPY deploy/docker/cluster-entrypoint.sh /usr/local/bin/cluster-entrypoint.sh -RUN chmod +x /usr/local/bin/cluster-entrypoint.sh - -COPY deploy/docker/cluster-healthcheck.sh /usr/local/bin/cluster-healthcheck.sh -RUN chmod +x /usr/local/bin/cluster-healthcheck.sh - -COPY deploy/docker/.build/charts/*.tgz /opt/openshell/charts/ -# Only the core k3s auto-deploy manifests belong in the cluster image. -# Gateway API routing is optional and requires Envoy Gateway CRDs, so -# deploy/kube/manifests/envoy-gateway-openshell.yaml stays repo-local and is -# applied manually by `mise run helm:gateway:apply` when grpcRoute is enabled. -COPY deploy/kube/manifests/openshell-helmchart.yaml \ - deploy/kube/manifests/agent-sandbox.yaml \ - /opt/openshell/manifests/ -COPY deploy/kube/gpu-manifests/*.yaml /opt/openshell/gpu-manifests/ - -ENTRYPOINT ["/usr/local/bin/cluster-entrypoint.sh"] -# Default to "server" so bare `docker run ` works without requiring -# the caller to pass a subcommand. The openshell CLI already passes -# ["server", "--disable=traefik", ...] as CMD; this default only affects -# manual `docker run` invocations that omit a command. -CMD ["server"] - -HEALTHCHECK --interval=5s --timeout=5s --start-period=20s --retries=60 \ - CMD ["/usr/local/bin/cluster-healthcheck.sh"] diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh deleted file mode 100644 index 56e577e7f..000000000 --- a/deploy/docker/cluster-entrypoint.sh +++ /dev/null @@ -1,630 +0,0 @@ -#!/bin/sh - -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -# Entrypoint script for OpenShell cluster image. -# -# This script configures DNS resolution for k3s when running in Docker. -# -# Problem: On Docker custom networks, /etc/resolv.conf contains 127.0.0.11 -# (Docker's internal DNS). k3s detects this loopback address and automatically -# falls back to 8.8.8.8 - but on Docker Desktop (Mac/Windows), external UDP -# traffic to 8.8.8.8:53 doesn't work due to network limitations. The host -# gateway IP (host.docker.internal) is reachable but doesn't run a DNS server -# either. -# -# Solution: Use iptables to proxy DNS from the container's eth0 IP to Docker's -# embedded DNS resolver at 127.0.0.11. Docker's DNS listens on random high -# ports (visible in the DOCKER_OUTPUT iptables chain), so we parse those ports -# and set up DNAT rules to forward DNS traffic from k3s pods. We then point -# k3s's resolv-conf kubelet arg at the container's routable eth0 IP. -# -# Per k3s docs: "Manually specified resolver configuration files are not -# subject to viability checks." - -set -e - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- -# Escape a value for safe embedding as a YAML single-quoted scalar. -# Single quotes are the only character that needs escaping (' -> ''). -yaml_quote() { - printf "'%s'" "$(printf '%s' "$1" | sed "s/'/''/g")" -} - -# --------------------------------------------------------------------------- -# Select iptables backend -# --------------------------------------------------------------------------- -# Some kernels (e.g. Jetson Linux 5.15-tegra) have the nf_tables subsystem -# but lack the nft_compat bridge that allows flannel and kube-proxy to use -# xt extension modules (xt_comment, xt_conntrack). Detect this by probing -# whether xt_comment is usable via the current iptables backend. If the -# probe fails, switch to iptables-legacy. 
Set USE_IPTABLES_LEGACY=1 -# externally to skip the probe and force the legacy backend. -# --------------------------------------------------------------------------- -# Check br_netfilter kernel module -# --------------------------------------------------------------------------- -# br_netfilter makes the kernel pass bridge (pod-to-pod) traffic through -# iptables. Without it, kube-proxy's DNAT rules for ClusterIP services are -# never applied to pod traffic, so pods cannot reach services such as -# kube-dns (10.43.0.10), breaking all in-cluster DNS resolution. -# -# The module must be loaded on the HOST before the container starts — -# containers cannot load kernel modules themselves. If it is missing, log a -# warning rather than failing hard: some kernels have bridge netfilter support -# built-in or expose it differently, and will work correctly without the module -# being explicitly loaded as a separate .ko. -if [ ! -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then - echo "Warning: br_netfilter does not appear to be loaded on the host." >&2 - echo " Pod-to-service networking (including kube-dns) may not work without it." >&2 - echo " If the cluster fails to start or DNS is broken, try loading it on the host:" >&2 - echo " sudo modprobe br_netfilter" >&2 - echo " To persist across reboots:" >&2 - echo " echo br_netfilter | sudo tee /etc/modules-load.d/br_netfilter.conf" >&2 -fi - -if [ -z "${USE_IPTABLES_LEGACY:-}" ]; then - if iptables -t filter -N _xt_probe 2>/dev/null; then - _probe_rc=0 - iptables -t filter -A _xt_probe -m comment --comment "probe" -j ACCEPT \ - 2>/dev/null || _probe_rc=$? 
- iptables -t filter -D _xt_probe -m comment --comment "probe" -j ACCEPT \ - 2>/dev/null || true - iptables -t filter -X _xt_probe 2>/dev/null || true - [ "$_probe_rc" -ne 0 ] && USE_IPTABLES_LEGACY=1 - fi -fi - -if [ "${USE_IPTABLES_LEGACY:-0}" = "1" ]; then - echo "iptables nf_tables xt extension bridge unavailable — switching to iptables-legacy" - if update-alternatives --set iptables /usr/sbin/iptables-legacy 2>/dev/null && \ - update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy 2>/dev/null; then - echo "Now using iptables-legacy mode" - else - echo "Warning: could not switch to iptables-legacy — cluster networking may fail" - fi -fi - -IPTABLES=$([ "${USE_IPTABLES_LEGACY:-0}" = "1" ] && echo iptables-legacy || echo iptables) - -RESOLV_CONF="/etc/rancher/k3s/resolv.conf" - -has_default_route() { - ip -4 route show default 2>/dev/null | grep -q '^default ' \ - || ip -6 route show default 2>/dev/null | grep -q '^default ' -} - -wait_for_default_route() { - attempts=${1:-30} - delay_s=${2:-1} - i=1 - - while [ "$i" -le "$attempts" ]; do - if has_default_route; then - return 0 - fi - sleep "$delay_s" - i=$((i + 1)) - done - - echo "Error: no default route present before starting k3s" - echo "IPv4 routes:" - ip -4 route show 2>/dev/null || true - echo "IPv6 routes:" - ip -6 route show 2>/dev/null || true - echo "/proc/net/route:" - cat /proc/net/route 2>/dev/null || true - echo "/proc/net/ipv6_route:" - cat /proc/net/ipv6_route 2>/dev/null || true - return 1 -} - -# --------------------------------------------------------------------------- -# Configure DNS proxy via iptables -# --------------------------------------------------------------------------- -# Docker's embedded DNS (127.0.0.11) is only reachable from the container's -# own network namespace via iptables OUTPUT rules. k3s pods run in separate -# network namespaces and route through PREROUTING, so they can't reach it -# directly. We solve this by: -# 1. 
Discovering the real DNS listener ports from Docker's iptables rules -# 2. Picking the container's eth0 IP as a routable DNS address -# 3. Adding DNAT rules so traffic to :53 reaches Docker's DNS -# 4. Writing that IP into the k3s resolv.conf - -setup_dns_proxy() { - # Extract Docker's actual DNS listener ports from the DOCKER_OUTPUT chain. - # Docker sets up rules like: - # -A DOCKER_OUTPUT -d 127.0.0.11/32 -p udp --dport 53 -j DNAT --to-destination 127.0.0.11: - # -A DOCKER_OUTPUT -d 127.0.0.11/32 -p tcp --dport 53 -j DNAT --to-destination 127.0.0.11: - UDP_PORT=$($IPTABLES -t nat -S DOCKER_OUTPUT 2>/dev/null \ - | grep -- '-p udp.*--dport 53' \ - | sed -n 's/.*--to-destination 127.0.0.11:\([0-9]*\).*/\1/p' \ - | head -1) - TCP_PORT=$($IPTABLES -t nat -S DOCKER_OUTPUT 2>/dev/null \ - | grep -- '-p tcp.*--dport 53' \ - | sed -n 's/.*--to-destination 127.0.0.11:\([0-9]*\).*/\1/p' \ - | head -1) - - if [ -z "$UDP_PORT" ] || [ -z "$TCP_PORT" ]; then - echo "Warning: Could not discover Docker DNS ports from iptables" - echo " UDP_PORT=${UDP_PORT:-} TCP_PORT=${TCP_PORT:-}" - return 1 - fi - - # Get the container's routable (non-loopback) IP - CONTAINER_IP=$(ip -4 addr show eth0 2>/dev/null \ - | awk '/inet /{print $2}' | cut -d/ -f1 | head -1) - - if [ -z "$CONTAINER_IP" ]; then - echo "Warning: Could not determine container IP from eth0" - return 1 - fi - - echo "Setting up DNS proxy: ${CONTAINER_IP}:53 -> 127.0.0.11 (udp:${UDP_PORT}, tcp:${TCP_PORT})" - - # Forward DNS from pods (PREROUTING) and local processes (OUTPUT) to Docker's DNS - $IPTABLES -t nat -I PREROUTING -p udp --dport 53 -d "$CONTAINER_IP" -j DNAT \ - --to-destination "127.0.0.11:${UDP_PORT}" - $IPTABLES -t nat -I PREROUTING -p tcp --dport 53 -d "$CONTAINER_IP" -j DNAT \ - --to-destination "127.0.0.11:${TCP_PORT}" - - echo "nameserver $CONTAINER_IP" > "$RESOLV_CONF" - echo "Configured k3s DNS to use ${CONTAINER_IP} (proxied to Docker DNS)" -} - -if ! 
setup_dns_proxy; then - echo "DNS proxy setup failed, falling back to public DNS servers" - echo "Note: this may not work on Docker Desktop (Mac/Windows)" - cat > "$RESOLV_CONF" </dev/null 2>&1; then - return 0 - fi - sleep 1 - i=$((i + 1)) - done - return 1 -} - -if ! verify_dns; then - echo "DNS_PROBE_FAILED: cannot resolve ${REGISTRY_HOST:-ghcr.io} after DNS proxy setup" - echo " resolv.conf: $(cat "$RESOLV_CONF")" - echo " This usually means Docker DNS forwarding is broken." - echo " Try restarting Docker or pruning networks: docker network prune -f" - # Don't exit — let k3s start so the Rust-side polling loop can detect the - # failure via the log marker and present a user-friendly diagnosis. -fi - -# --------------------------------------------------------------------------- -# Generate k3s private registry configuration -# --------------------------------------------------------------------------- -# Write registries.yaml so k3s/containerd can authenticate when pulling -# component and community sandbox images from the registry at runtime. -# Credentials are passed as environment variables by the bootstrap code. 
-REGISTRIES_YAML="/etc/rancher/k3s/registries.yaml" -if [ -n "${REGISTRY_HOST:-}" ]; then - REGISTRY_SCHEME="https" - REGISTRY_ENDPOINT="${REGISTRY_ENDPOINT:-${REGISTRY_HOST}}" - insecure_value=$(printf '%s' "${REGISTRY_INSECURE:-false}" | tr '[:upper:]' '[:lower:]') - if [ "$insecure_value" = "true" ] || [ "$insecure_value" = "1" ] || [ "$insecure_value" = "yes" ] || [ "$insecure_value" = "on" ]; then - REGISTRY_SCHEME="http" - fi - - echo "Configuring registry mirror for ${REGISTRY_HOST} via ${REGISTRY_ENDPOINT} (${REGISTRY_SCHEME})" - cat > "$REGISTRIES_YAML" <> "$REGISTRIES_YAML" <> "$REGISTRIES_YAML" <> "$REGISTRIES_YAML" <> "$REGISTRIES_YAML" </dev/null 2>&1; then - CHART_CHECKSUM=$(sha256sum "$OPENSHELL_CHART" | cut -d ' ' -f 1) - elif command -v shasum >/dev/null 2>&1; then - CHART_CHECKSUM=$(shasum -a 256 "$OPENSHELL_CHART" | cut -d ' ' -f 1) - fi - fi -fi - -# Copy bundled manifests to k3s manifests directory. -# These are stored in /opt/openshell/manifests/ because the volume mount -# on /var/lib/rancher/k3s overwrites any files baked into that path. -# -# When reusing a persistent volume from a previous deploy, stale manifests -# (e.g. envoy-gateway-helmchart.yaml from an older image) may linger. -# We remove any openshell-managed manifests that no longer exist in the -# bundled set so k3s does not keep installing removed components. -K3S_MANIFESTS="/var/lib/rancher/k3s/server/manifests" -BUNDLED_MANIFESTS="/opt/openshell/manifests" - -if [ -d "$BUNDLED_MANIFESTS" ]; then - echo "Copying bundled manifests to k3s..." - for manifest in "$BUNDLED_MANIFESTS"/*.yaml; do - [ ! -f "$manifest" ] && continue - cp "$manifest" "$K3S_MANIFESTS/" - done - - # Remove openshell-managed manifests that are no longer bundled. - # Only clean up files that look like openshell manifests (openshell-* or - # envoy-gateway-* or agent-*) to avoid removing built-in k3s manifests. 
- for existing in "$K3S_MANIFESTS"/openshell-*.yaml \ - "$K3S_MANIFESTS"/envoy-gateway-*.yaml \ - "$K3S_MANIFESTS"/agent-*.yaml; do - [ ! -f "$existing" ] && continue - basename=$(basename "$existing") - if [ ! -f "$BUNDLED_MANIFESTS/$basename" ]; then - echo "Removing stale manifest: $basename" - rm -f "$existing" - fi - done -fi - -# --------------------------------------------------------------------------- -# GPU support: deploy NVIDIA device plugin when GPU_ENABLED=true -# --------------------------------------------------------------------------- -# When the cluster is started with --gpu, the bootstrap code sets -# GPU_ENABLED=true. This copies the NVIDIA device plugin HelmChart CR into -# the k3s manifests directory so the Helm controller installs it automatically. -# The nvidia-container-runtime binary is already on PATH (baked into the image) -# so k3s registers the "nvidia" RuntimeClass at startup. -if [ "${GPU_ENABLED:-}" = "true" ]; then - echo "GPU support enabled — deploying NVIDIA device plugin" - - GPU_MANIFESTS="/opt/openshell/gpu-manifests" - if [ -d "$GPU_MANIFESTS" ]; then - for manifest in "$GPU_MANIFESTS"/*.yaml; do - [ ! -f "$manifest" ] && continue - cp "$manifest" "$K3S_MANIFESTS/" - done - fi -fi - -# --------------------------------------------------------------------------- -# Detect host gateway IP for sandbox pod hostAliases -# --------------------------------------------------------------------------- -# Sandbox pods need to reach services running on the Docker host (e.g. -# provider endpoints during local development). On Docker Desktop, -# host.docker.internal resolves to a special host-reachable IP that is NOT the -# bridge default gateway, so prefer Docker's own resolution when available. -# Fall back to the container default gateway on Linux engines where -# host.docker.internal commonly maps to the bridge gateway anyway. 
-HOST_GATEWAY_IP=$(getent ahostsv4 host.docker.internal 2>/dev/null | awk 'NR == 1 { print $1; exit }') -if [ -n "$HOST_GATEWAY_IP" ]; then - echo "Detected host gateway IP from host.docker.internal: $HOST_GATEWAY_IP" -else - HOST_GATEWAY_IP=$(ip -4 route | awk '/default/ { print $3; exit }') - if [ -n "$HOST_GATEWAY_IP" ]; then - echo "Detected host gateway IP from default route: $HOST_GATEWAY_IP" - else - echo "Warning: Could not detect host gateway IP from host.docker.internal or default route" - fi -fi - -# --------------------------------------------------------------------------- -# Override image tag and pull policy for local development -# --------------------------------------------------------------------------- -# When IMAGE_TAG is set, replace the default ":latest" tag on all component -# images in the HelmChart manifest so k3s deploys the locally-pushed versions. -# When IMAGE_PULL_POLICY is set, override the default "Always" so k3s uses -# images already present in containerd instead of pulling from the registry. -HELMCHART="/var/lib/rancher/k3s/server/manifests/openshell-helmchart.yaml" - -if [ -n "${IMAGE_REPO_BASE:-}" ] && [ -f "$HELMCHART" ]; then - echo "Setting image repository base: ${IMAGE_REPO_BASE}" - sed -i -E "s|repository:[[:space:]]*[^[:space:]]+|repository: ${IMAGE_REPO_BASE}/gateway|" "$HELMCHART" - # Sandbox images come from the community registry — do not override -fi - -# In push mode, use the exact image references that were imported into cluster -# containerd so the Helm release cannot drift back to remote ":latest" tags. -# Gateway and supervisor images may be pushed; sandbox base images are pulled -# from the community registry at runtime. 
-if [ -n "${PUSH_IMAGE_REFS:-}" ] && [ -f "$HELMCHART" ]; then - server_image="" - supervisor_image="" - old_ifs="$IFS" - IFS=',' - for ref in $PUSH_IMAGE_REFS; do - case "$ref" in - */gateway:*) server_image="$ref" ;; - */supervisor:*) supervisor_image="$ref" ;; - esac - done - IFS="$old_ifs" - - if [ -n "$server_image" ]; then - server_repo="${server_image%:*}" - server_tag="${server_image##*:}" - echo "Setting server image repository: ${server_repo}" - echo "Setting server image tag: ${server_tag}" - sed -i -E "s|repository:[[:space:]]*[^[:space:]]+|repository: ${server_repo}|" "$HELMCHART" - sed -i -E "s|tag:[[:space:]]*\"?[^\"[:space:]]+\"?|tag: \"${server_tag}\"|" "$HELMCHART" - fi - - if [ -n "$supervisor_image" ]; then - echo "Setting supervisor image: ${supervisor_image}" - sed -i -E "s|supervisorImage:[[:space:]]*\"?[^\"]+\"?|supervisorImage: ${supervisor_image}|" "$HELMCHART" - fi -fi - -if [ -n "${IMAGE_TAG:-}" ] && [ -f "$HELMCHART" ]; then - echo "Overriding gateway and supervisor image tags to: ${IMAGE_TAG}" - # server image tag (standalone value field) - # Handle both quoted and unquoted defaults: tag: "latest" / tag: latest - sed -i -E "s|tag:[[:space:]]*\"?latest\"?|tag: \"${IMAGE_TAG}\"|" "$HELMCHART" - # supervisor image is a full image ref under server.supervisorImage - sed -i -E "s|(supervisorImage:[[:space:]]*\"?[^\"]*:)[^\"[:space:]]+(\"?)|\\1${IMAGE_TAG}\\2|" "$HELMCHART" -fi - -if [ -f "$HELMCHART" ]; then - IMAGE_PULL_POLICY_VALUE="${IMAGE_PULL_POLICY:-Always}" - if [ -n "${IMAGE_PULL_POLICY:-}" ]; then - echo "Overriding image pull policy to: ${IMAGE_PULL_POLICY}" - fi - sed -i "s|__IMAGE_PULL_POLICY__|${IMAGE_PULL_POLICY_VALUE}|g" "$HELMCHART" - - SANDBOX_IMAGE_PULL_POLICY_VALUE="${SANDBOX_IMAGE_PULL_POLICY:-\"\"}" - sed -i "s|__SANDBOX_IMAGE_PULL_POLICY__|${SANDBOX_IMAGE_PULL_POLICY_VALUE}|g" "$HELMCHART" - - DB_URL_VALUE="${DB_URL:-\"sqlite:/var/openshell/openshell.db\"}" - sed -i "s|__DB_URL__|${DB_URL_VALUE}|g" "$HELMCHART" -fi - -# 
SSH handshake secret: previously generated here and injected via sed into the -# HelmChart CR. Now persisted as a Kubernetes Secret (openshell-ssh-handshake) -# created by the bootstrap process after k3s starts. This ensures the secret -# survives container restarts without regeneration. - -# Inject SSH gateway host/port into the HelmChart manifest so the openshell -# server returns the correct address to CLI clients for SSH proxy CONNECT. -if [ -f "$HELMCHART" ]; then - if [ -n "$SSH_GATEWAY_HOST" ]; then - echo "Setting SSH gateway host: $SSH_GATEWAY_HOST" - sed -i "s|__SSH_GATEWAY_HOST__|${SSH_GATEWAY_HOST}|g" "$HELMCHART" - else - # Clear the placeholder so the default (127.0.0.1) is used - sed -i "s|sshGatewayHost: __SSH_GATEWAY_HOST__|sshGatewayHost: \"\"|g" "$HELMCHART" - fi - if [ -n "$SSH_GATEWAY_PORT" ]; then - echo "Setting SSH gateway port: $SSH_GATEWAY_PORT" - sed -i "s|__SSH_GATEWAY_PORT__|${SSH_GATEWAY_PORT}|g" "$HELMCHART" - else - # Clear the placeholder so the default (8080) is used - sed -i "s|sshGatewayPort: __SSH_GATEWAY_PORT__|sshGatewayPort: 0|g" "$HELMCHART" - fi - # Disable gateway auth: when set, the server accepts connections without - # client certificates (for reverse-proxy / Cloudflare Tunnel deployments). - if [ "${DISABLE_GATEWAY_AUTH:-}" = "true" ]; then - echo "Disabling gateway auth (mTLS client cert not required)" - sed -i "s|__DISABLE_GATEWAY_AUTH__|true|g" "$HELMCHART" - else - sed -i "s|__DISABLE_GATEWAY_AUTH__|false|g" "$HELMCHART" - fi - - # OIDC JWT authentication: when OIDC_ISSUER is set, the server validates - # Bearer tokens on gRPC requests against the issuer's JWKS endpoint. 
- if [ -n "${OIDC_ISSUER:-}" ]; then - echo "Enabling OIDC authentication (issuer: ${OIDC_ISSUER})" - sed -i "s|__OIDC_ISSUER__|${OIDC_ISSUER}|g" "$HELMCHART" - sed -i "s|__OIDC_AUDIENCE__|${OIDC_AUDIENCE:-openshell-cli}|g" "$HELMCHART" - sed -i "s|__OIDC_ROLES_CLAIM__|${OIDC_ROLES_CLAIM:-realm_access.roles}|g" "$HELMCHART" - sed -i "s|__OIDC_ADMIN_ROLE__|${OIDC_ADMIN_ROLE:-openshell-admin}|g" "$HELMCHART" - sed -i "s|__OIDC_USER_ROLE__|${OIDC_USER_ROLE:-openshell-user}|g" "$HELMCHART" - sed -i "s|__OIDC_SCOPES_CLAIM__|${OIDC_SCOPES_CLAIM:-}|g" "$HELMCHART" - else - sed -i "s|__OIDC_ISSUER__||g" "$HELMCHART" - sed -i "s|__OIDC_AUDIENCE__|openshell-cli|g" "$HELMCHART" - sed -i "s|__OIDC_ROLES_CLAIM__||g" "$HELMCHART" - sed -i "s|__OIDC_ADMIN_ROLE__||g" "$HELMCHART" - sed -i "s|__OIDC_USER_ROLE__||g" "$HELMCHART" - sed -i "s|__OIDC_SCOPES_CLAIM__||g" "$HELMCHART" - fi - - # Disable TLS entirely: the server listens on plaintext HTTP. - # Used when a reverse proxy / tunnel terminates TLS at the edge. - if [ "${DISABLE_TLS:-}" = "true" ]; then - echo "Disabling TLS (plaintext HTTP)" - sed -i "s|__DISABLE_TLS__|true|g" "$HELMCHART" - # The Helm template automatically rewrites https:// to http:// in - # OPENSHELL_GRPC_ENDPOINT when disableTls is true, so no sed needed here. - else - sed -i "s|__DISABLE_TLS__|false|g" "$HELMCHART" - fi -fi - -# Inject host gateway IP into the HelmChart manifest so sandbox pods can -# reach services on the Docker host via host.docker.internal / host.openshell.internal. 
-if [ -n "$HOST_GATEWAY_IP" ] && [ -f "$HELMCHART" ]; then - echo "Setting host gateway IP: $HOST_GATEWAY_IP" - sed -i "s|__HOST_GATEWAY_IP__|${HOST_GATEWAY_IP}|g" "$HELMCHART" -else - # Clear the placeholder so the server gets an empty string (feature disabled) - sed -i "s|hostGatewayIP: __HOST_GATEWAY_IP__|hostGatewayIP: \"\"|g" "$HELMCHART" -fi - -# Inject chart checksum into the HelmChart manifest so that a changed chart -# tarball causes the HelmChart CR spec to differ, forcing the k3s Helm -# controller to upgrade the release. -if [ -n "$CHART_CHECKSUM" ] && [ -f "$HELMCHART" ]; then - echo "Injecting chart checksum: ${CHART_CHECKSUM}" - sed -i "s|__CHART_CHECKSUM__|${CHART_CHECKSUM}|g" "$HELMCHART" -else - # Remove the placeholder line entirely so invalid YAML isn't left behind - sed -i '/__CHART_CHECKSUM__/d' "$HELMCHART" -fi - -# --------------------------------------------------------------------------- -# Ensure flannel CNI directories exist -# --------------------------------------------------------------------------- -# k3s uses flannel as its default CNI. Flannel writes subnet configuration to -# /run/flannel/subnet.env during startup. When running inside a Docker -# container, /run/flannel/ may not exist, causing a race where kubelet tries -# to create pod sandboxes before flannel can write the file. Without it, every -# pod (including CoreDNS) fails with: -# plugin type="flannel" failed (add): failed to load flannel 'subnet.env' -# Pre-creating the directory eliminates this failure mode. -mkdir -p /run/flannel - -# --------------------------------------------------------------------------- -# Detect cgroup version and set kubelet compatibility flags -# --------------------------------------------------------------------------- -# Kubernetes 1.35+ (k3s v1.35.x) rejects cgroup v1 by default. Hosts running -# older distros (e.g. Rocky Linux 8, CentOS 7/8, Ubuntu 18.04) still use -# cgroup v1. 
When we detect cgroup v1, pass --kubelet-arg=fail-cgroupv1=false -# so kubelet warns instead of refusing to start. This flag can be removed once -# cgroup v1 support is no longer needed. -EXTRA_KUBELET_ARGS="" -if [ ! -f /sys/fs/cgroup/cgroup.controllers ]; then - echo "Detected cgroup v1 — adding kubelet compatibility flag (fail-cgroupv1=false)" - EXTRA_KUBELET_ARGS="--kubelet-arg=fail-cgroupv1=false" -fi - -# On kernels where xt_comment is unavailable, kube-router's network policy -# controller panics at startup. Disable it when the iptables-legacy probe -# triggered; sandbox isolation is enforced by the NSSH1 HMAC handshake instead. -if [ "${USE_IPTABLES_LEGACY:-0}" = "1" ]; then - EXTRA_KUBELET_ARGS="$EXTRA_KUBELET_ARGS --disable-network-policy" -fi - -# Docker Desktop can briefly start the container before its bridge default route -# is fully installed. k3s exits immediately in that state, so wait briefly for -# routing to settle first. -wait_for_default_route - -# --------------------------------------------------------------------------- -# Deterministic k3s node name -# --------------------------------------------------------------------------- -# By default k3s uses the container hostname (= Docker container ID) as the -# node name. When the container is recreated (e.g. after an image upgrade), -# the container ID changes, registering a new k3s node. The bootstrap code -# then deletes PVCs whose backing PVs have node affinity for the old node — -# wiping the server database and any sandbox persistent volumes. -# -# OPENSHELL_NODE_NAME is set by the bootstrap code to a deterministic value -# derived from the gateway name, so the node identity survives container -# recreation and PVCs are never orphaned. -NODE_NAME_ARG="" -if [ -n "${OPENSHELL_NODE_NAME:-}" ]; then - NODE_NAME_ARG="--node-name=${OPENSHELL_NODE_NAME}" - echo "Using deterministic k3s node name: ${OPENSHELL_NODE_NAME}" -fi - -# Execute k3s with explicit resolv-conf passed as a kubelet arg. 
-# k3s v1.35.2+ no longer accepts --resolv-conf as a top-level server flag; -# it must be passed via --kubelet-arg instead. -# shellcheck disable=SC2086 -exec /bin/k3s "$@" $NODE_NAME_ARG --kubelet-arg=resolv-conf="$RESOLV_CONF" $EXTRA_KUBELET_ARGS diff --git a/deploy/docker/cluster-healthcheck.sh b/deploy/docker/cluster-healthcheck.sh deleted file mode 100644 index 96c326446..000000000 --- a/deploy/docker/cluster-healthcheck.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/bin/sh - -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -set -eu - -export KUBECONFIG=/etc/rancher/k3s/k3s.yaml - -# --------------------------------------------------------------------------- -# Pre-flight: verify container DNS resolution is functional. -# If the DNS proxy is broken, nothing will work (image pulls fail, pods -# can't start, etc.). Fail fast with a clear signal instead of letting the -# health check return unhealthy for 5+ minutes with no useful output. -# --------------------------------------------------------------------------- - -# Check whether a string looks like an IP address (v4 or v6) with optional port. -is_ip_literal() { - local host="${1%:*}" - echo "$host" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$' && return 0 - echo "$host" | grep -qE '^\[?[0-9a-fA-F:]+\]?$' && return 0 - return 1 -} - -DNS_TARGET="${REGISTRY_HOST:-ghcr.io}" -# IP-literal registry hosts (e.g. 127.0.0.1:5000) don't need DNS resolution. -if ! is_ip_literal "$DNS_TARGET"; then - DNS_LOOKUP="${DNS_TARGET%%:*}" - if ! nslookup "$DNS_LOOKUP" >/dev/null 2>&1; then - echo "HEALTHCHECK_DNS_FAILURE: cannot resolve $DNS_TARGET" >&2 - exit 1 - fi -fi - -kubectl get --raw='/readyz' >/dev/null 2>&1 || exit 1 - -# --------------------------------------------------------------------------- -# Check for node pressure conditions (DiskPressure, MemoryPressure, PIDPressure). 
-# When a node is under pressure the kubelet evicts pods and rejects new ones, -# so the cluster will never become healthy. Emit a marker to stderr so the -# bootstrap polling loop can detect it early and surface a clear diagnosis. -# --------------------------------------------------------------------------- -NODE_CONDITIONS=$(kubectl get nodes -o jsonpath='{range .items[*]}{range .status.conditions[*]}{.type}={.status}{"\n"}{end}{end}' 2>/dev/null || true) -for PRESSURE in DiskPressure MemoryPressure PIDPressure; do - if echo "$NODE_CONDITIONS" | grep -q "^${PRESSURE}=True$"; then - echo "HEALTHCHECK_NODE_PRESSURE: ${PRESSURE}" >&2 - fi -done - -kubectl -n openshell get statefulset/openshell >/dev/null 2>&1 || exit 1 -kubectl -n openshell wait --for=jsonpath='{.status.readyReplicas}'=1 statefulset/openshell --timeout=1s >/dev/null 2>&1 || exit 1 - -# --------------------------------------------------------------------------- -# Verify the sandbox supervisor binary exists on the node filesystem. -# Sandbox pods mount /opt/openshell/bin as a read-only hostPath volume and -# exec /opt/openshell/bin/openshell-sandbox as their entrypoint. If the binary -# is missing (e.g. cluster image was built without the staged prebuilt -# binary), every sandbox pod will crash with "no such file or directory". -# --------------------------------------------------------------------------- -if [ ! -x /opt/openshell/bin/openshell-sandbox ]; then - echo "HEALTHCHECK_MISSING_SUPERVISOR: /opt/openshell/bin/openshell-sandbox not found" >&2 - exit 1 -fi - -# Verify TLS secrets exist (created by openshell-bootstrap before the StatefulSet starts) -# Skip when TLS is disabled — secrets are not required. 
-if [ "${DISABLE_TLS:-}" != "true" ]; then - kubectl -n openshell get secret openshell-server-tls >/dev/null 2>&1 || exit 1 - kubectl -n openshell get secret openshell-client-tls >/dev/null 2>&1 || exit 1 -fi - -# Verify SSH handshake secret exists (created by openshell-bootstrap alongside TLS secrets) -kubectl -n openshell get secret openshell-ssh-handshake >/dev/null 2>&1 || exit 1 - -# --------------------------------------------------------------------------- -# Verify the gateway NodePort (30051) is actually accepting TCP connections. -# After a container restart, kube-proxy may need extra time to re-program -# iptables rules for NodePort routing. Without this check the health check -# can pass before the port is routable, causing "Connection refused" on the -# host-mapped port. -# Use /dev/tcp/127.0.0.1/30051 && exec 3>&-' 2>/dev/null || exit 1 diff --git a/deploy/man/openshell.1.md b/deploy/man/openshell.1.md index 65e060899..98a683ec5 100644 --- a/deploy/man/openshell.1.md +++ b/deploy/man/openshell.1.md @@ -8,7 +8,7 @@ date: 2025 # NAME -openshell - CLI for managing OpenShell sandboxes, gateways, and providers +openshell - CLI for managing OpenShell sandboxes, gateway registrations, and providers # SYNOPSIS @@ -18,49 +18,46 @@ openshell - CLI for managing OpenShell sandboxes, gateways, and providers **openshell** is the command-line interface for OpenShell, a platform providing safe, sandboxed runtimes for autonomous AI agents. It manages -the gateway control plane, sandbox lifecycle, credential providers, +gateway registrations, sandbox lifecycle, credential providers, network policies, and inference routing. The CLI communicates with a gateway server over gRPC. The gateway can -run as a systemd user service (RPM deployment with Podman driver), a -Docker container with embedded K3s, or behind a cloud reverse proxy. +run as a package-managed systemd user service, a Helm deployment, a +development task, or behind a cloud reverse proxy. 
# COMMANDS ## Gateway Management -**gateway start** -: Deploy a new gateway using Docker (not applicable to RPM deployments; - use **systemctl --user start openshell-gateway** instead). - -**gateway stop** -: Stop a Docker-managed gateway (use **systemctl --user stop - openshell-gateway** for RPM deployments). - -**gateway destroy** \[**--name** *NAME*\] -: Destroy a gateway. For RPM deployments, this removes the CLI - registration only. - **gateway add** *ENDPOINT* \[**--local**\] \[**--name** *NAME*\] \[**--remote** *USER@HOST*\] : Register an existing gateway with the CLI. +**gateway remove** \[*NAME*\] +: Remove a local CLI registration and stored auth tokens. This does not + stop or destroy the gateway service. + **gateway select** \[*NAME*\] : List registered gateways or switch the active gateway. **gateway info** \[**--name** *NAME*\] -: Show deployment details for a gateway. +: Show registration details for a gateway. + +**gateway list** +: List registered gateways. **gateway login** : Re-authenticate with a cloud gateway. +**gateway logout** +: Clear stored authentication credentials for a gateway. + **status** : Check the health of the active gateway. ## Sandbox Management **sandbox create** \[**--from** *IMAGE*\] \[**--policy** *FILE*\] \[**--provider** *NAME*\] \[**--gpu**\] \[**--upload** *SRC:DST*\] \[**--forward** *PORT*\] \[**--** *COMMAND*\] -: Create a new sandbox. If no gateway exists, auto-bootstraps one - (Docker mode only). +: Create a new sandbox on the active gateway. **sandbox list** \[**--selector** *LABEL*\] : List all sandboxes on the active gateway. @@ -145,9 +142,10 @@ Docker container with embedded K3s, or behind a cloud reverse proxy. **term** : Open the real-time TUI dashboard. -**doctor check** \| **logs** \| **exec** \| **llm.txt** -: Diagnostic tools (Docker/K3s mode only; see **TROUBLESHOOTING** - section for RPM alternatives). +**doctor check** +: Validate local Docker prerequisites for standalone gateway development. 
+ For package-managed gateways, prefer systemd, journalctl, kubectl, or Helm + diagnostics. **completions** *SHELL* : Generate shell completions (bash, zsh, fish). diff --git a/deploy/rpm/TROUBLESHOOTING.md b/deploy/rpm/TROUBLESHOOTING.md index 78ef4c475..2c33e1a57 100644 --- a/deploy/rpm/TROUBLESHOOTING.md +++ b/deploy/rpm/TROUBLESHOOTING.md @@ -26,29 +26,29 @@ openshell inference set|get|update openshell settings get|set openshell forward start|stop|list openshell term -openshell gateway add|select|info -openshell gateway destroy (removes CLI registration only) +openshell gateway add|select|info|list|remove ``` -### Commands that do not apply +### Gateway lifecycle -These commands manage Docker container lifecycle and are not applicable -to the RPM/systemd deployment. Use the systemd equivalents instead. +Gateway service lifecycle is owned by systemd for RPM deployments. Use +systemd commands directly: -| CLI command | RPM alternative | -|-------------|-----------------| -| `openshell gateway start` | `systemctl --user start openshell-gateway` | -| `openshell gateway stop` | `systemctl --user stop openshell-gateway` | -| `openshell doctor check` | `systemctl --user status openshell-gateway` | -| `openshell doctor logs` | `journalctl --user -u openshell-gateway` | -| `openshell doctor logs --tail` | `journalctl --user -u openshell-gateway -f` | -| `openshell doctor exec` | Not applicable (no K3s container) | +| Task | Command | +|------|---------| +| Start gateway | `systemctl --user start openshell-gateway` | +| Stop gateway | `systemctl --user stop openshell-gateway` | +| Restart gateway | `systemctl --user restart openshell-gateway` | +| Check status | `systemctl --user status openshell-gateway` | +| View logs | `journalctl --user -u openshell-gateway` | +| Follow logs | `journalctl --user -u openshell-gateway -f` | +| Remove CLI registration | `openshell gateway remove [name]` | ### Building from local Dockerfiles -`openshell sandbox create --from 
./Dockerfile` builds via Docker and -pushes into K3s containerd. With the Podman driver, build the image -with Podman and reference it directly: +`openshell sandbox create --from ./Dockerfile` builds via the local +Docker daemon. With the RPM Podman driver, build the image with Podman +and reference it directly: ```shell podman build -t my-sandbox ./my-dir diff --git a/docs/reference/gateway-auth.mdx b/docs/reference/gateway-auth.mdx index db96a332b..c87006c0d 100644 --- a/docs/reference/gateway-auth.mdx +++ b/docs/reference/gateway-auth.mdx @@ -96,7 +96,7 @@ Register a plaintext gateway with an explicit `http://` endpoint: openshell gateway add http://127.0.0.1:8080 --local ``` -This stores the gateway with `auth_mode = plaintext`, skips mTLS certificate extraction, and does not open the browser login flow. +This stores the gateway with `auth_mode = plaintext`, skips mTLS client certificate lookup, and does not open the browser login flow. ## File Layout diff --git a/docs/sandboxes/manage-gateways.mdx b/docs/sandboxes/manage-gateways.mdx index 82eb62031..c16c13799 100644 --- a/docs/sandboxes/manage-gateways.mdx +++ b/docs/sandboxes/manage-gateways.mdx @@ -185,6 +185,12 @@ openshell gateway info openshell gateway info --name production ``` +Remove a local CLI registration without stopping the gateway service: + +```shell +openshell gateway remove production +``` + ## Troubleshoot Check gateway health: diff --git a/e2e/rust/tests/cf_auth_smoke.rs b/e2e/rust/tests/cf_auth_smoke.rs index 1e8d616e3..29d436227 100644 --- a/e2e/rust/tests/cf_auth_smoke.rs +++ b/e2e/rust/tests/cf_auth_smoke.rs @@ -62,20 +62,22 @@ async fn run_with_config(tmpdir: &std::path::Path, args: &[&str]) -> (String, i3 } // ------------------------------------------------------------------- -// Test 8: `--plaintext` flag is recognized +// Test 8: gateway lifecycle commands are not exposed through the CLI // ------------------------------------------------------------------- -/// `openshell gateway 
start --help` must show `--plaintext`. +/// `openshell gateway --help` must not show removed lifecycle commands. #[tokio::test] -async fn gateway_start_help_shows_plaintext() { - let (output, code) = run_isolated(&["gateway", "start", "--help"]).await; - assert_eq!(code, 0, "gateway start --help should exit 0:\n{output}"); +async fn gateway_help_omits_lifecycle_commands() { + let (output, code) = run_isolated(&["gateway", "--help"]).await; + assert_eq!(code, 0, "gateway --help should exit 0:\n{output}"); let clean = strip_ansi(&output); - assert!( - clean.contains("--plaintext"), - "expected '--plaintext' in gateway start --help output:\n{clean}" - ); + for removed in ["start", "stop", "destroy"] { + assert!( + !clean.contains(removed), + "did not expect removed gateway lifecycle command '{removed}' in gateway help:\n{clean}" + ); + } } // ------------------------------------------------------------------- @@ -114,10 +116,6 @@ async fn gateway_add_help_shows_flags() { clean.contains("--remote"), "expected '--remote' in gateway add --help:\n{clean}" ); - assert!( - clean.contains("--ssh-key"), - "expected '--ssh-key' in gateway add --help:\n{clean}" - ); assert!( clean.contains("--local"), "expected '--local' in gateway add --help:\n{clean}" @@ -293,9 +291,9 @@ async fn gateway_add_remote_and_local_conflict() { ); } -/// `--ssh-key` requires `--remote`. +/// `--ssh-key` was removed from `gateway add`. 
#[tokio::test] -async fn gateway_add_ssh_key_requires_remote() { +async fn gateway_add_rejects_removed_ssh_key_flag() { let (output, code) = run_isolated(&[ "gateway", "add", @@ -307,7 +305,7 @@ async fn gateway_add_ssh_key_requires_remote() { assert_ne!( code, 0, - "--ssh-key without --remote should fail:\n{output}" + "--ssh-key should fail after gateway lifecycle bootstrap removal:\n{output}" ); } diff --git a/e2e/rust/tests/cli_smoke.rs b/e2e/rust/tests/cli_smoke.rs index 42e230e23..265b236c4 100644 --- a/e2e/rust/tests/cli_smoke.rs +++ b/e2e/rust/tests/cli_smoke.rs @@ -53,19 +53,27 @@ async fn help_shows_restructured_commands() { } } -/// `openshell gateway --help` must list start, stop, destroy, select, info. +/// `openshell gateway --help` must list registration/auth commands, not +/// service lifecycle commands. #[tokio::test] async fn gateway_help_shows_subcommands() { let (output, code) = run_isolated(&["gateway", "--help"]).await; assert_eq!(code, 0, "openshell gateway --help should exit 0"); let clean = strip_ansi(&output); - for sub in ["start", "stop", "destroy", "select", "info"] { + for sub in ["add", "remove", "login", "logout", "select", "info", "list"] { assert!( clean.contains(sub), "expected '{sub}' in gateway --help output:\n{clean}" ); } + + for removed in ["start", "stop", "destroy"] { + assert!( + !clean.contains(removed), + "did not expect removed gateway lifecycle subcommand '{removed}' in help:\n{clean}" + ); + } } /// `openshell sandbox --help` must list upload and download alongside create, @@ -85,9 +93,7 @@ async fn sandbox_help_shows_upload_download() { } /// `openshell sandbox create --help` must show `--gpu`, `--upload`, -/// `--no-git-ignore`, `--no-bootstrap`, `--editor`, and -/// `--auto-providers`/`--no-auto-providers`. -/// Note: `--bootstrap` is intentionally hidden (it's the default behaviour). +/// `--no-git-ignore`, `--editor`, and `--auto-providers`/`--no-auto-providers`. 
#[tokio::test] async fn sandbox_create_help_shows_new_flags() { let (output, code) = run_isolated(&["sandbox", "create", "--help"]).await; @@ -98,7 +104,6 @@ async fn sandbox_create_help_shows_new_flags() { "--gpu", "--upload", "--no-git-ignore", - "--no-bootstrap", "--editor", "--auto-providers", "--no-auto-providers", @@ -123,20 +128,20 @@ async fn sandbox_connect_help_shows_editor_flag() { ); } -/// `openshell gateway start --help` must show key flags. +/// Removed gateway lifecycle subcommands should fail during parsing. #[tokio::test] -async fn gateway_start_help_shows_key_flags() { - let (output, code) = run_isolated(&["gateway", "start", "--help"]).await; - assert_eq!(code, 0, "openshell gateway start --help should exit 0"); +async fn gateway_lifecycle_subcommands_are_removed() { + for subcommand in ["start", "stop", "destroy"] { + let (output, code) = run_isolated(&["gateway", subcommand, "--help"]).await; + assert!( + code != 0, + "openshell gateway {subcommand} should fail after lifecycle command removal" + ); - let clean = strip_ansi(&output); - for flag in [ - "--gpu", - "--recreate", - ] { + let clean = strip_ansi(&output); assert!( - clean.contains(flag), - "expected '{flag}' in gateway start --help:\n{clean}" + clean.contains("unrecognized subcommand") || clean.contains("error:"), + "expected parser error for removed gateway subcommand '{subcommand}':\n{clean}" ); } } @@ -161,7 +166,7 @@ async fn status_without_gateway_prints_friendly_message() { "expected 'No gateway configured' in status output:\n{clean}" ); assert!( - clean.contains("openshell gateway start"), - "expected hint to run 'openshell gateway start':\n{clean}" + clean.contains("openshell gateway add "), + "expected hint to register a gateway:\n{clean}" ); } diff --git a/e2e/rust/tests/community_image.rs b/e2e/rust/tests/community_image.rs index f68c65441..2fab46385 100644 --- a/e2e/rust/tests/community_image.rs +++ b/e2e/rust/tests/community_image.rs @@ -11,7 +11,7 @@ //! 3. 
Basic command execution works inside the community sandbox //! //! Prerequisites: -//! - A running openshell gateway (`openshell gateway start`) +//! - A running openshell gateway (`mise run gateway:docker`) //! - Network access to ghcr.io/nvidia/openshell-community/sandboxes/ use openshell_e2e::harness::output::strip_ansi; diff --git a/e2e/rust/tests/docker_preflight.rs b/e2e/rust/tests/docker_preflight.rs index 9a6ea9f65..bf05a132d 100644 --- a/e2e/rust/tests/docker_preflight.rs +++ b/e2e/rust/tests/docker_preflight.rs @@ -1,11 +1,10 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -//! Docker preflight e2e tests. +//! Doctor Docker preflight e2e tests. //! -//! These tests verify that the CLI fails fast with actionable guidance when -//! Docker is not available, instead of starting a multi-minute deploy that -//! eventually times out with a cryptic error. +//! These tests verify that `openshell doctor check` reports actionable guidance +//! when Docker is not available. //! //! The tests do NOT require a running gateway or Docker — they intentionally //! point `DOCKER_HOST` at a non-existent socket to simulate Docker being @@ -13,6 +12,10 @@ use std::process::Stdio; use std::time::Instant; +use std::{env, fs}; + +#[cfg(unix)] +use std::os::unix::fs::PermissionsExt; use openshell_e2e::harness::binary::openshell_cmd; use openshell_e2e::harness::output::strip_ansi; @@ -24,12 +27,29 @@ use openshell_e2e::harness::output::strip_ansi; /// fails immediately regardless of the host's Docker configuration. 
async fn run_without_docker(args: &[&str]) -> (String, i32, std::time::Duration) { let tmpdir = tempfile::tempdir().expect("create isolated config dir"); + let bin_dir = tmpdir.path().join("bin"); + fs::create_dir(&bin_dir).expect("create fake bin dir"); + let fake_docker = bin_dir.join("docker"); + fs::write( + &fake_docker, + "#!/bin/sh\n\ + echo 'Cannot connect to Docker daemon. Check DOCKER_HOST and run docker info.' >&2\n\ + exit 1\n", + ) + .expect("write fake docker"); + #[cfg(unix)] + fs::set_permissions(&fake_docker, fs::Permissions::from_mode(0o755)) + .expect("chmod fake docker"); + + let old_path = env::var("PATH").unwrap_or_default(); + let path = format!("{}:{old_path}", bin_dir.display()); let start = Instant::now(); let mut cmd = openshell_cmd(); cmd.args(args) .env("XDG_CONFIG_HOME", tmpdir.path()) .env("HOME", tmpdir.path()) + .env("PATH", path) .env("DOCKER_HOST", "unix:///tmp/openshell-e2e-nonexistent.sock") .env_remove("OPENSHELL_GATEWAY") .env_remove("OPENSHELL_GATEWAY_ENDPOINT") @@ -45,131 +65,6 @@ async fn run_without_docker(args: &[&str]) -> (String, i32, std::time::Duration) (combined, code, elapsed) } -// ------------------------------------------------------------------- -// gateway start: fails fast when Docker is unavailable -// ------------------------------------------------------------------- - -/// `openshell gateway start` with no Docker should fail within seconds -/// (not minutes) and produce a non-zero exit code. -#[tokio::test] -async fn gateway_start_fails_fast_without_docker() { - let (output, code, elapsed) = run_without_docker(&["gateway", "start"]).await; - - assert_ne!( - code, 0, - "gateway start should fail when Docker is unavailable, output:\n{output}" - ); - - // The preflight check should cause failure in under 30 seconds. - // Before the preflight was added, this would time out after several minutes - // waiting for k3s namespace readiness. 
- assert!( - elapsed.as_secs() < 30, - "gateway start should fail fast (took {}s), output:\n{output}", - elapsed.as_secs() - ); -} - -/// When Docker is unavailable, the error output should mention Docker -/// so the user knows what to fix. -#[tokio::test] -async fn gateway_start_error_mentions_docker() { - let (output, code, _) = run_without_docker(&["gateway", "start"]).await; - - assert_ne!(code, 0); - let clean = strip_ansi(&output); - let lower = clean.to_lowercase(); - - assert!( - lower.contains("docker"), - "error output should mention 'Docker' so the user knows what to fix:\n{clean}" - ); -} - -/// When Docker is unavailable, the error output should include guidance -/// about `DOCKER_HOST` since that's the likely fix for non-default runtimes. -#[tokio::test] -async fn gateway_start_error_mentions_docker_host() { - let (output, code, _) = run_without_docker(&["gateway", "start"]).await; - - assert_ne!(code, 0); - let clean = strip_ansi(&output); - - assert!( - clean.contains("DOCKER_HOST"), - "error output should mention DOCKER_HOST for users with non-default socket paths:\n{clean}" - ); -} - -/// When Docker is unavailable, the error output should suggest a -/// verification command like `docker info`. -#[tokio::test] -async fn gateway_start_error_suggests_verification() { - let (output, code, _) = run_without_docker(&["gateway", "start"]).await; - - assert_ne!(code, 0); - let clean = strip_ansi(&output); - - assert!( - clean.contains("docker info"), - "error output should suggest 'docker info' as a verification step:\n{clean}" - ); -} - -// ------------------------------------------------------------------- -// gateway start --recreate: same preflight behavior -// ------------------------------------------------------------------- - -/// `openshell gateway start --recreate` should also fail fast when -/// Docker is unavailable (the recreate flag should not bypass the check). 
-#[tokio::test] -async fn gateway_start_recreate_fails_fast_without_docker() { - let (output, code, elapsed) = run_without_docker(&["gateway", "start", "--recreate"]).await; - - assert_ne!( - code, 0, - "gateway start --recreate should fail when Docker is unavailable, output:\n{output}" - ); - - assert!( - elapsed.as_secs() < 30, - "gateway start --recreate should fail fast (took {}s)", - elapsed.as_secs() - ); -} - -// ------------------------------------------------------------------- -// sandbox create with auto-bootstrap: same preflight behavior -// ------------------------------------------------------------------- - -/// `openshell sandbox create` triggers auto-bootstrap when no gateway -/// exists. With Docker unavailable, it should fail fast with Docker -/// guidance rather than timing out. -#[tokio::test] -async fn sandbox_create_auto_bootstrap_fails_fast_without_docker() { - let (output, code, elapsed) = - run_without_docker(&["sandbox", "create", "--from", "openclaw"]).await; - - assert_ne!( - code, 0, - "sandbox create should fail when Docker is unavailable, output:\n{output}" - ); - - // Auto-bootstrap path should also hit the preflight check quickly. 
- assert!( - elapsed.as_secs() < 30, - "sandbox create should fail fast via auto-bootstrap preflight (took {}s), output:\n{output}", - elapsed.as_secs() - ); - - let clean = strip_ansi(&output); - let lower = clean.to_lowercase(); - assert!( - lower.contains("docker"), - "sandbox create error should mention Docker:\n{clean}" - ); -} - // ------------------------------------------------------------------- // doctor check: validates system prerequisites // ------------------------------------------------------------------- diff --git a/e2e/rust/tests/edge_tunnel_e2e.rs b/e2e/rust/tests/edge_tunnel_e2e.rs index cb2c0bc7a..e39edc3ea 100644 --- a/e2e/rust/tests/edge_tunnel_e2e.rs +++ b/e2e/rust/tests/edge_tunnel_e2e.rs @@ -192,7 +192,7 @@ async fn ws_tunnel_status_through_edge_proxy() { if !endpoint.starts_with("http://") { eprintln!( "Skipping ws_tunnel test: gateway endpoint is not plaintext HTTP: {endpoint}\n\ - Deploy with `openshell gateway start --plaintext` for this test." + Use a plaintext local gateway such as `mise run gateway:docker` for this test." ); return; } diff --git a/e2e/rust/tests/port_forward.rs b/e2e/rust/tests/port_forward.rs index c534dbe28..b198eeb94 100644 --- a/e2e/rust/tests/port_forward.rs +++ b/e2e/rust/tests/port_forward.rs @@ -6,7 +6,7 @@ //! E2E test: TCP port forwarding through a sandbox. //! //! Prerequisites: -//! - A running openshell gateway (`openshell gateway start`) +//! - A running openshell gateway (`mise run gateway:docker`) //! - The `openshell` binary (built automatically from the workspace) use std::time::Duration; diff --git a/e2e/rust/tests/provider_auto_create.rs b/e2e/rust/tests/provider_auto_create.rs index 62fe5a461..46ccb7999 100644 --- a/e2e/rust/tests/provider_auto_create.rs +++ b/e2e/rust/tests/provider_auto_create.rs @@ -14,7 +14,7 @@ //! placeholder made it all the way through to the sandbox process environment. //! //! Prerequisites: -//! - A running openshell gateway (`openshell gateway start`) +//! 
- A running openshell gateway (`mise run gateway:docker`) //! - The `openshell` binary (built automatically from the workspace) use std::process::Stdio; @@ -84,7 +84,6 @@ async fn auto_created_provider_credential_available_in_sandbox() { .arg("--provider") .arg("claude") .arg("--auto-providers") - .arg("--no-bootstrap") .arg("--") .arg("printenv") .arg("ANTHROPIC_API_KEY") diff --git a/e2e/rust/tests/sync.rs b/e2e/rust/tests/sync.rs index f08f670c5..60abb42b4 100644 --- a/e2e/rust/tests/sync.rs +++ b/e2e/rust/tests/sync.rs @@ -6,7 +6,7 @@ //! E2E test: bidirectional file upload/download with a sandbox. //! //! Prerequisites: -//! - A running openshell gateway (`openshell gateway start`) +//! - A running openshell gateway (`mise run gateway:docker`) //! - The `openshell` binary (built automatically from the workspace) use std::fs; diff --git a/e2e/rust/tests/upload_create.rs b/e2e/rust/tests/upload_create.rs index f2c371651..8b978257b 100644 --- a/e2e/rust/tests/upload_create.rs +++ b/e2e/rust/tests/upload_create.rs @@ -10,7 +10,7 @@ //! so the command can read the uploaded content. //! //! Prerequisites: -//! - A running openshell gateway (`openshell gateway start`) +//! - A running openshell gateway (`mise run gateway:docker`) //! - The `openshell` binary (built automatically from the workspace) use std::fs; diff --git a/examples/bring-your-own-container/README.md b/examples/bring-your-own-container/README.md index 26cd3a33b..ea4f1cb9e 100644 --- a/examples/bring-your-own-container/README.md +++ b/examples/bring-your-own-container/README.md @@ -6,7 +6,7 @@ your local machine through port forwarding. 
## Prerequisites -- A running OpenShell gateway (`openshell gateway start`) +- A running OpenShell gateway (`mise run gateway:docker` for local development) - Docker daemon running ## What's in this example diff --git a/examples/local-inference/README.md b/examples/local-inference/README.md index 62e056e35..6214ad1e5 100644 --- a/examples/local-inference/README.md +++ b/examples/local-inference/README.md @@ -69,7 +69,7 @@ stays the same (tokens arrive incrementally). ## Standalone (no cluster) -Run the sandbox binary directly with a route file — no OpenShell cluster needed: +Run the sandbox binary directly with a route file — no gateway needed: ```bash # 1. Edit routes.yaml to point at your local LLM (e.g. LM Studio on :1234) @@ -85,16 +85,16 @@ openshell-sandbox \ The sandbox loads routes from the YAML file at startup and routes inference requests locally — no gRPC server or cluster required. -### With a cluster +### With a gateway -#### 1. Start a OpenShell cluster +#### 1. Start an OpenShell gateway ```bash -mise run cluster +mise run gateway:docker openshell status ``` -#### 2. Configure cluster inference +#### 2. Configure gateway inference First make sure a provider record exists for the backend you want to use: @@ -102,11 +102,11 @@ First make sure a provider record exists for the backend you want to use: openshell provider list ``` -Then configure the cluster-managed `inference.local` route: +Then configure the gateway-managed `inference.local` route: ```bash # Example: use an existing provider record -openshell cluster inference set \ +openshell inference set \ --provider openai-prod \ --model nvidia/nemotron-3-nano-30b-a3b ``` @@ -114,7 +114,7 @@ openshell cluster inference set \ Verify the active config: ```bash -openshell cluster inference get +openshell inference get ``` #### 3. Run the example inside a sandbox @@ -127,8 +127,8 @@ openshell sandbox create \ ``` The script targets `https://inference.local/v1` directly. 
OpenShell -intercepts that connection and routes it to whatever backend cluster -inference is configured to use. +intercepts that connection and routes it to whatever backend gateway inference +is configured to use. Expected output: @@ -154,7 +154,7 @@ openshell sandbox delete inference-demo ## Customizing Routes Edit `routes.yaml` to change which backend endpoint/model standalone mode uses. -In cluster mode, use `openshell cluster inference set` instead. +In gateway mode, use `openshell inference set` instead. ## Supported Protocols diff --git a/examples/multi-agent-notepad/README.md b/examples/multi-agent-notepad/README.md index 5cc1b3819..2bff46cd3 100644 --- a/examples/multi-agent-notepad/README.md +++ b/examples/multi-agent-notepad/README.md @@ -70,7 +70,7 @@ The demo also exercises two OpenShell features: - OpenShell CLI from current `main` (or set `OPENSHELL_BIN` to the binary path) -- A running OpenShell gateway: `openshell gateway start` +- A running OpenShell gateway: `mise run gateway:docker` for local development - Local Codex sign-in on the host: `codex login` - `gh` (GitHub CLI) signed in, **or** a GitHub PAT with `contents:write` - `jq` on the host diff --git a/examples/multi-agent-notepad/demo.sh b/examples/multi-agent-notepad/demo.sh index 7a2e4bf2f..7ee88f9f8 100755 --- a/examples/multi-agent-notepad/demo.sh +++ b/examples/multi-agent-notepad/demo.sh @@ -103,7 +103,7 @@ validate_env() { info "GitHub repo ${DEMO_GITHUB_OWNER}/${DEMO_GITHUB_REPO}@${DEMO_BRANCH} and token present" info "checking OpenShell gateway is reachable..." 
- "$OPENSHELL_BIN" status >/dev/null 2>&1 || fail "OpenShell gateway is not reachable; run: openshell gateway start" + "$OPENSHELL_BIN" status >/dev/null 2>&1 || fail "OpenShell gateway is not reachable; run: mise run gateway:docker" export CODEX_AUTH_ACCESS_TOKEN export CODEX_AUTH_REFRESH_TOKEN diff --git a/examples/policy-advisor/README.md b/examples/policy-advisor/README.md index b62f940b2..52758ad47 100644 --- a/examples/policy-advisor/README.md +++ b/examples/policy-advisor/README.md @@ -67,7 +67,7 @@ SSRF override allows the connection. ### Prerequisites -- A running OpenShell gateway (`openshell gateway start` or a remote gateway) +- A running OpenShell gateway (`mise run gateway:docker` or a remote gateway) - The `openshell` CLI installed - Two terminal windows diff --git a/examples/sandbox-policy-quickstart/README.md b/examples/sandbox-policy-quickstart/README.md index a00f46be4..55a53ecca 100644 --- a/examples/sandbox-policy-quickstart/README.md +++ b/examples/sandbox-policy-quickstart/README.md @@ -7,7 +7,7 @@ while writes are blocked — all without restarting anything. ## Prerequisites -- A running OpenShell gateway (`openshell gateway start`) +- A running OpenShell gateway (`mise run gateway:docker` for local development) - Docker daemon running ## What's in this example diff --git a/examples/vscode-remote-sandbox.md b/examples/vscode-remote-sandbox.md index cdd4c9479..9477932e2 100644 --- a/examples/vscode-remote-sandbox.md +++ b/examples/vscode-remote-sandbox.md @@ -6,7 +6,7 @@ extension so you get a full IDE experience inside the sandbox environment. 
## Prerequisites -- A running openshell gateway (`openshell gateway start`) +- A running OpenShell gateway (`mise run gateway:docker` for local development) - [VSCode](https://code.visualstudio.com/) with the [Remote - SSH](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh) extension installed diff --git a/openshell.spec b/openshell.spec index 82bf6459d..4b7aa5519 100644 --- a/openshell.spec +++ b/openshell.spec @@ -46,13 +46,13 @@ BuildRequires: pandoc # Python sub-package build dependencies BuildRequires: python3-devel -# Runtime: container runtime for gateway lifecycle (start/stop/destroy). +# Runtime: container runtime for package-managed gateway sandboxes. # Podman is preferred; Docker is also supported via --container-runtime flag. Recommends: podman %description OpenShell provides safe, sandboxed runtimes for autonomous AI agents. -It offers a CLI for managing gateways, sandboxes, and providers with +It offers a CLI for managing gateway registrations, sandboxes, and providers with policy-enforced egress routing, credential proxying, and privacy-aware LLM inference routing. diff --git a/scripts/bin/openshell b/scripts/bin/openshell index 0383d5b57..23000bc31 100755 --- a/scripts/bin/openshell +++ b/scripts/bin/openshell @@ -10,10 +10,9 @@ CALLER_PWD="$PWD" # --------------------------------------------------------------------------- # Fingerprint-based rebuild check # -# Mirrors the approach in tasks/scripts/cluster-deploy-fast.sh: collect dirty -# files from git, filter to paths in the openshell-cli dependency closure, -# hash their contents, and compare against a persisted state file. We also -# track HEAD so that branch switches / pulls are detected. +# Collect dirty files from git, filter to paths in the openshell-cli dependency +# closure, hash their contents, and compare against a persisted state file. We +# also track HEAD so that branch switches / pulls are detected. 
# --------------------------------------------------------------------------- needs_build=0 diff --git a/scripts/build-benchmark/README.md b/scripts/build-benchmark/README.md deleted file mode 100644 index 9b6834ec5..000000000 --- a/scripts/build-benchmark/README.md +++ /dev/null @@ -1,43 +0,0 @@ -# Build Benchmark - -Validation harness for cluster deploys. Tests change detection, build routing, and image cache reuse across component changes. All operations run through `mise run cluster` to replicate the real user workflow. - -## Usage - -The script bootstraps a cluster automatically via `mise run cluster` if one isn't already running. - -Run the full suite: - -```sh -scripts/build-benchmark/cluster-deploy-fast-test.sh -``` - -Run specific scenarios: - -```sh -scripts/build-benchmark/cluster-deploy-fast-test.sh noop gateway-auto supervisor-cache -``` - -## Scenarios - -| Scenario | Description | -|---|---| -| `noop` | Clean tree is a no-op after state is primed | -| `gateway-auto` | Gateway-only change triggers gateway rebuild + Helm upgrade | -| `supervisor-auto` | Supervisor-only change triggers supervisor refresh only | -| `shared-auto` | Shared dependency change triggers both rebuilds | -| `helm-auto` | Helm-only change triggers Helm upgrade only | -| `unrelated-auto` | Unrelated file change stays a no-op | -| `explicit-targets` | Explicit targets override change detection | -| `gateway-cache` | Cold vs warm gateway rebuild comparison | -| `supervisor-cache` | Cold vs warm supervisor rebuild comparison | -| `container-invalidation` | Mismatched container ID invalidates gateway + Helm state | - -## Environment Variables - -| Variable | Description | -|---|---| -| `CLUSTER_NAME` | Override cluster name to test against | -| `FAST_DEPLOY_TEST_REPORT_DIR` | Output directory (default: `.cache/cluster-deploy-fast-test/`) | -| `FAST_DEPLOY_TEST_KEEP_WORKTREES` | Set to `1` to keep temporary worktrees | -| `FAST_DEPLOY_TEST_SKIP_CACHE` | Set to `1` to skip cache timing 
scenarios | diff --git a/scripts/build-benchmark/cluster-deploy-fast-test.sh b/scripts/build-benchmark/cluster-deploy-fast-test.sh deleted file mode 100755 index e1867354b..000000000 --- a/scripts/build-benchmark/cluster-deploy-fast-test.sh +++ /dev/null @@ -1,634 +0,0 @@ -#!/usr/bin/env bash - -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -set -euo pipefail - -usage() { - cat <<'EOF' -Usage: cluster-deploy-fast-test.sh [scenario...] - -Repeatable validation harness for tasks/scripts/cluster-deploy-fast.sh. - -Scenarios: - noop Validate clean-tree auto deploy is a no-op after state is primed - gateway-auto Gateway-only change triggers gateway rebuild + Helm upgrade - supervisor-auto Supervisor-only change triggers supervisor refresh only - shared-auto Shared change triggers gateway + supervisor rebuild - helm-auto Helm-only change triggers Helm upgrade only - unrelated-auto Unrelated change stays a no-op - explicit-targets Explicit targets override change detection - gateway-cache Compare cold vs warm gateway rebuild after a code change - supervisor-cache Compare cold vs warm supervisor rebuild after a code change - container-invalidation Mismatched container ID invalidates gateway + Helm state - -If no scenarios are provided, the full suite runs. - -Environment: - CLUSTER_NAME Override cluster name to test against - FAST_DEPLOY_TEST_REPORT_DIR Output directory (default: .cache/cluster-deploy-fast-test/) - FAST_DEPLOY_TEST_KEEP_WORKTREES Keep temporary worktrees when set to 1 - FAST_DEPLOY_TEST_SKIP_CACHE Skip the cache timing scenarios when set to 1 -EOF -} - -if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then - usage - exit 0 -fi - -SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -REPO_ROOT=$(cd "${SCRIPT_DIR}/../.." 
&& pwd) -RUN_ID=$(date +"%Y%m%d-%H%M%S") -REPORT_DIR=${FAST_DEPLOY_TEST_REPORT_DIR:-"${REPO_ROOT}/.cache/cluster-deploy-fast-test/${RUN_ID}"} -WORKTREE_ROOT="${REPORT_DIR}/worktrees" -LOG_DIR="${REPORT_DIR}/logs" -STATE_DIR="${REPORT_DIR}/state" -CACHE_DIR="${REPORT_DIR}/buildkit-cache" -SUMMARY_TSV="${REPORT_DIR}/summary.tsv" -SUMMARY_MD="${REPORT_DIR}/summary.md" -KEEP_WORKTREES=${FAST_DEPLOY_TEST_KEEP_WORKTREES:-0} -SKIP_CACHE=${FAST_DEPLOY_TEST_SKIP_CACHE:-0} - -normalize_name() { - echo "$1" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9-]/-/g' | sed 's/--*/-/g' | sed 's/^-//;s/-$//' -} - -ROOT_BASENAME=$(basename "${REPO_ROOT}") -CLUSTER_NAME=${CLUSTER_NAME:-$(normalize_name "${ROOT_BASENAME}")} - -mkdir -p "${WORKTREE_ROOT}" "${LOG_DIR}" "${STATE_DIR}" "${CACHE_DIR}" - -declare -a SCENARIOS=() -if [[ "$#" -gt 0 ]]; then - SCENARIOS=("$@") -else - SCENARIOS=( - noop - gateway-auto - supervisor-auto - shared-auto - helm-auto - unrelated-auto - explicit-targets - gateway-cache - supervisor-cache - container-invalidation - ) -fi - -if [[ "${SKIP_CACHE}" == "1" ]]; then - declare -a filtered=() - filtered=() - for scenario in "${SCENARIOS[@]}"; do - if [[ "${scenario}" != "gateway-cache" && "${scenario}" != "supervisor-cache" ]]; then - filtered+=("${scenario}") - fi - done - SCENARIOS=("${filtered[@]}") -fi - -declare -a CREATED_WORKTREES=() - -cleanup_worktrees() { - if [[ "${KEEP_WORKTREES}" == "1" ]]; then - return - fi - - local dir - for dir in "${CREATED_WORKTREES[@]:-}"; do - if [[ -d "${dir}" ]]; then - git -C "${REPO_ROOT}" worktree remove --force "${dir}" >/dev/null 2>&1 || true - fi - done -} -# cleanup_worktrees is called by on_exit trap set after TSV header is written - -ensure_cluster() { - echo "Ensuring cluster is running (mise run cluster)..." 
- mise run cluster -} - -create_worktree() { - local name=$1 - local dir="${WORKTREE_ROOT}/${name}" - rm -rf "${dir}" - git -C "${REPO_ROOT}" worktree add --detach "${dir}" HEAD >/dev/null - mise trust "${dir}/mise.toml" >/dev/null 2>&1 || true - CREATED_WORKTREES+=("${dir}") - printf '%s\n' "${dir}" -} - -append_marker() { - local file=$1 - local marker=$2 - printf '\n%s\n' "${marker}" >> "${file}" -} - -extract_plan_value() { - local log_file=$1 - local label=$2 - awk -F': +' -v pattern="${label}" '$0 ~ pattern {print $2; exit}' "${log_file}" -} - -extract_duration() { - local log_file=$1 - local label=$2 - awk -v prefix="${label} took " 'index($0, prefix) == 1 {sub(/^.* took /, "", $0); sub(/s$/, "", $0); print; exit}' "${log_file}" -} - -count_cached_lines() { - local log_file=$1 - grep -c " CACHED" "${log_file}" 2>/dev/null || true -} - -check_required_patterns() { - local log_file=$1 - local patterns=${2:-} - local pattern - - if [[ -z "${patterns}" ]]; then - return 0 - fi - - IFS='|' read -r -a pattern_array <<< "${patterns}" - for pattern in "${pattern_array[@]}"; do - if ! grep -Fq "${pattern}" "${log_file}"; then - return 1 - fi - done - - return 0 -} - -record_result() { - local scenario=$1 - local mode=$2 - local expected=$3 - local observed=$4 - local pass=$5 - local total_duration=$6 - local build_duration=$7 - local cached_lines=$8 - local notes=$9 - - printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \ - "${scenario}" "${mode}" "${expected}" "${observed}" "${pass}" \ - "${total_duration}" "${build_duration}" "${cached_lines}" "${notes}" >> "${SUMMARY_TSV}" -} - -write_summary_md() { - { - echo "# Fast Deploy Cache Test Summary" - echo - echo "- Cluster: \`${CLUSTER_NAME}\`" - echo "- Report dir: \`${REPORT_DIR}\`" - echo - echo "## How to read this report" - echo - echo "Each row is an independent scenario that validates one aspect of the fast-deploy" - echo "change-detection and caching system." 
- echo - echo "**Columns:**" - echo - echo "- **Scenario** -- Name of the test. Suffixes like \`:cold\`/\`:warm\` indicate cache comparison phases." - echo "- **Mode** -- \`auto\` = change detection decides what to build; \`explicit\` = target forced via CLI arg; \`cache\` = cold vs warm timing comparison." - echo "- **Expected / Observed** -- The deploy plan the test expected vs what actually ran. Format: \`build gateway=N, build supervisor=N, helm upgrade=N\` where 1 = triggered, 0 = skipped." - echo "- **Pass** -- \`PASS\` = observed matched expected; \`FAIL\` = mismatch; \`INFO\` = informational baseline (no assertion)." - echo "- **Total (s)** -- Wall-clock time for the entire deploy including Docker context transfer, image push, and Helm rollout." - echo "- **Builds (s)** -- Time spent in cargo compilation and Docker image builds only. The gap between Total and Builds is deploy overhead (image push, Helm upgrade, pod rollout)." - echo "- **Cached lines** -- Number of Docker build steps that hit \`CACHED\`. Higher = more layer reuse. In cache scenarios, warm runs should show more cached lines than cold runs." - echo "- **Notes** -- Context about what the scenario tests or why it passed/failed." - echo - echo "**What to look for:**" - echo - echo "- All \`auto\` and \`explicit\` scenarios should be \`PASS\`. A \`FAIL\` means change detection routed incorrectly." - echo "- In \`cache\` scenarios, the \`:warm\` row should show either >= 30% faster build time or more cached lines than the \`:cold\` row." - echo "- The \`:cold\` rows are marked \`INFO\` -- they are baselines, not assertions." - echo "- If supervisor warm builds are slow, check the logs for full recompilation (many \`Compiling\` lines) vs cache hits (only workspace crates recompiling)." 
- echo - echo "## Results" - echo - echo "| Scenario | Mode | Expected | Observed | Pass | Total (s) | Builds (s) | Cached lines | Notes |" - echo "|---|---|---|---|---|---:|---:|---:|---|" - awk -F '\t' 'NR > 1 {printf "| %s | %s | `%s` | `%s` | %s | %s | %s | %s | %s |\n", $1, $2, $3, $4, $5, $6, $7, $8, $9}' "${SUMMARY_TSV}" - } > "${SUMMARY_MD}" -} - -run_fast_deploy() { - local worktree=$1 - local state_file=$2 - local log_file=$3 - shift 3 - - local start end - start=$(date +%s) - ( - cd "${worktree}" - env \ - BUILDKIT_PROGRESS=plain \ - CLUSTER_NAME="${CLUSTER_NAME}" \ - DEPLOY_FAST_STATE_FILE="${state_file}" \ - DOCKER_BUILD_CACHE_DIR="${CACHE_DIR}" \ - "$@" \ - mise run cluster - ) >"${log_file}" 2>&1 || true - end=$(date +%s) - printf '%s\n' $((end - start)) -} - -run_fast_deploy_args() { - local worktree=$1 - local state_file=$2 - local log_file=$3 - shift 3 - - local start end - start=$(date +%s) - ( - cd "${worktree}" - env \ - BUILDKIT_PROGRESS=plain \ - CLUSTER_NAME="${CLUSTER_NAME}" \ - DEPLOY_FAST_STATE_FILE="${state_file}" \ - DOCKER_BUILD_CACHE_DIR="${CACHE_DIR}" \ - mise run cluster -- "$@" - ) >"${log_file}" 2>&1 || true - end=$(date +%s) - printf '%s\n' $((end - start)) -} - -validate_plan() { - local log_file=$1 - local expected_gateway=$2 - local expected_supervisor=$3 - local expected_helm=$4 - - local gateway supervisor helm - gateway=$(extract_plan_value "${log_file}" "build gateway") - supervisor=$(extract_plan_value "${log_file}" "build supervisor") - helm=$(extract_plan_value "${log_file}" "helm upgrade") - - if [[ "${gateway}" == "${expected_gateway}" && "${supervisor}" == "${expected_supervisor}" && "${helm}" == "${expected_helm}" ]]; then - printf '%s\n' "build gateway=${gateway}, build supervisor=${supervisor}, helm upgrade=${helm}" - return 0 - fi - - printf '%s\n' "build gateway=${gateway:-missing}, build supervisor=${supervisor:-missing}, helm upgrade=${helm:-missing}" - return 1 -} - -clear_cache() { - rm -rf "${CACHE_DIR}" - 
mkdir -p "${CACHE_DIR}" -} - -prime_state() { - local name=$1 - local worktree state_file log_file - worktree=$(create_worktree "${name}-prime") - state_file="${STATE_DIR}/${name}.state" - log_file="${LOG_DIR}/${name}-prime.log" - run_fast_deploy "${worktree}" "${state_file}" "${log_file}" >/dev/null || true -} - -run_auto_scenario() { - local scenario=$1 - local file=$2 - local marker=$3 - local expected_gateway=$4 - local expected_supervisor=$5 - local expected_helm=$6 - local note=$7 - local required_patterns=${8:-} - - local worktree state_file log_file total_duration build_duration observed pass - worktree=$(create_worktree "${scenario}") - state_file="${STATE_DIR}/${scenario}.state" - log_file="${LOG_DIR}/${scenario}.log" - - prime_state "${scenario}" - append_marker "${worktree}/${file}" "${marker}" - - total_duration=$(run_fast_deploy "${worktree}" "${state_file}" "${log_file}") - build_duration=$(extract_duration "${log_file}" "Builds") - if observed=$(validate_plan "${log_file}" "${expected_gateway}" "${expected_supervisor}" "${expected_helm}"); then - pass=PASS - else - pass=FAIL - fi - - if [[ "${pass}" == "PASS" ]] && ! 
check_required_patterns "${log_file}" "${required_patterns}"; then - pass=FAIL - note="${note}; missing expected deploy log pattern" - fi - - record_result \ - "${scenario}" \ - "auto" \ - "build gateway=${expected_gateway}, build supervisor=${expected_supervisor}, helm upgrade=${expected_helm}" \ - "${observed}" \ - "${pass}" \ - "${total_duration}" \ - "${build_duration:-n/a}" \ - "$(count_cached_lines "${log_file}")" \ - "${note}" -} - -run_noop_scenario() { - local scenario=noop - local worktree state_file log_file total_duration build_duration observed pass notes - worktree=$(create_worktree "${scenario}") - state_file="${STATE_DIR}/${scenario}.state" - log_file="${LOG_DIR}/${scenario}.log" - - prime_state "${scenario}" - - total_duration=$(run_fast_deploy "${worktree}" "${state_file}" "${log_file}") - build_duration=$(extract_duration "${log_file}" "Builds") - if observed=$(validate_plan "${log_file}" 0 0 0); then - pass=PASS - else - pass=FAIL - fi - notes="clean tree should print no-op plan" - - if ! grep -q "No new local changes since last deploy." 
"${log_file}"; then - pass=FAIL - notes="missing no-op message" - fi - - record_result \ - "${scenario}" \ - "auto" \ - "build gateway=0, build supervisor=0, helm upgrade=0" \ - "${observed}" \ - "${pass}" \ - "${total_duration}" \ - "${build_duration:-n/a}" \ - "$(count_cached_lines "${log_file}")" \ - "${notes}" -} - -run_explicit_targets_scenario() { - local scenario=explicit-targets - local target worktree state_file log_file total_duration build_duration observed pass expected notes - - for target in gateway supervisor chart all; do - worktree=$(create_worktree "${scenario}-${target}") - state_file="${STATE_DIR}/${scenario}-${target}.state" - log_file="${LOG_DIR}/${scenario}-${target}.log" - - total_duration=$(run_fast_deploy_args "${worktree}" "${state_file}" "${log_file}" "${target}") - build_duration=$(extract_duration "${log_file}" "Builds") - - case "${target}" in - gateway) - if observed=$(validate_plan "${log_file}" 1 0 1); then - pass=PASS - else - pass=FAIL - fi - expected="build gateway=1, build supervisor=0, helm upgrade=1" - ;; - supervisor) - if observed=$(validate_plan "${log_file}" 0 1 0); then - pass=PASS - else - pass=FAIL - fi - expected="build gateway=0, build supervisor=1, helm upgrade=0" - ;; - chart) - if observed=$(validate_plan "${log_file}" 0 0 1); then - pass=PASS - else - pass=FAIL - fi - expected="build gateway=0, build supervisor=0, helm upgrade=1" - ;; - all) - if observed=$(validate_plan "${log_file}" 1 1 1); then - pass=PASS - else - pass=FAIL - fi - expected="build gateway=1, build supervisor=1, helm upgrade=1" - ;; - esac - notes="explicit target ${target}" - record_result \ - "${scenario}:${target}" \ - "explicit" \ - "${expected}" \ - "${observed}" \ - "${pass}" \ - "${total_duration}" \ - "${build_duration:-n/a}" \ - "$(count_cached_lines "${log_file}")" \ - "${notes}" - done -} - -run_cache_scenario() { - local scenario=$1 - local file=$2 - local marker=$3 - local target=$4 - - local worktree state_file cold_log warm_log 
cold_total warm_total cold_build warm_build cold_cached warm_cached pass notes - worktree=$(create_worktree "${scenario}") - state_file="${STATE_DIR}/${scenario}.state" - cold_log="${LOG_DIR}/${scenario}-cold.log" - warm_log="${LOG_DIR}/${scenario}-warm.log" - - append_marker "${worktree}/${file}" "${marker}" - clear_cache - - cold_total=$(run_fast_deploy_args "${worktree}" "${state_file}" "${cold_log}" "${target}") - cold_build=$(extract_duration "${cold_log}" "Builds") - cold_cached=$(count_cached_lines "${cold_log}") - - warm_total=$(run_fast_deploy_args "${worktree}" "${state_file}" "${warm_log}" "${target}") - warm_build=$(extract_duration "${warm_log}" "Builds") - warm_cached=$(count_cached_lines "${warm_log}") - - pass=FAIL - notes="warm rebuild should be faster or show cache hits" - - if [[ -n "${cold_build:-}" && -n "${warm_build:-}" && "${cold_build}" =~ ^[0-9]+$ && "${warm_build}" =~ ^[0-9]+$ && "${cold_build}" -gt 0 ]]; then - if [[ "${warm_build}" -le $((cold_build * 70 / 100)) ]]; then - pass=PASS - notes="warm build improved by at least 30%" - fi - fi - - if [[ "${pass}" != "PASS" && "${warm_cached}" =~ ^[0-9]+$ && "${warm_cached}" -gt "${cold_cached:-0}" ]]; then - pass=PASS - notes="warm build showed more cache hits" - fi - - record_result \ - "${scenario}:cold" \ - "cache" \ - "first rebuild of ${target} after cache clear" \ - "total=${cold_total}s, builds=${cold_build:-n/a}s" \ - "INFO" \ - "${cold_total}" \ - "${cold_build:-n/a}" \ - "${cold_cached}" \ - "baseline cold run" - - record_result \ - "${scenario}:warm" \ - "cache" \ - "second rebuild of ${target} should reuse cache" \ - "total=${warm_total}s, builds=${warm_build:-n/a}s" \ - "${pass}" \ - "${warm_total}" \ - "${warm_build:-n/a}" \ - "${warm_cached}" \ - "${notes}" -} - -run_container_invalidation_scenario() { - local scenario=container-invalidation - local worktree state_file prime_log rerun_log total_duration build_duration observed pass container_id notes - 
worktree=$(create_worktree "${scenario}") - state_file="${STATE_DIR}/${scenario}.state" - prime_log="${LOG_DIR}/${scenario}-prime.log" - rerun_log="${LOG_DIR}/${scenario}.log" - - run_fast_deploy "${worktree}" "${state_file}" "${prime_log}" >/dev/null - container_id=$(awk -F= '/^container_id=/ {print $2; exit}' "${state_file}") - if [[ -z "${container_id}" ]]; then - echo "Error: could not determine cluster container ID from state file for invalidation scenario." >&2 - exit 1 - fi - - sed -i.bak "s|^container_id=.*$|container_id=invalidated-${container_id}|" "${state_file}" - rm -f "${state_file}.bak" - - total_duration=$(run_fast_deploy "${worktree}" "${state_file}" "${rerun_log}") - build_duration=$(extract_duration "${rerun_log}" "Builds") - if observed=$(validate_plan "${rerun_log}" 1 0 1); then - pass=PASS - else - pass=FAIL - fi - notes="mismatched container ID should invalidate gateway and helm only" - - if [[ "${pass}" == "PASS" ]] && ! check_required_patterns "${rerun_log}" "Restarting gateway to pick up updated image...|Upgrading helm release..."; then - pass=FAIL - notes="${notes}; missing expected deploy log pattern" - fi - - record_result \ - "${scenario}" \ - "auto" \ - "build gateway=1, build supervisor=0, helm upgrade=1" \ - "${observed}" \ - "${pass}" \ - "${total_duration}" \ - "${build_duration:-n/a}" \ - "$(count_cached_lines "${rerun_log}")" \ - "${notes}" -} - -printf 'scenario\tmode\texpected\tobserved\tpass\ttotal_seconds\tbuild_seconds\tcached_lines\tnotes\n' > "${SUMMARY_TSV}" - -on_exit() { - write_summary_md - echo "" - cat "${SUMMARY_MD}" - echo "" - echo "Full report written to: ${SUMMARY_MD}" - cleanup_worktrees -} -trap on_exit EXIT - -ensure_cluster - -SCENARIO_COUNT=${#SCENARIOS[@]} -SCENARIO_IDX=0 - -for scenario in "${SCENARIOS[@]}"; do - SCENARIO_IDX=$((SCENARIO_IDX + 1)) - echo "[${SCENARIO_IDX}/${SCENARIO_COUNT}] Running scenario: ${scenario}" - case "${scenario}" in - noop) - run_noop_scenario - ;; - gateway-auto) - 
run_auto_scenario \ - "gateway-auto" \ - "crates/openshell-server/src/main.rs" \ - "// fast deploy cache test: gateway-auto ${RUN_ID}" \ - 1 0 1 \ - "gateway-only source change" \ - "Pushing updated images to local registry...|Restarting gateway to pick up updated image...|Upgrading helm release..." - ;; - supervisor-auto) - run_auto_scenario \ - "supervisor-auto" \ - "crates/openshell-sandbox/src/main.rs" \ - "// fast deploy cache test: supervisor-auto ${RUN_ID}" \ - 0 1 0 \ - "supervisor-only source change" \ - "Supervisor binary updated on cluster node." - ;; - shared-auto) - run_auto_scenario \ - "shared-auto" \ - "crates/openshell-policy/src/lib.rs" \ - "// fast deploy cache test: shared-auto ${RUN_ID}" \ - 1 1 1 \ - "shared dependency change should rebuild both binaries" \ - "Restarting gateway to pick up updated image...|Supervisor binary updated on cluster node." - ;; - helm-auto) - run_auto_scenario \ - "helm-auto" \ - "deploy/helm/openshell/values.yaml" \ - "# fast deploy cache test: helm-auto ${RUN_ID}" \ - 0 0 1 \ - "chart-only change" \ - "Upgrading helm release..." - ;; - unrelated-auto) - run_auto_scenario \ - "unrelated-auto" \ - "README.md" \ - "" \ - 0 0 0 \ - "unrelated file should stay a no-op" \ - "No new local changes since last deploy." 
- ;; - explicit-targets) - run_explicit_targets_scenario - ;; - gateway-cache) - run_cache_scenario \ - "gateway-cache" \ - "crates/openshell-server/src/main.rs" \ - "// fast deploy cache test: gateway-cache ${RUN_ID}" \ - "gateway" - ;; - supervisor-cache) - run_cache_scenario \ - "supervisor-cache" \ - "crates/openshell-sandbox/src/main.rs" \ - "// fast deploy cache test: supervisor-cache ${RUN_ID}" \ - "supervisor" - ;; - container-invalidation) - run_container_invalidation_scenario - ;; - *) - echo "Unknown scenario '${scenario}'" >&2 - exit 1 - ;; - esac - echo "[${SCENARIO_IDX}/${SCENARIO_COUNT}] Done: ${scenario}" -done diff --git a/scripts/docker-cleanup.sh b/scripts/docker-cleanup.sh index fc9a69809..b4dc86616 100755 --- a/scripts/docker-cleanup.sh +++ b/scripts/docker-cleanup.sh @@ -105,7 +105,7 @@ KEEP_IMAGE_PREFIXES=( should_keep_image() { local repo="$1" - # Keep current cluster images + # Keep current OpenShell images for prefix in "${CURRENT_IMAGE_PREFIXES[@]}"; do if [[ "$repo" == "$prefix"* ]]; then return 0 @@ -137,7 +137,7 @@ if [[ "$DRY_RUN" == true ]]; then echo elif [[ "$FORCE" != true ]]; then echo -e "${BOLD}This will remove stale images, unused volumes, and build cache.${RESET}" - echo "The currently deployed cluster images and running containers are preserved." + echo "The currently deployed OpenShell images and running containers are preserved." echo read -r -p "Continue? [y/N] " confirm if [[ ! "$confirm" =~ ^[Yy]$ ]]; then diff --git a/scripts/remote-deploy.sh b/scripts/remote-deploy.sh deleted file mode 100755 index 8b859b688..000000000 --- a/scripts/remote-deploy.sh +++ /dev/null @@ -1,286 +0,0 @@ -#!/usr/bin/env bash - -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -# Deploy the current checkout to a remote machine for gateway development/testing. 
-# -# The script syncs the local source tree to a remote host, bootstraps the toolchain -# there, builds the CLI and Docker images from the synced checkout, then starts or -# updates a gateway using `openshell gateway start`. - -set -euo pipefail - -usage() { - cat <<'EOF' -Usage: - ./scripts/remote-deploy.sh [options] - -Options: - --remote-dir DIR Remote checkout directory (default: openshell) - --name NAME Cluster name (default: openshell) - --port PORT Gateway port (default: 8080) - --ssh-key PATH SSH private key for ssh/rsync - --skip-sync Skip rsync and use the existing remote checkout - --recreate Destroy and recreate the gateway from scratch - --plaintext Listen on plaintext HTTP instead of mTLS - --disable-gateway-auth Keep TLS but disable client certificate enforcement - --image-tag TAG Docker image tag to build/deploy (default: dev) - --cargo-version VERSION Override OPENSHELL_CARGO_VERSION for remote Docker builds - --help Show this help - -Examples: - ./scripts/remote-deploy.sh ubuntu@devbox - ./scripts/remote-deploy.sh ubuntu@devbox --recreate --port 18080 - ./scripts/remote-deploy.sh ubuntu@devbox --plaintext --ssh-key ~/.ssh/devbox - ./scripts/remote-deploy.sh my-sandbox -./scripts/remote-deploy.sh my-sandbox --remote-dir --name openshell --port 8080 --recreate --plaintext -EOF -} - -info() { echo "==> $*"; } -err() { echo "ERROR: $*" >&2; } - -require_value() { - local flag="$1" - local value="${2-}" - if [[ -z "${value}" ]]; then - err "${flag} requires a value" - exit 1 - fi -} - -REMOTE_HOST="" -REMOTE_DIR=${REMOTE_DIR:-openshell} -CLUSTER_NAME=${CLUSTER_NAME:-openshell} -GATEWAY_PORT=${GATEWAY_PORT:-8080} -SSH_KEY="${SSH_KEY:-}" -IMAGE_TAG=${IMAGE_TAG:-dev} -CARGO_VERSION=${OPENSHELL_CARGO_VERSION:-0.0.0-dev} -SKIP_SYNC=false -RECREATE=false -PLAINTEXT=false -DISABLE_GATEWAY_AUTH=false - -while [[ $# -gt 0 ]]; do - case "$1" in - --remote-dir) - require_value "$1" "${2-}" - REMOTE_DIR="$2" - shift 2 - ;; - --name) - require_value "$1" "${2-}" - 
CLUSTER_NAME="$2" - shift 2 - ;; - --port) - require_value "$1" "${2-}" - GATEWAY_PORT="$2" - shift 2 - ;; - --ssh-key) - require_value "$1" "${2-}" - SSH_KEY="$2" - shift 2 - ;; - --skip-sync) - SKIP_SYNC=true - shift - ;; - --recreate) - RECREATE=true - shift - ;; - --plaintext) - PLAINTEXT=true - shift - ;; - --disable-gateway-auth) - DISABLE_GATEWAY_AUTH=true - shift - ;; - --image-tag) - require_value "$1" "${2-}" - IMAGE_TAG="$2" - shift 2 - ;; - --cargo-version) - require_value "$1" "${2-}" - CARGO_VERSION="$2" - shift 2 - ;; - --help|-h) - usage - exit 0 - ;; - --*) - err "Unknown argument: $1" - usage >&2 - exit 1 - ;; - *) - if [[ -n "${REMOTE_HOST}" ]]; then - err "Multiple remote hosts provided: ${REMOTE_HOST} and $1" - usage >&2 - exit 1 - fi - REMOTE_HOST="$1" - shift - ;; - esac -done - -if [[ -z "${REMOTE_HOST}" ]]; then - err "Remote host is required" - usage >&2 - exit 1 -fi - -if [[ "${PLAINTEXT}" == "true" && "${DISABLE_GATEWAY_AUTH}" == "true" ]]; then - err "--disable-gateway-auth is ignored when --plaintext is set; choose one mode" - exit 1 -fi - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" - -SSH_ARGS=() -if [[ -n "${SSH_KEY}" ]]; then - SSH_ARGS=(-i "${SSH_KEY}") -fi - -if [[ "${SKIP_SYNC}" != "true" ]]; then - info "Preparing ${REMOTE_HOST}:${REMOTE_DIR}" - ssh "${SSH_ARGS[@]}" "${REMOTE_HOST}" "mkdir -p '${REMOTE_DIR}'" - - info "Syncing source to ${REMOTE_HOST}:${REMOTE_DIR}" - RSYNC_SSH=(ssh) - if [[ -n "${SSH_KEY}" ]]; then - RSYNC_SSH+=(-i "${SSH_KEY}") - fi - - rsync -az --delete \ - -e "${RSYNC_SSH[*]}" \ - --exclude 'target/' \ - --exclude '.git/' \ - --exclude '.cache/' \ - --exclude 'node_modules/' \ - --exclude '*.pyc' \ - --exclude '__pycache__/' \ - --exclude '.venv/' \ - --exclude 'e2e/' \ - --exclude 'deploy/docker/.build/' \ - "${REPO_ROOT}/" "${REMOTE_HOST}:${REMOTE_DIR}/" - info "Sync complete" -fi - -SECURITY_MODE="mTLS enabled" -if [[ "${PLAINTEXT}" == "true" ]]; then - SECURITY_MODE="plaintext HTTP" -elif [[ "${DISABLE_GATEWAY_AUTH}" == "true" ]]; then - SECURITY_MODE="TLS enabled, client cert auth disabled" -fi - -info "Deploying gateway on ${REMOTE_HOST} (port=${GATEWAY_PORT}, security=${SECURITY_MODE})" -ssh -t "${SSH_ARGS[@]}" "${REMOTE_HOST}" \ - bash -s -- \ - "${REMOTE_DIR}" \ - "${CLUSTER_NAME}" \ - "${GATEWAY_PORT}" \ - "${IMAGE_TAG}" \ - "${CARGO_VERSION}" \ - "${RECREATE}" \ - "${PLAINTEXT}" \ - "${DISABLE_GATEWAY_AUTH}" <<'REMOTE_EOF' -set -euo pipefail - -REMOTE_DIR="$1" -CLUSTER_NAME="$2" -GATEWAY_PORT="$3" -IMAGE_TAG="$4" -CARGO_VERSION="$5" -RECREATE="$6" -PLAINTEXT="$7" -DISABLE_GATEWAY_AUTH="$8" - -cd "${REMOTE_DIR}" - -if ! command -v mise >/dev/null 2>&1; then - echo "==> Installing mise..." - curl https://mise.run | sh -fi -export PATH="$HOME/.local/bin:$PATH" - -echo "==> Installing tools via mise..." -mise trust --yes -mise install --yes - -if ! command -v podman >/dev/null 2>&1 && ! command -v docker >/dev/null 2>&1; then - echo "ERROR: Neither podman nor docker is installed on the remote host." >&2 - exit 1 -fi - -echo "==> Building openshell CLI..." 
-mise exec -- cargo build --release -p openshell-cli -mkdir -p "$HOME/.local/bin" -install -m 0755 target/release/openshell "$HOME/.local/bin/openshell" - -# Ensure `mise exec -- openshell` uses the release binary rather than the local -# development shim, which expects git metadata that is not synced to the VM. -install -m 0755 target/release/openshell scripts/bin/openshell - -# Prevent a stale repo-local .env from changing the deployment unexpectedly. -rm -f .env - -echo "==> Building container images (tag=${IMAGE_TAG})..." -export OPENSHELL_CARGO_VERSION="${CARGO_VERSION}" -export IMAGE_TAG -mise exec -- tasks/scripts/docker-build-image.sh cluster -mise exec -- tasks/scripts/docker-build-image.sh gateway - -export OPENSHELL_CLUSTER_IMAGE="openshell/cluster:${IMAGE_TAG}" -export OPENSHELL_PUSH_IMAGES="openshell/gateway:${IMAGE_TAG}" - -start_args=( - gateway - start - --name "${CLUSTER_NAME}" - --port "${GATEWAY_PORT}" -) - -if [[ "${RECREATE}" == "true" ]]; then - start_args+=(--recreate) -fi -if [[ "${PLAINTEXT}" == "true" ]]; then - start_args+=(--plaintext) -fi -if [[ "${DISABLE_GATEWAY_AUTH}" == "true" ]]; then - start_args+=(--disable-gateway-auth) -fi - -echo "==> Starting gateway..." -mise exec -- openshell "${start_args[@]}" - -echo "" -echo "============================================" -echo " Gateway deployed successfully" -echo " Cluster: ${CLUSTER_NAME}" -echo " Gateway port: ${GATEWAY_PORT}" -if [[ "${PLAINTEXT}" == "true" ]]; then - echo " Security: plaintext HTTP" -elif [[ "${DISABLE_GATEWAY_AUTH}" == "true" ]]; then - echo " Security: TLS enabled, client cert auth disabled" -else - echo " Security: mTLS enabled" -fi -echo "============================================" -REMOTE_EOF - -PROTO="https" -if [[ "${PLAINTEXT}" == "true" ]]; then - PROTO="http" -fi - -info "Done. 
Gateway is running on ${REMOTE_HOST}:${GATEWAY_PORT}" -info "Health check:" -info " curl ${PROTO}://${REMOTE_HOST}:${GATEWAY_PORT}/health" diff --git a/scripts/smoke-test-network-policy.sh b/scripts/smoke-test-network-policy.sh index ee5dbfdc7..3e82980aa 100755 --- a/scripts/smoke-test-network-policy.sh +++ b/scripts/smoke-test-network-policy.sh @@ -112,7 +112,7 @@ fi echo " Token is set" if ! openshell status >/dev/null 2>&1; then - echo -e "${RED}Error: No healthy gateway. Run: openshell gateway start${RESET}" + echo -e "${RED}Error: No healthy gateway. Run: mise run gateway:docker${RESET}" exit 1 fi echo " Gateway is healthy" diff --git a/tasks/cluster.toml b/tasks/cluster.toml deleted file mode 100644 index 248e61119..000000000 --- a/tasks/cluster.toml +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -# Cluster bootstrap and deploy tasks - -[cluster] -description = "Bootstrap or incremental deploy (creates cluster if needed, rebuilds changed components)" -run = "tasks/scripts/cluster.sh" - -["cluster:build:full"] -description = "Build and deploy local k3s cluster with OpenShell" -depends = [ - "build:docker:gateway", -] -run = "tasks/scripts/cluster-bootstrap.sh build" -hide = true - -["cluster:deploy"] -description = "Alias for cluster (incremental deploy)" -run = "tasks/scripts/cluster.sh" -hide = true - -["cluster:deploy:supervisor"] -description = "Fast deploy supervisor binary changes (docker cp into cluster)" -run = "tasks/scripts/cluster-deploy-fast.sh supervisor" -hide = true - -["cluster:deploy:all"] -description = "Pull-mode deploy using local registry pushes" -run = "tasks/scripts/cluster-deploy-fast.sh all" -hide = true - -["cluster:stop"] -description = "Stop and remove the local cluster container" -run = "tasks/scripts/cluster-stop.sh" - -["cluster:push:gateway"] -description = "Tag and push gateway image to pull 
registry" -run = "tasks/scripts/cluster-push-component.sh gateway" -hide = true diff --git a/tasks/docker.toml b/tasks/docker.toml index 78796d868..a58fdcf86 100644 --- a/tasks/docker.toml +++ b/tasks/docker.toml @@ -7,7 +7,6 @@ description = "Build all Docker images" depends = [ "build:docker:gateway", - "build:docker:cluster", "build:docker:supervisor", ] hide = true @@ -32,11 +31,6 @@ description = "Build the supervisor image (FROM scratch, binary only)" run = "tasks/scripts/docker-build-image.sh supervisor" hide = true -["build:docker:cluster"] -description = "Build the k3s cluster image (component images pulled at runtime from registry)" -run = "tasks/scripts/docker-build-image.sh cluster" -hide = true - ["docker:build:gateway"] description = "Alias for build:docker:gateway" depends = ["build:docker:gateway"] @@ -47,23 +41,18 @@ description = "Alias for build:docker:supervisor" depends = ["build:docker:supervisor"] hide = true -["docker:build:cluster"] -description = "Alias for build:docker:cluster" -depends = ["build:docker:cluster"] -hide = true - -["build:docker:cluster:multiarch"] -description = "Build multi-arch cluster image and push to a registry" +["build:docker:multiarch"] +description = "Build multi-arch gateway and supervisor images and push to a registry" run = "tasks/scripts/docker-publish-multiarch.sh" hide = true -["docker:build:cluster:multiarch"] -description = "Alias for build:docker:cluster:multiarch" -depends = ["build:docker:cluster:multiarch"] +["docker:build:multiarch"] +description = "Alias for build:docker:multiarch" +depends = ["build:docker:multiarch"] hide = true ["docker:cleanup"] -description = "Remove stale images, volumes, and build cache not used by the current cluster" +description = "Remove stale images, volumes, and build cache not used by current deployments" run = "scripts/docker-cleanup.sh --force" ["docker:cleanup:dry-run"] diff --git a/tasks/sandbox.toml b/tasks/sandbox.toml index b58f55587..23fb0f6e4 100644 --- 
a/tasks/sandbox.toml +++ b/tasks/sandbox.toml @@ -4,7 +4,7 @@ # Sandbox tasks [sandbox] -description = "Create or reconnect to the dev sandbox (redeploys cluster if dirty)" +description = "Create or reconnect to the dev sandbox using the active gateway" raw = true usage = """ arg "[command]" var=#true help="Command to run in the sandbox (default: claude)" diff --git a/tasks/scripts/cluster-bootstrap.sh b/tasks/scripts/cluster-bootstrap.sh deleted file mode 100755 index 7f4fcf175..000000000 --- a/tasks/scripts/cluster-bootstrap.sh +++ /dev/null @@ -1,296 +0,0 @@ -#!/usr/bin/env bash - -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "${SCRIPT_DIR}/container-engine.sh" - -# Normalize cluster name: lowercase, replace invalid chars with hyphens -normalize_name() { - echo "$1" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9-]/-/g' | sed 's/--*/-/g' | sed 's/^-//;s/-$//' -} - -MODE=${1:-build} -if [ "${MODE}" != "build" ] && [ "${MODE}" != "fast" ]; then - echo "usage: $0 [build|fast]" >&2 - exit 1 -fi - -if [ -n "${IMAGE_TAG:-}" ]; then - IMAGE_TAG=${IMAGE_TAG} -else - IMAGE_TAG=dev -fi -ENV_FILE=.env -PUBLISHED_IMAGE_REPO_BASE_DEFAULT=ghcr.io/nvidia/openshell -LOCAL_REGISTRY_CONTAINER=openshell-local-registry -LOCAL_REGISTRY_ADDR=127.0.0.1:5000 - -if [ -n "${CI:-}" ] && [ -n "${CI_REGISTRY_IMAGE:-}" ]; then - IMAGE_REPO_BASE_DEFAULT=${CI_REGISTRY_IMAGE} -elif [ "${MODE}" = "fast" ]; then - IMAGE_REPO_BASE_DEFAULT=${LOCAL_REGISTRY_ADDR}/openshell -else - IMAGE_REPO_BASE_DEFAULT=${LOCAL_REGISTRY_ADDR}/openshell -fi - -IMAGE_REPO_BASE=${IMAGE_REPO_BASE:-${OPENSHELL_REGISTRY:-${IMAGE_REPO_BASE_DEFAULT}}} -REGISTRY_HOST=${OPENSHELL_REGISTRY_HOST:-${IMAGE_REPO_BASE%%/*}} -REGISTRY_NAMESPACE_DEFAULT=${IMAGE_REPO_BASE#*/} - -if [ "${REGISTRY_NAMESPACE_DEFAULT}" = "${IMAGE_REPO_BASE}" ]; then - 
REGISTRY_NAMESPACE_DEFAULT=openshell -fi - -has_env_key() { - local key=$1 - [ -f "${ENV_FILE}" ] || return 1 - grep -Eq "^[[:space:]]*(export[[:space:]]+)?${key}=" "${ENV_FILE}" -} - -append_env_if_missing() { - local key=$1 - local value=$2 - if has_env_key "${key}"; then - return - fi - if [ -f "${ENV_FILE}" ] && [ -s "${ENV_FILE}" ]; then - # Ensure file ends with newline before appending, but don't add extra blank line - if [ "$(tail -c1 "${ENV_FILE}" | wc -l)" -eq 0 ]; then - printf "\n" >>"${ENV_FILE}" - fi - fi - printf "%s=%s\n" "${key}" "${value}" >>"${ENV_FILE}" -} - -port_is_in_use() { - local port=$1 - if command -v lsof >/dev/null 2>&1; then - lsof -nP -iTCP:"${port}" -sTCP:LISTEN >/dev/null 2>&1 - return $? - fi - - if command -v nc >/dev/null 2>&1; then - nc -z 127.0.0.1 "${port}" >/dev/null 2>&1 - return $? - fi - - (echo >/dev/tcp/127.0.0.1/"${port}") >/dev/null 2>&1 -} - -pick_random_port() { - local lower=20000 - local upper=60999 - local attempts=256 - local port - - for _ in $(seq 1 "${attempts}"); do - port=$((RANDOM % (upper - lower + 1) + lower)) - if ! port_is_in_use "${port}"; then - echo "${port}" - return 0 - fi - done - - echo "Error: could not find a free port after ${attempts} attempts." 
>&2 - return 1 -} - -CLUSTER_NAME=${CLUSTER_NAME:-$(basename "$PWD")} -CLUSTER_NAME=$(normalize_name "${CLUSTER_NAME}") - -if [ -n "${GATEWAY_PORT:-}" ]; then - RESOLVED_GATEWAY_PORT=${GATEWAY_PORT} -elif [ "${MODE}" = "fast" ]; then - RESOLVED_GATEWAY_PORT=$(pick_random_port) -else - RESOLVED_GATEWAY_PORT=8080 -fi - -OPENSHELL_GATEWAY=${OPENSHELL_GATEWAY:-${CLUSTER_NAME}} -GATEWAY_PORT=${RESOLVED_GATEWAY_PORT} - -append_env_if_missing "GATEWAY_PORT" "${GATEWAY_PORT}" -append_env_if_missing "OPENSHELL_GATEWAY" "${OPENSHELL_GATEWAY}" - -export CLUSTER_NAME -export GATEWAY_PORT -export OPENSHELL_GATEWAY - -is_local_registry_host() { - [ "${REGISTRY_HOST}" = "127.0.0.1:5000" ] || [ "${REGISTRY_HOST}" = "localhost:5000" ] -} - -registry_reachable() { - curl -4 -fsS --max-time 2 "http://127.0.0.1:5000/v2/" >/dev/null 2>&1 || \ - curl -4 -fsS --max-time 2 "http://localhost:5000/v2/" >/dev/null 2>&1 -} - -wait_for_registry_ready() { - local attempts=${1:-20} - local delay_s=${2:-1} - local i - - for i in $(seq 1 "${attempts}"); do - if registry_reachable; then - return 0 - fi - sleep "${delay_s}" - done - - return 1 -} - -ensure_local_registry() { - if ce inspect "${LOCAL_REGISTRY_CONTAINER}" >/dev/null 2>&1; then - local proxy_remote_url - proxy_remote_url=$(ce inspect "${LOCAL_REGISTRY_CONTAINER}" --format '{{range .Config.Env}}{{println .}}{{end}}' 2>/dev/null | awk -F= '/^REGISTRY_PROXY_REMOTEURL=/{print $2; exit}' || true) - if [ -n "${proxy_remote_url}" ]; then - ce rm -f "${LOCAL_REGISTRY_CONTAINER}" >/dev/null 2>&1 || true - fi - fi - - if ! ce inspect "${LOCAL_REGISTRY_CONTAINER}" >/dev/null 2>&1; then - # --restart=always: on Docker this is managed by the Docker daemon and - # survives reboots when the daemon is enabled at boot. On rootless Podman - # it is handled by conmon within the current login session; it does NOT - # survive user logout or reboot without a systemd user unit. 
This is - # acceptable here because ensure_local_registry() recreates the container - # on every cluster bootstrap invocation if it is missing. - ce run -d --restart=always --name "${LOCAL_REGISTRY_CONTAINER}" -p 5000:5000 registry:2 >/dev/null - else - if ! ce ps --filter "name=^${LOCAL_REGISTRY_CONTAINER}$" --filter "status=running" -q | grep -q .; then - ce start "${LOCAL_REGISTRY_CONTAINER}" >/dev/null - fi - - port_map=$(ce port "${LOCAL_REGISTRY_CONTAINER}" 5000/tcp 2>/dev/null || true) - case "${port_map}" in - *:5000*) - ;; - *) - ce rm -f "${LOCAL_REGISTRY_CONTAINER}" >/dev/null 2>&1 || true - # See --restart=always note above in this function. - ce run -d --restart=always --name "${LOCAL_REGISTRY_CONTAINER}" -p 5000:5000 registry:2 >/dev/null - ;; - esac - fi - - if wait_for_registry_ready 20 1; then - return - fi - - if registry_reachable; then - return - fi - - echo "Error: local registry is not reachable at ${REGISTRY_HOST}." >&2 - echo " Ensure a registry is running on port 5000 (e.g. ${CONTAINER_ENGINE} run -d --name openshell-local-registry -p 5000:5000 registry:2)." 
>&2 - ce ps -a >&2 || true - ce logs "${LOCAL_REGISTRY_CONTAINER}" >&2 || true - exit 1 -} - -REGISTRY_ENDPOINT_DEFAULT=${REGISTRY_HOST} -if is_local_registry_host; then - REGISTRY_ENDPOINT_DEFAULT=host.docker.internal:5000 -fi - -REGISTRY_INSECURE_DEFAULT=false -if is_local_registry_host; then - REGISTRY_INSECURE_DEFAULT=true -fi - -export OPENSHELL_REGISTRY_HOST=${OPENSHELL_REGISTRY_HOST:-${REGISTRY_HOST}} -export OPENSHELL_REGISTRY_ENDPOINT=${OPENSHELL_REGISTRY_ENDPOINT:-${REGISTRY_ENDPOINT_DEFAULT}} -export OPENSHELL_REGISTRY_NAMESPACE=${OPENSHELL_REGISTRY_NAMESPACE:-${REGISTRY_NAMESPACE_DEFAULT}} -export OPENSHELL_REGISTRY_INSECURE=${OPENSHELL_REGISTRY_INSECURE:-${REGISTRY_INSECURE_DEFAULT}} -export IMAGE_REPO_BASE -export IMAGE_TAG - -if [ -n "${CI:-}" ] && [ -n "${CI_REGISTRY:-}" ] && [ -n "${CI_REGISTRY_USER:-}" ] && [ -n "${CI_REGISTRY_PASSWORD:-}" ]; then - printf '%s' "${CI_REGISTRY_PASSWORD}" | ce login -u "${CI_REGISTRY_USER}" --password-stdin "${CI_REGISTRY}" - export OPENSHELL_REGISTRY_USERNAME=${OPENSHELL_REGISTRY_USERNAME:-${CI_REGISTRY_USER}} - export OPENSHELL_REGISTRY_PASSWORD=${OPENSHELL_REGISTRY_PASSWORD:-${CI_REGISTRY_PASSWORD}} -fi - -if is_local_registry_host; then - ensure_local_registry -fi - -CONTAINER_NAME="openshell-cluster-${CLUSTER_NAME}" -VOLUME_NAME="openshell-cluster-${CLUSTER_NAME}" - -if [ "${MODE}" = "fast" ]; then - if ce inspect "${CONTAINER_NAME}" >/dev/null 2>&1 || ce volume inspect "${VOLUME_NAME}" >/dev/null 2>&1; then - echo "Recreating cluster '${CLUSTER_NAME}' from scratch..." - openshell gateway destroy --name "${CLUSTER_NAME}" - fi -fi - -if [ "${SKIP_IMAGE_PUSH:-}" = "1" ]; then - echo "Skipping image push (SKIP_IMAGE_PUSH=1; images already in registry)." -elif [ "${MODE}" = "build" ] || [ "${MODE}" = "fast" ]; then - tasks/scripts/cluster-push-component.sh gateway -fi - -# Build the cluster image so it contains the latest Helm chart, manifests, -# and entrypoint from the working tree. 
This ensures the k3s container -# always starts with the correct chart version. -if [ "${SKIP_CLUSTER_IMAGE_BUILD:-}" != "1" ]; then - tasks/scripts/docker-build-image.sh cluster -fi - -# In fast/build modes, use the locally-built cluster image rather than the -# remote distribution registry image. The local image is built by -# `docker-build-image.sh cluster` and contains the bundled Helm chart and -# manifests from the current working tree. -if [ -z "${OPENSHELL_CLUSTER_IMAGE:-}" ]; then - export OPENSHELL_CLUSTER_IMAGE="openshell/cluster:${IMAGE_TAG}" -fi - -DEPLOY_CMD=(openshell gateway start --name "${CLUSTER_NAME}" --port "${GATEWAY_PORT}") - -if [ "${CLUSTER_GPU:-0}" = "1" ]; then - DEPLOY_CMD+=(--gpu) -fi - -if [ -n "${GATEWAY_HOST:-}" ]; then - DEPLOY_CMD+=(--gateway-host "${GATEWAY_HOST}") - - # Ensure the gateway host resolves from the current environment. - # On Linux CI runners host.docker.internal is not set automatically - # (it's a Docker Desktop feature). If the hostname doesn't resolve, - # add it via the Docker bridge gateway IP. - if ! 
getent hosts "${GATEWAY_HOST}" >/dev/null 2>&1; then - BRIDGE_IP=$(ce_network_gateway) - if [ -n "${BRIDGE_IP}" ]; then - echo "Adding /etc/hosts entry: ${BRIDGE_IP} ${GATEWAY_HOST}" - echo "${BRIDGE_IP} ${GATEWAY_HOST}" >> /etc/hosts - fi - fi -fi - -if [ -n "${OPENSHELL_OIDC_ISSUER:-}" ]; then - DEPLOY_CMD+=(--oidc-issuer "${OPENSHELL_OIDC_ISSUER}") - [ -n "${OPENSHELL_OIDC_AUDIENCE:-}" ] && DEPLOY_CMD+=(--oidc-audience "${OPENSHELL_OIDC_AUDIENCE}") - [ -n "${OPENSHELL_OIDC_ROLES_CLAIM:-}" ] && DEPLOY_CMD+=(--oidc-roles-claim "${OPENSHELL_OIDC_ROLES_CLAIM}") - [ -n "${OPENSHELL_OIDC_ADMIN_ROLE:-}" ] && DEPLOY_CMD+=(--oidc-admin-role "${OPENSHELL_OIDC_ADMIN_ROLE}") - [ -n "${OPENSHELL_OIDC_USER_ROLE:-}" ] && DEPLOY_CMD+=(--oidc-user-role "${OPENSHELL_OIDC_USER_ROLE}") - [ -n "${OPENSHELL_OIDC_SCOPES_CLAIM:-}" ] && DEPLOY_CMD+=(--oidc-scopes-claim "${OPENSHELL_OIDC_SCOPES_CLAIM}") - [ -n "${OPENSHELL_OIDC_SCOPES:-}" ] && DEPLOY_CMD+=(--oidc-scopes "${OPENSHELL_OIDC_SCOPES}") -fi - -"${DEPLOY_CMD[@]}" - -# Clear the fast-deploy state file so the next incremental deploy -# recalculates from scratch. This prevents stale fingerprints from a -# prior session from masking changes that the bootstrap has already baked -# into the freshly pushed images. -DEPLOY_FAST_STATE_FILE=${DEPLOY_FAST_STATE_FILE:-.cache/cluster-deploy-fast.state} -rm -f "${DEPLOY_FAST_STATE_FILE}" - -echo "" -echo "Cluster '${CLUSTER_NAME}' is ready." diff --git a/tasks/scripts/cluster-deploy-fast.sh b/tasks/scripts/cluster-deploy-fast.sh deleted file mode 100755 index da3e6494f..000000000 --- a/tasks/scripts/cluster-deploy-fast.sh +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env bash - -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "${SCRIPT_DIR}/container-engine.sh" - -# Normalize cluster name: lowercase, replace invalid chars with hyphens -normalize_name() { - echo "$1" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9-]/-/g' | sed 's/--*/-/g' | sed 's/^-//;s/-$//' -} - -CLUSTER_NAME=${CLUSTER_NAME:-$(basename "$PWD")} -CLUSTER_NAME=$(normalize_name "${CLUSTER_NAME}") -CONTAINER_NAME="openshell-cluster-${CLUSTER_NAME}" -IMAGE_REPO_BASE=${IMAGE_REPO_BASE:-${OPENSHELL_REGISTRY:-127.0.0.1:5000/openshell}} -IMAGE_TAG=${IMAGE_TAG:-dev} -RUST_BUILD_PROFILE=${RUST_BUILD_PROFILE:-debug} -DEPLOY_FAST_MODE=${DEPLOY_FAST_MODE:-auto} -FORCE_HELM_UPGRADE=${FORCE_HELM_UPGRADE:-0} -DEPLOY_FAST_HELM_WAIT=${DEPLOY_FAST_HELM_WAIT:-0} -DEPLOY_FAST_STATE_FILE=${DEPLOY_FAST_STATE_FILE:-.cache/cluster-deploy-fast.state} - -overall_start=$(date +%s) - -log_duration() { - local label=$1 - local start=$2 - local end=$3 - echo "${label} took $((end - start))s" -} - -if ! ce ps -q --filter "name=^${CONTAINER_NAME}$" --filter "health=healthy" | grep -q .; then - echo "Error: Cluster container '${CONTAINER_NAME}' is not running or not healthy." - echo "Start the cluster first with: mise run cluster" - exit 1 -fi - -# Run a command inside the cluster container with KUBECONFIG pre-configured. -cluster_exec() { - ce exec "${CONTAINER_NAME}" sh -c "KUBECONFIG=/etc/rancher/k3s/k3s.yaml $*" -} - -# Path inside the container where the chart is copied for helm upgrades. 
-CONTAINER_CHART_DIR=/tmp/openshell-chart - -build_gateway=0 -build_supervisor=0 -needs_helm_upgrade=0 -explicit_target=0 - -previous_gateway_fingerprint="" -previous_supervisor_fingerprint="" -previous_helm_fingerprint="" -current_gateway_fingerprint="" -current_supervisor_fingerprint="" -current_helm_fingerprint="" - -if [[ "$#" -gt 0 ]]; then - explicit_target=1 - build_gateway=0 - build_supervisor=0 - needs_helm_upgrade=0 - - for target in "$@"; do - case "${target}" in - gateway) - build_gateway=1 - ;; - supervisor|sandbox) - build_supervisor=1 - ;; - chart|helm) - needs_helm_upgrade=1 - ;; - all) - build_gateway=1 - build_supervisor=1 - needs_helm_upgrade=1 - ;; - *) - echo "Unknown target '${target}'. Use gateway, supervisor, chart, or all." - exit 1 - ;; - esac - done -fi - -declare -a changed_files=() -detect_start=$(date +%s) -mapfile -t changed_files < <( - { - git diff --name-only - git diff --name-only --cached - git ls-files --others --exclude-standard - } | sort -u -) -detect_end=$(date +%s) -log_duration "Change detection" "${detect_start}" "${detect_end}" - -# Track the cluster container ID so we can detect when the cluster was -# recreated (e.g. via bootstrap). A new container means the k3s state is -# fresh and all images must be rebuilt and pushed regardless of source -# fingerprints. 
-current_container_id=$(ce inspect --format '{{.Id}}' "${CONTAINER_NAME}" 2>/dev/null || true) - -if [[ -f "${DEPLOY_FAST_STATE_FILE}" ]]; then - while IFS='=' read -r key value; do - case "${key}" in - cluster_name) - previous_cluster_name=${value} - ;; - container_id) - previous_container_id=${value} - ;; - gateway) - previous_gateway_fingerprint=${value} - ;; - supervisor) - previous_supervisor_fingerprint=${value} - ;; - helm) - previous_helm_fingerprint=${value} - ;; - esac - done < "${DEPLOY_FAST_STATE_FILE}" - - if [[ "${previous_cluster_name:-}" != "${CLUSTER_NAME}" ]]; then - previous_gateway_fingerprint="" - previous_supervisor_fingerprint="" - previous_helm_fingerprint="" - fi - - # Invalidate gateway and helm fingerprints when the cluster container has - # changed (recreated or replaced). The new k3s instance has no pushed - # images so the gateway must be rebuilt and helm must be re-applied. - # The supervisor is NOT invalidated here because it is already built into - # the cluster image — a fresh cluster already has the correct supervisor - # binary, so rebuilding it would be redundant. 
- if [[ -n "${current_container_id}" && "${current_container_id}" != "${previous_container_id:-}" ]]; then - previous_gateway_fingerprint="" - previous_helm_fingerprint="" - fi -fi - -matches_gateway() { - local path=$1 - case "${path}" in - Cargo.toml|Cargo.lock|proto/*|tasks/scripts/stage-prebuilt-binaries.sh) - return 0 - ;; - deploy/docker/Dockerfile.images|tasks/scripts/docker-build-image.sh) - return 0 - ;; - crates/openshell-core/*|crates/openshell-driver-kubernetes/*|crates/openshell-ocsf/*|crates/openshell-policy/*|crates/openshell-providers/*) - return 0 - ;; - crates/openshell-router/*|crates/openshell-server/*) - return 0 - ;; - *) - return 1 - ;; - esac -} - -matches_supervisor() { - local path=$1 - case "${path}" in - Cargo.toml|Cargo.lock|proto/*|tasks/scripts/stage-prebuilt-binaries.sh) - return 0 - ;; - deploy/docker/Dockerfile.images|tasks/scripts/docker-build-image.sh) - return 0 - ;; - crates/openshell-core/*|crates/openshell-policy/*|crates/openshell-router/*) - return 0 - ;; - crates/openshell-sandbox/*) - return 0 - ;; - *) - return 1 - ;; - esac -} - -matches_helm() { - local path=$1 - case "${path}" in - deploy/helm/openshell/*) - return 0 - ;; - *) - return 1 - ;; - esac -} - -compute_fingerprint() { - local component=$1 - local payload="" - local path - local digest - - # Include the committed state of relevant source paths via git tree - # hashes. This ensures that committed changes (e.g. after `git pull` - # or amend) are detected even when there are no uncommitted edits. 
- local committed_trees="" - case "${component}" in - gateway) - committed_trees=$(git ls-tree HEAD Cargo.toml Cargo.lock proto/ deploy/docker/Dockerfile.images tasks/scripts/docker-build-image.sh tasks/scripts/stage-prebuilt-binaries.sh crates/openshell-core/ crates/openshell-driver-kubernetes/ crates/openshell-ocsf/ crates/openshell-policy/ crates/openshell-providers/ crates/openshell-router/ crates/openshell-server/ 2>/dev/null || true) - ;; - supervisor) - committed_trees=$(git ls-tree HEAD Cargo.toml Cargo.lock proto/ deploy/docker/Dockerfile.images tasks/scripts/docker-build-image.sh tasks/scripts/stage-prebuilt-binaries.sh crates/openshell-core/ crates/openshell-policy/ crates/openshell-router/ crates/openshell-sandbox/ 2>/dev/null || true) - ;; - helm) - committed_trees=$(git ls-tree HEAD deploy/helm/openshell/ 2>/dev/null || true) - ;; - esac - if [[ -n "${committed_trees}" ]]; then - payload+="${committed_trees}"$'\n' - fi - - # Layer uncommitted changes on top so dirty files trigger a rebuild too. - for path in "${changed_files[@]}"; do - case "${component}" in - gateway) - if ! matches_gateway "${path}"; then - continue - fi - ;; - supervisor) - if ! matches_supervisor "${path}"; then - continue - fi - ;; - helm) - if ! 
matches_helm "${path}"; then - continue - fi - ;; - esac - - if [[ -e "${path}" ]]; then - digest=$(shasum -a 256 "${path}" | cut -d ' ' -f 1) - else - digest="__MISSING__" - fi - payload+="${path}:${digest}"$'\n' - done - - if [[ -z "${payload}" ]]; then - printf '' - else - printf '%s' "${payload}" | shasum -a 256 | cut -d ' ' -f 1 - fi -} - -current_gateway_fingerprint=$(compute_fingerprint gateway) -current_supervisor_fingerprint=$(compute_fingerprint supervisor) -current_helm_fingerprint=$(compute_fingerprint helm) - -if [[ "${explicit_target}" == "0" && "${DEPLOY_FAST_MODE}" == "full" ]]; then - build_gateway=1 - build_supervisor=1 - needs_helm_upgrade=1 -elif [[ "${explicit_target}" == "0" ]]; then - if [[ "${current_gateway_fingerprint}" != "${previous_gateway_fingerprint}" ]]; then - build_gateway=1 - fi - if [[ "${current_supervisor_fingerprint}" != "${previous_supervisor_fingerprint}" ]]; then - build_supervisor=1 - fi - if [[ "${current_helm_fingerprint}" != "${previous_helm_fingerprint}" ]]; then - needs_helm_upgrade=1 - fi -fi - -if [[ "${FORCE_HELM_UPGRADE}" == "1" ]]; then - needs_helm_upgrade=1 -fi - -# Always run helm upgrade when the gateway image is rebuilt so that -# the image tag and pull policy are set correctly. -if [[ "${build_gateway}" == "1" ]]; then - needs_helm_upgrade=1 -fi - -echo "Fast deploy plan:" -echo " build gateway: ${build_gateway}" -echo " build supervisor: ${build_supervisor}" -echo " helm upgrade: ${needs_helm_upgrade}" - -if [[ "${explicit_target}" == "0" && "${build_gateway}" == "0" && "${build_supervisor}" == "0" && "${needs_helm_upgrade}" == "0" && "${DEPLOY_FAST_MODE}" != "full" ]]; then - echo "No new local changes since last deploy." -fi - -build_start=$(date +%s) - -# Track which components are being rebuilt for rollout decisions. 
-declare -a built_components=() - -if [[ "${build_gateway}" == "1" ]]; then - tasks/scripts/docker-build-image.sh gateway -fi - -# Build the supervisor binary and docker cp it into the running k3s cluster. -# The binary lives at /opt/openshell/bin/openshell-sandbox on the node -# filesystem and is mounted into sandbox pods via a hostPath volume. -if [[ "${build_supervisor}" == "1" ]]; then - echo "Building supervisor binary..." - supervisor_start=$(date +%s) - - # Detect the cluster container's architecture so we cross-compile correctly. - # Container objects lack an Architecture field (the Go template emits a - # stray newline before erroring), so inspect the container's *image* instead. - _cluster_image=$(ce inspect --format '{{.Config.Image}}' "${CONTAINER_NAME}" 2>/dev/null) - CLUSTER_ARCH=$(ce image inspect --format '{{.Architecture}}' "${_cluster_image}" 2>/dev/null || echo "amd64") - - # Detect the host (build) architecture in the container engine's naming convention. - HOST_ARCH=$(ce_info_arch) - # Normalize: Docker reports "aarch64" on ARM hosts but uses "arm64" elsewhere. - case "${HOST_ARCH}" in - aarch64) HOST_ARCH=arm64 ;; - x86_64) HOST_ARCH=amd64 ;; - esac - - # Stage the supervisor binary through the prebuilt path, then extract it - # via --output from the minimal Docker target. - SUPERVISOR_BUILD_DIR=$(mktemp -d) - trap 'rm -rf "${SUPERVISOR_BUILD_DIR}"' EXIT - - # Compute cargo version from git tags for the supervisor binary. - _cargo_version=${OPENSHELL_CARGO_VERSION:-} - if [[ -z "${_cargo_version}" ]]; then - _cargo_version=$(uv run python tasks/scripts/release.py get-version --cargo 2>/dev/null || true) - fi - - # Only set DOCKER_PLATFORM when the cluster architecture differs from the - # local container engine architecture. Omitting it for native builds lets - # docker-build-image.sh pick the fast default builder. 
- _platform_env=() - if [[ "${CLUSTER_ARCH}" != "${HOST_ARCH}" ]]; then - _platform_env=(DOCKER_PLATFORM="linux/${CLUSTER_ARCH}") - fi - - env \ - "${_platform_env[@]+"${_platform_env[@]}"}" \ - DOCKER_OUTPUT="type=local,dest=${SUPERVISOR_BUILD_DIR}" \ - OPENSHELL_CARGO_VERSION="${_cargo_version}" \ - tasks/scripts/docker-build-image.sh supervisor-output - - # Copy the built binary into the running k3s container - ce exec "${CONTAINER_NAME}" mkdir -p /opt/openshell/bin - ce cp "${SUPERVISOR_BUILD_DIR}/openshell-sandbox" \ - "${CONTAINER_NAME}:/opt/openshell/bin/openshell-sandbox" - ce exec "${CONTAINER_NAME}" chmod 755 /opt/openshell/bin/openshell-sandbox - - built_components+=("supervisor") - supervisor_end=$(date +%s) - log_duration "Supervisor build + deploy" "${supervisor_start}" "${supervisor_end}" -fi - -build_end=$(date +%s) -log_duration "Builds" "${build_start}" "${build_end}" - -# Push rebuilt gateway image to local registry. -declare -a pushed_images=() - -if [[ "${build_gateway}" == "1" ]]; then - ce tag "openshell/gateway:${IMAGE_TAG}" "${IMAGE_REPO_BASE}/gateway:${IMAGE_TAG}" 2>/dev/null || true - pushed_images+=("${IMAGE_REPO_BASE}/gateway:${IMAGE_TAG}") - built_components+=("gateway") -fi - -if [[ "${#pushed_images[@]}" -gt 0 ]]; then - push_start=$(date +%s) - echo "Pushing updated images to local registry..." - for image_ref in "${pushed_images[@]}"; do - ce push "${image_ref}" - done - push_end=$(date +%s) - log_duration "Image push" "${push_start}" "${push_end}" -fi - -# Evict rebuilt gateway image from k3s containerd cache so new pods pull -# the updated image from the registry. -if [[ "${build_gateway}" == "1" ]]; then - echo "Evicting stale gateway image from k3s..." - ce exec "${CONTAINER_NAME}" crictl rmi "${IMAGE_REPO_BASE}/gateway:${IMAGE_TAG}" >/dev/null 2>&1 || true -fi - -if [[ "${needs_helm_upgrade}" == "1" ]]; then - helm_start=$(date +%s) - echo "Upgrading helm release..." 
- helm_wait_args="" - if [[ "${DEPLOY_FAST_HELM_WAIT}" == "1" ]]; then - helm_wait_args="--wait" - fi - - # Copy the local chart source into the container so helm can read it. - ce exec "${CONTAINER_NAME}" rm -rf "${CONTAINER_CHART_DIR}" - ce cp deploy/helm/openshell "${CONTAINER_NAME}:${CONTAINER_CHART_DIR}" - - # grpcEndpoint must be explicitly set to https:// because the chart always - # terminates mTLS (there is no server.tls.enabled toggle). Without this, - # a prior Helm override or chart default change could silently regress - # sandbox callbacks to plaintext. - # Ensure the SSH handshake K8s secret exists. The bootstrap process normally - # creates it, but fast-deploy may run before bootstrap on a fresh cluster. - EXISTING_SECRET=$(cluster_exec "kubectl -n openshell get secret openshell-ssh-handshake -o jsonpath='{.data.secret}' 2>/dev/null | base64 -d" 2>/dev/null) || true - if [ -z "${EXISTING_SECRET}" ]; then - SSH_HANDSHAKE_SECRET="$(openssl rand -hex 32)" - cluster_exec "kubectl -n openshell create secret generic openshell-ssh-handshake --from-literal=secret='${SSH_HANDSHAKE_SECRET}' --dry-run=client -o yaml | kubectl apply -f -" - fi - - # Retrieve the host gateway IP from the entrypoint-rendered HelmChart CR so - # that hostAliases for host.openshell.internal are preserved across fast deploys. 
- HOST_GATEWAY_IP=$(cluster_exec "kubectl -n kube-system get helmchart openshell -o jsonpath='{.spec.valuesContent}' 2>/dev/null \ - | grep hostGatewayIP | awk '{print \$2}'" 2>/dev/null) || true - HOST_GATEWAY_ARGS="" - if [[ -n "${HOST_GATEWAY_IP}" ]]; then - HOST_GATEWAY_ARGS="--set server.hostGatewayIP=${HOST_GATEWAY_IP}" - fi - - OIDC_HELM_ARGS="" - if [[ -n "${OPENSHELL_OIDC_ISSUER:-}" ]]; then - OIDC_HELM_ARGS="--set server.oidc.issuer=${OPENSHELL_OIDC_ISSUER}" - OIDC_HELM_ARGS="${OIDC_HELM_ARGS} --set server.oidc.audience=${OPENSHELL_OIDC_AUDIENCE:-openshell-cli}" - if [[ -n "${OPENSHELL_OIDC_ROLES_CLAIM:-}" ]]; then - OIDC_HELM_ARGS="${OIDC_HELM_ARGS} --set server.oidc.rolesClaim=${OPENSHELL_OIDC_ROLES_CLAIM}" - fi - if [[ -n "${OPENSHELL_OIDC_ADMIN_ROLE:-}" ]]; then - OIDC_HELM_ARGS="${OIDC_HELM_ARGS} --set server.oidc.adminRole=${OPENSHELL_OIDC_ADMIN_ROLE}" - fi - if [[ -n "${OPENSHELL_OIDC_USER_ROLE:-}" ]]; then - OIDC_HELM_ARGS="${OIDC_HELM_ARGS} --set server.oidc.userRole=${OPENSHELL_OIDC_USER_ROLE}" - fi - if [[ -n "${OPENSHELL_OIDC_SCOPES_CLAIM:-}" ]]; then - OIDC_HELM_ARGS="${OIDC_HELM_ARGS} --set server.oidc.scopesClaim=${OPENSHELL_OIDC_SCOPES_CLAIM}" - fi - fi - - cluster_exec "helm upgrade openshell ${CONTAINER_CHART_DIR} \ - --namespace openshell \ - --set image.repository=${IMAGE_REPO_BASE}/gateway \ - --set image.tag=${IMAGE_TAG} \ - --set image.pullPolicy=Always \ - --set-string server.grpcEndpoint=https://openshell.openshell.svc.cluster.local:8080 \ - --set server.tls.certSecretName=openshell-server-tls \ - --set server.tls.clientCaSecretName=openshell-server-client-ca \ - --set server.tls.clientTlsSecretName=openshell-client-tls \ - ${HOST_GATEWAY_ARGS} \ - ${OIDC_HELM_ARGS} \ - ${helm_wait_args}" - helm_end=$(date +%s) - log_duration "Helm upgrade" "${helm_start}" "${helm_end}" -fi - -if [[ "${build_gateway}" == "1" ]]; then - rollout_start=$(date +%s) - echo "Restarting gateway to pick up updated image..." 
- if cluster_exec "kubectl get statefulset/openshell -n openshell" >/dev/null 2>&1; then - cluster_exec "kubectl rollout restart statefulset/openshell -n openshell" - cluster_exec "kubectl rollout status statefulset/openshell -n openshell" - elif cluster_exec "kubectl get deployment/openshell -n openshell" >/dev/null 2>&1; then - cluster_exec "kubectl rollout restart deployment/openshell -n openshell" - cluster_exec "kubectl rollout status deployment/openshell -n openshell" - else - echo "Warning: no openshell workload found to roll out in namespace 'openshell'." - fi - rollout_end=$(date +%s) - log_duration "Gateway rollout" "${rollout_start}" "${rollout_end}" -fi - -if [[ "${build_supervisor}" == "1" ]]; then - echo "Supervisor binary updated on cluster node." - echo "Existing sandbox pods will use the new binary on next restart." - echo "New sandbox pods will use the updated binary immediately (hostPath mount)." -fi - -if [[ "${explicit_target}" == "0" ]]; then - mkdir -p "$(dirname "${DEPLOY_FAST_STATE_FILE}")" - cat > "${DEPLOY_FAST_STATE_FILE}" <" >&2 - exit 1 -fi - -case "${component}" in - gateway) - ;; - *) - echo "invalid component '${component}'; expected gateway" >&2 - exit 1 - ;; -esac - -# Normalize cluster name: lowercase, replace invalid chars with hyphens -normalize_name() { - echo "$1" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9-]/-/g' | sed 's/--*/-/g' | sed 's/^-//;s/-$//' -} - -IMAGE_TAG=${IMAGE_TAG:-dev} -IMAGE_REPO_BASE=${IMAGE_REPO_BASE:-${OPENSHELL_REGISTRY:-127.0.0.1:5000/openshell}} -CLUSTER_NAME=${CLUSTER_NAME:-$(basename "$PWD")} -CLUSTER_NAME=$(normalize_name "${CLUSTER_NAME}") -CONTAINER_NAME="openshell-cluster-${CLUSTER_NAME}" -SOURCE_IMAGE="openshell/${component}:${IMAGE_TAG}" -TARGET_IMAGE="${IMAGE_REPO_BASE}/${component}:${IMAGE_TAG}" - -source_candidates=( - "openshell/${component}:${IMAGE_TAG}" - "localhost:5000/openshell/${component}:${IMAGE_TAG}" - "127.0.0.1:5000/openshell/${component}:${IMAGE_TAG}" -) - 
-resolved_source_image="" -for candidate in "${source_candidates[@]}"; do - if ce image inspect "${candidate}" >/dev/null 2>&1; then - resolved_source_image="${candidate}" - break - fi -done - -if [ -z "${resolved_source_image}" ]; then - echo "Local image not found for ${component}:${IMAGE_TAG}, building..." - tasks/scripts/docker-build-image.sh "${component}" - resolved_source_image="openshell/${component}:${IMAGE_TAG}" -fi - -ce tag "${resolved_source_image}" "${TARGET_IMAGE}" -ce push "${TARGET_IMAGE}" - -# Evict the stale image from k3s's containerd cache so new pods pull the -# updated image. Without this, k3s uses its cached copy (imagePullPolicy -# defaults to IfNotPresent for non-:latest tags) and pods run stale code. -if ce ps -q --filter "name=${CONTAINER_NAME}" | grep -q .; then - ce exec "${CONTAINER_NAME}" crictl rmi "${TARGET_IMAGE}" >/dev/null 2>&1 || true -fi diff --git a/tasks/scripts/cluster-stop.sh b/tasks/scripts/cluster-stop.sh deleted file mode 100755 index 232305fe5..000000000 --- a/tasks/scripts/cluster-stop.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash - -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -set -euo pipefail - -normalize_name() { - echo "$1" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9-]/-/g' | sed 's/--*/-/g' | sed 's/^-//;s/-$//' -} - -CLUSTER_NAME=${CLUSTER_NAME:-$(basename "$PWD")} -CLUSTER_NAME=$(normalize_name "${CLUSTER_NAME}") -CONTAINER_NAME="openshell-cluster-${CLUSTER_NAME}" - -if ! docker ps -aq --filter "name=^${CONTAINER_NAME}$" | grep -q .; then - echo "No cluster container '${CONTAINER_NAME}' found." - exit 0 -fi - -echo "Stopping cluster '${CLUSTER_NAME}'..." -docker rm -f "${CONTAINER_NAME}" >/dev/null -echo "Cluster '${CLUSTER_NAME}' stopped and removed." 
diff --git a/tasks/scripts/cluster.sh b/tasks/scripts/cluster.sh deleted file mode 100755 index 33b8b8e88..000000000 --- a/tasks/scripts/cluster.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -# Unified cluster entrypoint: bootstrap if no cluster is running, then -# incremental deploy. - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "${SCRIPT_DIR}/container-engine.sh" - -# Normalize cluster name: lowercase, replace invalid chars with hyphens -normalize_name() { - echo "$1" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9-]/-/g' | sed 's/--*/-/g' | sed 's/^-//;s/-$//' -} - -CLUSTER_NAME=${CLUSTER_NAME:-$(basename "$PWD")} -CLUSTER_NAME=$(normalize_name "${CLUSTER_NAME}") -CONTAINER_NAME="openshell-cluster-${CLUSTER_NAME}" - -if ! ce ps -q --filter "name=${CONTAINER_NAME}" | grep -q .; then - echo "No running cluster found. Bootstrapping..." - exec tasks/scripts/cluster-bootstrap.sh fast -fi - -# Container is running but not healthy — tear it down and re-bootstrap. -if ! ce ps -q --filter "name=^${CONTAINER_NAME}$" --filter "health=healthy" | grep -q .; then - echo "Cluster container '${CONTAINER_NAME}' is running but not healthy. Recreating..." 
- exec tasks/scripts/cluster-bootstrap.sh fast -fi - -exec tasks/scripts/cluster-deploy-fast.sh "$@" diff --git a/tasks/scripts/docker-build-image.sh b/tasks/scripts/docker-build-image.sh index 997a631ad..2fb86bc5e 100755 --- a/tasks/scripts/docker-build-image.sh +++ b/tasks/scripts/docker-build-image.sh @@ -43,7 +43,7 @@ required_prebuilt_binaries() { gateway) echo "openshell-gateway" ;; - supervisor|cluster|supervisor-sideload|supervisor-output) + supervisor|supervisor-sideload|supervisor-output) echo "openshell-sandbox" ;; esac @@ -90,7 +90,7 @@ ensure_prebuilt_binaries() { fi } -TARGET=${1:?"Usage: docker-build-image.sh [extra-args...]"} +TARGET=${1:?"Usage: docker-build-image.sh [extra-args...]"} shift DOCKERFILE="deploy/docker/Dockerfile.images" @@ -113,14 +113,6 @@ case "${TARGET}" in IMAGE_NAME="openshell/supervisor" DOCKER_TARGET="supervisor" ;; - cluster) - IS_FINAL_IMAGE=1 - IMAGE_NAME="openshell/cluster" - DOCKER_TARGET="cluster" - ;; - supervisor-builder) - DOCKER_TARGET="supervisor-builder" - ;; supervisor-output) # Backward-compat alias: same as "supervisor". IS_FINAL_IMAGE=1 @@ -164,17 +156,6 @@ if [[ -z "${CI:-}" ]]; then fi fi -# The cluster image embeds the packaged Helm chart. 
-if [[ "${TARGET}" == "cluster" ]]; then - mkdir -p deploy/docker/.build/charts - helm package deploy/helm/openshell -d deploy/docker/.build/charts/ >/dev/null -fi - -K3S_ARGS=() -if [[ "${TARGET}" == "cluster" && -n "${K3S_VERSION:-}" ]]; then - K3S_ARGS=(--build-arg "K3S_VERSION=${K3S_VERSION}") -fi - ensure_prebuilt_binaries "${TARGET}" TAG_ARGS=() @@ -202,7 +183,6 @@ ce_build \ ${BUILDER_ARGS[@]+"${BUILDER_ARGS[@]}"} \ ${DOCKER_PLATFORM:+--platform ${DOCKER_PLATFORM}} \ ${CACHE_ARGS[@]+"${CACHE_ARGS[@]}"} \ - ${K3S_ARGS[@]+"${K3S_ARGS[@]}"} \ -f "${DOCKERFILE}" \ --target "${DOCKER_TARGET}" \ ${TAG_ARGS[@]+"${TAG_ARGS[@]}"} \ diff --git a/tasks/scripts/docker-publish-multiarch.sh b/tasks/scripts/docker-publish-multiarch.sh index 18b7ab889..2f23c5106 100755 --- a/tasks/scripts/docker-publish-multiarch.sh +++ b/tasks/scripts/docker-publish-multiarch.sh @@ -3,7 +3,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Build multi-arch gateway + cluster images and push to a container registry. +# Build multi-arch gateway + supervisor images and push to a container registry. # Requires DOCKER_REGISTRY to be set (e.g. ghcr.io/myorg). set -euo pipefail @@ -47,8 +47,8 @@ _publish_multiarch_docker() { tasks/scripts/docker-build-image.sh gateway echo - echo "Building multi-arch cluster image..." - tasks/scripts/docker-build-image.sh cluster + echo "Building multi-arch supervisor image..." 
+ tasks/scripts/docker-build-image.sh supervisor TAGS_TO_APPLY=("${EXTRA_TAGS[@]}") if [[ "${TAG_LATEST}" == "true" ]]; then @@ -56,7 +56,7 @@ _publish_multiarch_docker() { fi if [[ ${#TAGS_TO_APPLY[@]} -gt 0 ]]; then - for component in gateway cluster; do + for component in gateway supervisor; do full_image="${REGISTRY}/${component}" for tag in "${TAGS_TO_APPLY[@]}"; do [[ "${tag}" == "${IMAGE_TAG}" ]] && continue @@ -79,7 +79,7 @@ _publish_multiarch_podman() { # Split comma-separated platforms into an array. IFS=',' read -ra PLATFORM_LIST <<< "${PLATFORMS}" - for component in gateway cluster; do + for component in gateway supervisor; do local full_image="${REGISTRY}/${component}" local manifest_name="${full_image}:${IMAGE_TAG}" @@ -145,7 +145,7 @@ fi echo echo "Done! Multi-arch images pushed to ${REGISTRY}:" echo " ${REGISTRY}/gateway:${IMAGE_TAG}" -echo " ${REGISTRY}/cluster:${IMAGE_TAG}" +echo " ${REGISTRY}/supervisor:${IMAGE_TAG}" if [[ "${TAG_LATEST}" == "true" ]]; then echo " (all also tagged :latest)" fi diff --git a/tasks/scripts/gateway-docker.sh b/tasks/scripts/gateway-docker.sh index 269933c96..c5b8d37dd 100644 --- a/tasks/scripts/gateway-docker.sh +++ b/tasks/scripts/gateway-docker.sh @@ -112,8 +112,8 @@ HOST_OS="$(uname -s)" HOST_ARCH="$(normalize_arch "$(uname -m)")" SUPERVISOR_TARGET="$(linux_target_triple "${DAEMON_ARCH}")" # Cache the supervisor binary alongside the gateway state. Reuses the same -# Docker pipeline that builds the cluster supervisor image, so the cross- -# compile happens inside Linux containers — sidestepping macOS's per-process +# Docker pipeline used for the supervisor image, so the cross-compile happens +# inside Linux containers — sidestepping macOS's per-process # file-descriptor cap that breaks zig/ld for this many rlibs. 
SUPERVISOR_OUT_DIR="${STATE_DIR}/supervisor/${DAEMON_ARCH}" SUPERVISOR_BIN="${SUPERVISOR_OUT_DIR}/openshell-sandbox" diff --git a/tasks/scripts/sandbox.sh b/tasks/scripts/sandbox.sh index 143d9bd90..264b5c6d0 100755 --- a/tasks/scripts/sandbox.sh +++ b/tasks/scripts/sandbox.sh @@ -3,70 +3,35 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Create or reconnect to the persistent "dev" sandbox. +# Create or reconnect to the persistent "dev" sandbox on the active gateway. # -# - Ensures the cluster is running (bootstraps if needed). -# - Redeploys if local source has changed since last deploy. -# - Recreates the sandbox if the cluster was redeployed since the sandbox -# was last created. -# - Provisions an "anthropic" provider from $ANTHROPIC_API_KEY when available. +# Start a gateway first with `mise run gateway:docker`, a package-managed +# openshell-gateway service, or a registered remote gateway. +# +# Provisions an "anthropic" provider from $ANTHROPIC_API_KEY when available. set -euo pipefail -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "${SCRIPT_DIR}/container-engine.sh" - SANDBOX_NAME="dev" -CLUSTER_NAME=${CLUSTER_NAME:-$(basename "$PWD")} -CONTAINER_NAME="openshell-cluster-${CLUSTER_NAME}" -STATE_DIR=${SANDBOX_STATE_DIR:-.cache} -SANDBOX_STATE_FILE=${STATE_DIR}/sandbox-dev.state -DEPLOY_STATE_FILE=${DEPLOY_FAST_STATE_FILE:-${STATE_DIR}/cluster-deploy-fast.state} -CMD=(${usage_command:-claude}) +read -r -a CMD <<< "${usage_command:-claude}" # ------------------------------------------------------------------- -# 1. Ensure the cluster is running; redeploy if dirty +# 1. Ensure a gateway is reachable # ------------------------------------------------------------------- -if ! ce ps -q --filter "name=${CONTAINER_NAME}" | grep -q .; then - echo "No running cluster found. Bootstrapping..." 
- mise run cluster -else - # Run incremental deploy — it no-ops when nothing has changed. - mise run cluster -fi - -# Capture the current deploy fingerprint so we can tell later whether the -# sandbox predates the most recent deploy. -deploy_fingerprint="" -if [[ -f "${DEPLOY_STATE_FILE}" ]]; then - deploy_fingerprint=$(shasum -a 256 "${DEPLOY_STATE_FILE}" | cut -d ' ' -f 1) +if ! openshell status >/dev/null 2>&1; then + echo "No reachable OpenShell gateway." >&2 + echo "Start one in another shell with: mise run gateway:docker" >&2 + echo "Or register/select an existing gateway with: openshell gateway add " >&2 + exit 2 fi # ------------------------------------------------------------------- -# 2. Decide whether to (re)create the sandbox +# 2. Decide whether to create the sandbox # ------------------------------------------------------------------- need_create=1 if openshell sandbox get "${SANDBOX_NAME}" >/dev/null 2>&1; then - # Sandbox exists — only recreate if the cluster has been redeployed. - # The command passed via `-- ` only affects the SSH exec session, - # not the sandbox pod itself (which always runs `sleep infinity`), so - # a command change never requires recreation. - previous_deploy_fingerprint="" - if [[ -f "${SANDBOX_STATE_FILE}" ]]; then - while IFS='=' read -r key value; do - case "${key}" in - deploy) previous_deploy_fingerprint="${value}" ;; - esac - done < "${SANDBOX_STATE_FILE}" - fi - - if [[ -n "${deploy_fingerprint}" && "${deploy_fingerprint}" == "${previous_deploy_fingerprint}" ]]; then - need_create=0 - else - echo "Cluster has been redeployed since sandbox '${SANDBOX_NAME}' was created. Recreating..." - openshell sandbox delete "${SANDBOX_NAME}" || true - fi + need_create=0 fi # ------------------------------------------------------------------- @@ -106,9 +71,3 @@ else echo "Connecting to existing sandbox '${SANDBOX_NAME}'..." 
openshell sandbox connect "${SANDBOX_NAME}" fi - -# Record state so we know this sandbox matches the current deploy. -mkdir -p "$(dirname "${SANDBOX_STATE_FILE}")" -cat > "${SANDBOX_STATE_FILE}" <" >&2 + echo "Usage: stage-prebuilt-binaries.sh " >&2 } normalize_arch() { @@ -70,7 +70,7 @@ components_for_target() { gateway) echo "gateway" ;; - sandbox|supervisor|cluster|supervisor-output) + sandbox|supervisor|supervisor-output) echo "sandbox" ;; all) diff --git a/tasks/test.toml b/tasks/test.toml index 6f5cf3a2b..bf5741c72 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -42,9 +42,9 @@ run = "e2e/with-docker-gateway.sh uv run pytest -o python_files='test_*.py' -m ' ["e2e:python:gpu"] description = "Run Python GPU e2e tests" -depends = ["python:proto", "CLUSTER_GPU=1 cluster"] -env = { UV_NO_SYNC = "1", PYTHONPATH = "python" } -run = "uv run pytest -o python_files='test_*.py' -m gpu -n ${E2E_PARALLEL:-1} e2e/python" +depends = ["python:proto"] +env = { UV_NO_SYNC = "1", PYTHONPATH = "python", OPENSHELL_E2E_DOCKER_GPU = "1" } +run = "e2e/with-docker-gateway.sh uv run pytest -o python_files='test_*.py' -m gpu -n ${E2E_PARALLEL:-1} e2e/python" ["e2e:podman"] description = "Run Rust CLI e2e tests against a Podman-backed gateway"