From 3d18becdac984e10326ea24f8c89b733f1de0cb6 Mon Sep 17 00:00:00 2001 From: michael_crosby Date: Thu, 25 Jun 2026 11:47:31 -0400 Subject: [PATCH] Add cloud-hypervisor VMM backend for Linux hosts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit apple/containerization currently runs containers in per-container VMs on macOS hosts via Virtualization.framework. This adds a second VMM backend so the same Swift orchestration layer (LinuxContainer / LinuxPod / Vminitd gRPC contract) runs on Linux hosts via cloud-hypervisor + KVM. **CloudHypervisor Swift package** (`Sources/CloudHypervisor/`) — a thin client for cloud-hypervisor's REST-over-UDS API, layered on AsyncHTTPClient. Endpoints cover VMM / VM lifecycle / hotplug (disk, fs, net, vsock, remove-device). Cross-platform (compiles on macOS for unit tests; consumed at runtime only by the Linux side of Containerization). **CH backend in Containerization** — one cloud-hypervisor subprocess per VM, gated behind `#if os(Linux)`. CHVirtualMachineManager / CHVirtualMachineInstance mirror the VZ shape behind the existing VirtualMachineManager / VirtualMachineInstance protocol. CHProcess and VirtiofsdProcess manage the binaries; CHHotplugProvider handles virtio-blk and virtio-fs runtime hotplug (with one virtiofsd per unique source-hash tag, refcounted across containers). **Linux host networking** — BridgeManager brings up a Linux bridge with an IPv4 subnet and (opt-in via `--enable-nat`) iptables MASQUERADE + scoped FORWARD rules. LinuxBridgedNetwork enslaves a fresh TAP per container to the bridge. State is recorded under `/run/containerization` so `cctl bridge delete` reverses exactly what create did. Bridge teardown verifies the link kind via sysfs to refuse deleting non-bridge interfaces. **cctl run / bridge** — end-to-end Linux container run path (image pull, ext4 rootfs assembly, VM boot, container exec) plus `cctl bridge create|delete` for the host network plumbing. **Build & dist** — `make linux-build` / `make linux-integration` build and exercise the host side inside an apple/container `--virtualization` dev container. `make dist-x86_64` produces a deployment tarball (cctl + cloud-hypervisor + virtiofsd + initfs + kernel) cross-compiled from the aarch64 dev container; pipeline documented in `docs/x86_64-build.md`. Static-musl C deps and the Zig cross compiler are pinned by SHA256. The host orchestrator runs as root. Per-VM runtime state lives under `/run/containerization/ch/` with mode 0700; UDS sockets inside are bound with mode 0600. Vminitd's gRPC channel inherits that trust boundary — socket-file perms are the auth. Sandbox flags are upstream-secure by default. Two per-component opt-outs exist for the apple/container dev-container case (where the host seccomp profile SIGSYS-kills CH and virtiofsd): - `CONTAINERIZATION_NO_CH_SECCOMP=1` — `cloud-hypervisor --seccomp false`. - `CONTAINERIZATION_NO_VIRTIOFSD_SANDBOX=1` — `virtiofsd --sandbox none`. Each emits a one-shot `logger.warning` at process start. Legacy alias `CONTAINERIZATION_RELAXED_SANDBOX=1` flips both. cctl spawns both binaries with `setsid` and a minimal env allowlist (PATH / HOME / RUST_LOG / RUST_BACKTRACE) so the parent's secrets don't leak to children. `make linux-integration` runs the cross-platform integration suite against a real cloud-hypervisor VM inside the dev container. Linux runs the cross-platform subset (`process true`/`false`/`echo hi`, virtiofs round-trip, hotplug); the macOS suite is unchanged. Signed-off-by: michael_crosby --- .gitignore | 2 +- CLAUDE.md | 91 +++ Makefile | 184 ++++- Package.swift | 29 +- README.md | 16 + Sources/CShim/cz_tap.c | 78 ++ Sources/CShim/include/cz_tap.h | 39 + Sources/CloudHypervisor/Client.swift | 169 ++++ Sources/CloudHypervisor/CloudHypervisor.swift | 17 + .../Endpoints/Client+Hotplug.swift | 55 ++ .../CloudHypervisor/Endpoints/Client+VM.swift | 61 ++ .../Endpoints/Client+VMM.swift | 40 + Sources/CloudHypervisor/Errors.swift | 27 + Sources/CloudHypervisor/HTTPOverUDS.swift | 202 +++++ Sources/CloudHypervisor/README.md | 95 +++ .../CloudHypervisor/Types/DeviceConfigs.swift | 242 ++++++ Sources/CloudHypervisor/Types/VmConfig.swift | 187 +++++ Sources/CloudHypervisor/Types/VmInfo.swift | 122 +++ Sources/Containerization/BridgeManager.swift | 359 +++++++++ .../Containerization/BridgeStateFile.swift | 75 ++ .../Containerization/CHHotplugProvider.swift | 422 ++++++++++ .../CHInstanceExtension.swift | 46 ++ Sources/Containerization/CHInterface.swift | 68 ++ Sources/Containerization/CHProcess.swift | 211 +++++ .../CHVirtualMachineInstance.swift | 753 ++++++++++++++++++ .../CHVirtualMachineManager.swift | 149 ++++ .../Containerization/ContainerManager.swift | 7 - .../Containerization/HostDefaultRoute.swift | 67 ++ Sources/Containerization/IptablesRules.swift | 102 +++ .../Containerization/Kernel+Commandline.swift | 50 ++ .../LinuxBridgedNetwork.swift | 181 +++++ Sources/Containerization/LinuxContainer.swift | 57 +- Sources/Containerization/LinuxPod.swift | 58 +- Sources/Containerization/LinuxProcess.swift | 2 +- Sources/Containerization/Mount+CH.swift | 80 ++ Sources/Containerization/Mount.swift | 142 +++- .../Containerization/SandboxOverrides.swift | 100 +++ Sources/Containerization/TAPDevice.swift | 155 ++++ .../VZVirtualMachineInstance.swift | 32 - .../Containerization/VirtiofsdProcess.swift | 187 +++++ .../VirtualMachineInstance.swift | 16 + Sources/Containerization/Vsock+Linux.swift | 158 ++++ Sources/Containerization/VsockListener.swift | 25 + Sources/ContainerizationExtras/CIDRv4.swift | 8 + .../NetlinkSession.swift | 215 ++++- Sources/ContainerizationNetlink/Types.swift | 7 + Sources/Integration/ContainerTests.swift | 9 + Sources/Integration/NBDServer.swift | 2 + Sources/Integration/PodTests.swift | 2 + Sources/Integration/PodVolumeTests.swift | 2 + Sources/Integration/Suite.swift | 451 +++++++---- Sources/cctl/BridgeCommand.swift | 95 +++ Sources/cctl/ImageCommand.swift | 4 +- Sources/cctl/RunCommand.swift | 335 ++++++++ Sources/cctl/cctl.swift | 10 +- Tests/CloudHypervisorTests/ClientTests.swift | 506 ++++++++++++ Tests/CloudHypervisorTests/ErrorsTests.swift | 35 + .../CloudHypervisorTests/StubHTTPServer.swift | 203 +++++ Tests/CloudHypervisorTests/TypesTests.swift | 328 ++++++++ .../BridgeStateFileTests.swift | 60 ++ .../CHInterfaceTests.swift | 69 ++ .../HostDefaultRouteTests.swift | 66 ++ .../ContainerizationTests/MountCHTests.swift | 72 ++ .../TAPNameDerivationTests.swift | 59 ++ docs/x86_64-build.md | 228 ++++++ images/linux-dev/Dockerfile | 107 ++- images/linux-dev/wrappers/x86_64-linux-gnu-ar | 2 + .../linux-dev/wrappers/x86_64-linux-gnu-g++ | 24 + .../linux-dev/wrappers/x86_64-linux-gnu-gcc | 31 + images/linux-dev/wrappers/x86_64-linux-gnu-ld | 9 + .../wrappers/x86_64-linux-gnu-ranlib | 2 + .../linux-dev/wrappers/x86_64-linux-gnu-strip | 2 + .../linux-dev/wrappers/x86_64-linux-musl-ar | 2 + .../linux-dev/wrappers/x86_64-linux-musl-g++ | 14 + .../linux-dev/wrappers/x86_64-linux-musl-gcc | 14 + .../wrappers/x86_64-linux-musl-ranlib | 2 + .../wrappers/x86_64-linux-musl-strip | 2 + scripts/build-dist-x86_64.sh | 243 ++++++ scripts/build-glibc-x86_64-deps.sh | 138 ++++ scripts/build-musl-x86_64-deps.sh | 188 +++++ ...ofsd-skip-cap-drop-with-sandbox-none.patch | 41 + vminitd/Makefile | 23 +- vminitd/Sources/VminitdCore/Server+GRPC.swift | 2 +- 83 files changed, 8532 insertions(+), 238 deletions(-) create mode 100644 CLAUDE.md create mode 100644 Sources/CShim/cz_tap.c create mode 100644 Sources/CShim/include/cz_tap.h create mode 100644 Sources/CloudHypervisor/Client.swift create mode 100644 Sources/CloudHypervisor/CloudHypervisor.swift create mode 100644 Sources/CloudHypervisor/Endpoints/Client+Hotplug.swift create mode 100644 Sources/CloudHypervisor/Endpoints/Client+VM.swift create mode 100644 Sources/CloudHypervisor/Endpoints/Client+VMM.swift create mode 100644 Sources/CloudHypervisor/Errors.swift create mode 100644 Sources/CloudHypervisor/HTTPOverUDS.swift create mode 100644 Sources/CloudHypervisor/README.md create mode 100644 Sources/CloudHypervisor/Types/DeviceConfigs.swift create mode 100644 Sources/CloudHypervisor/Types/VmConfig.swift create mode 100644 Sources/CloudHypervisor/Types/VmInfo.swift create mode 100644 Sources/Containerization/BridgeManager.swift create mode 100644 Sources/Containerization/BridgeStateFile.swift create mode 100644 Sources/Containerization/CHHotplugProvider.swift create mode 100644 Sources/Containerization/CHInstanceExtension.swift create mode 100644 Sources/Containerization/CHInterface.swift create mode 100644 Sources/Containerization/CHProcess.swift create mode 100644 Sources/Containerization/CHVirtualMachineInstance.swift create mode 100644 Sources/Containerization/CHVirtualMachineManager.swift create mode 100644 Sources/Containerization/HostDefaultRoute.swift create mode 100644 Sources/Containerization/IptablesRules.swift create mode 100644 Sources/Containerization/Kernel+Commandline.swift create mode 100644 Sources/Containerization/LinuxBridgedNetwork.swift create mode 100644 Sources/Containerization/Mount+CH.swift create mode 100644 Sources/Containerization/SandboxOverrides.swift create mode 100644 Sources/Containerization/TAPDevice.swift create mode 100644 Sources/Containerization/VirtiofsdProcess.swift create mode 100644 Sources/Containerization/Vsock+Linux.swift create mode 100644 Sources/cctl/BridgeCommand.swift create mode 100644 Tests/CloudHypervisorTests/ClientTests.swift create mode 100644 Tests/CloudHypervisorTests/ErrorsTests.swift create mode 100644 Tests/CloudHypervisorTests/StubHTTPServer.swift create mode 100644 Tests/CloudHypervisorTests/TypesTests.swift create mode 100644 Tests/ContainerizationTests/BridgeStateFileTests.swift create mode 100644 Tests/ContainerizationTests/CHInterfaceTests.swift create mode 100644 Tests/ContainerizationTests/HostDefaultRouteTests.swift create mode 100644 Tests/ContainerizationTests/MountCHTests.swift create mode 100644 Tests/ContainerizationTests/TAPNameDerivationTests.swift create mode 100644 docs/x86_64-build.md create mode 100755 images/linux-dev/wrappers/x86_64-linux-gnu-ar create mode 100755 images/linux-dev/wrappers/x86_64-linux-gnu-g++ create mode 100755 images/linux-dev/wrappers/x86_64-linux-gnu-gcc create mode 100755 images/linux-dev/wrappers/x86_64-linux-gnu-ld create mode 100755 images/linux-dev/wrappers/x86_64-linux-gnu-ranlib create mode 100755 images/linux-dev/wrappers/x86_64-linux-gnu-strip create mode 100755 images/linux-dev/wrappers/x86_64-linux-musl-ar create mode 100755 images/linux-dev/wrappers/x86_64-linux-musl-g++ create mode 100755 images/linux-dev/wrappers/x86_64-linux-musl-gcc create mode 100755 images/linux-dev/wrappers/x86_64-linux-musl-ranlib create mode 100755 images/linux-dev/wrappers/x86_64-linux-musl-strip create mode 100755 scripts/build-dist-x86_64.sh create mode 100755 scripts/build-glibc-x86_64-deps.sh create mode 100755 scripts/build-musl-x86_64-deps.sh create mode 100644 scripts/patches/virtiofsd-skip-cap-drop-with-sandbox-none.patch diff --git a/.gitignore b/.gitignore index dfecc070..088b1bf4 100644 --- a/.gitignore +++ b/.gitignore @@ -25,4 +25,4 @@ vmlinux* # API docs for local preview only. _site/ _serve/ - +kernel/vmlinuz-x86_64 diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..eb1484cc --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,91 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Build / Test / Format + +The project is built via `make`, not directly with `swift build`. Two Swift packages live in this repo: the root package (Containerization libraries + `cctl` + macOS-only integration binary) and `vminitd/` (the Linux guest init system, cross-compiled with the Static Linux SDK). + +- `make all` — build everything (`containerization` + `vminitd` + `init.ext4` rootfs in `bin/`). Default `BUILD_CONFIGURATION=debug`; pass `release` (or use `make release`) for optimized builds. +- `make containerization` — build just the host-side Swift package (skips vminitd). +- `make vminitd` — build vminitd / vmexec only. By default uses `LIBC=musl` via the Static Linux SDK; `make linux-build LIBC=glibc` builds via a Linux dev container. +- `make test` — unit tests with code coverage. `make coverage` regenerates the coverage report. +- `make integration` — runs `bin/containerization-integration`. Requires an in-repo kernel under `bin/` (`bin/vmlinux-arm64` on arm64, `bin/vmlinuz-x86_64` or `bin/vmlinux-x86_64` on x86_64); if absent, run `make fetch-default-kernel` to download the Kata-provided kernel for the host arch. +- Single test: `swift test --filter ContainerizationOCITests.ReferenceTests/testParsing` (Swift Testing / XCTest filter syntax). Targets are listed in `Package.swift`. +- `make linux-test` — runs `swift test` inside the Linux dev container (requires the `container` CLI from apple/container). +- `make linux-build` — builds the host-side Swift package (incl. `cctl`, `Containerization`, and `CloudHypervisor`) inside the same Linux dev container. Use this to validate Linux portability of host-side code; the resulting `cctl` is what the cloud-hypervisor backend ships behind. +- `make linux-integration` — runs the cross-platform integration suite against a real cloud-hypervisor VM inside the dev container (nested virt via apple/container's `--virtualization`). Requires a KVM-capable kernel at `kernel/vmlinux-arm64` (or `kernel/vmlinuz-x86_64` on x86_64 hosts) — build via `make -C kernel`; the kata-fetched kernel doesn't include KVM. Also requires `make fetch-cloud-hypervisor` and `make linux-build` to have been run first. Linux runs only the cross-platform subset (`process true`/`false`/`echo hi`); the macOS suite is unchanged. +- `make fetch-cloud-hypervisor` — downloads the static `cloud-hypervisor` v52.0 (aarch64) binary into `bin/cloud-hypervisor` for the Linux integration tests. +- `make build-cloud-hypervisor` / `make build-virtiofsd` — build patched `cloud-hypervisor` / `virtiofsd` from sources you have cloned into `.local/cloud-hypervisor` and `.local/virtiofsd` respectively. There is no fetch target — clone the upstream repos at the revision you want pinned. `build-virtiofsd` applies `scripts/patches/virtiofsd-skip-cap-drop-with-sandbox-none.patch` and is idempotent. Both run inside the same Linux dev container as `linux-integration` so the resulting binaries are aarch64-linux-gnu. +- `make dist-x86_64` — assembles `bin/containerization-x86_64-.tar.gz` (cctl + cloud-hypervisor + virtiofsd + initfs.ext4 + kernel) for x86_64 Linux deployment, cross-compiled inside the aarch64 dev container via the Static Linux SDK (Swift) and `cargo zigbuild` (Rust). Prereqs: `.local/cloud-hypervisor` and `.local/virtiofsd` source checkouts (clone deliberately — no fetch target), and an x86_64 kernel built via `make -C kernel TARGET_ARCH=x86_64`. Per-stage rebuild env vars: `REBUILD_VMINITD=1`, `REBUILD_INITFS=1`, `REBUILD_CH=1`, `REBUILD_VIRTIOFSD=1`; cctl x86 always rebuilds. **Full pipeline, toolchain rationale, and troubleshooting in `docs/x86_64-build.md`.** The orchestrator is `scripts/build-dist-x86_64.sh`. +- `make fmt` — applies `.swift-format` and refreshes license headers via hawkeye. +- `make check` — formatting + license-header lint (this is what the pre-commit hook runs). Uses `.swift-format-nolint` for stricter linting. +- `make pre-commit` — installs `scripts/pre-commit.fmt` as a git pre-commit hook. +- `make protos` — regenerates `Sources/Containerization/SandboxContext/SandboxContext.{pb,grpc}.swift` from the `.proto`. Touch this whenever the proto changes; never hand-edit the generated files. +- `make cross-prep` — installs Swiftly, the pinned Swift toolchain (see `.swift-version`), and the Static Linux SDK. Run once before the first build. + +`WARNINGS_AS_ERRORS=true` is the default for both packages. Don't disable it casually — CI builds with it on. + +## Architecture + +This is a **Swift library package** (not a CLI tool) that lets applications run Linux containers on Apple silicon by spawning a lightweight VM per container via `Virtualization.framework`. The corresponding end-user CLI lives in [`apple/container`](https://github.com/apple/container) and is **not** part of this repo. `cctl` here is a playground/example binary, not the shipping product. + +### The host ↔ guest split + +Every Linux container runs inside its own VM. The boundary between host (macOS) and guest (Linux) is the central architectural fact: + +- **Host side** (`Sources/`, `macOS` platform): orchestrates VMs through `Virtualization.framework` (`VZVirtualMachineInstance.swift`, `VZVirtualMachine+Helpers.swift`). The user-facing entry points are `LinuxContainer` (one container per VM) and `LinuxPod` (multiple containers in one VM, experimental). These build a `VMConfiguration`, boot the VM with the chosen `Kernel` and a rootfs containing `vminitd`, then drive the guest via gRPC. +- **Guest side** (`vminitd/`, Linux platform): `vminitd` is PID 1 inside the VM. It exposes a gRPC service over **vsock** (default port `1024`) defined by `Sources/Containerization/SandboxContext/SandboxContext.proto`. `VminitdCore` implements that service: managing containers via runc, handling stdio over vsock, signal/event delivery, cgroups, mounts, and process lifecycle. `vmexec` is a small helper used to launch container processes from inside the guest. + +The proto is the contract between the two halves. **The `.pb.swift` and `.grpc.swift` files in `SandboxContext/` are generated** — regenerate via `make protos` after changing `SandboxContext.proto`. Both host and guest depend on the same generated Swift via the path-dependency wiring in `vminitd/Package.swift` (`containerization` is a sibling path package). + +### VMM backends + +`Containerization` abstracts the VMM behind `VirtualMachineManager` / `VirtualMachineInstance`. Two backends ship in this repo, both inside the same `Containerization` target but gated by `#if`: + +- **macOS**: `VZVirtualMachineManager` / `VZVirtualMachineInstance` (`VZ*` files, `#if os(macOS)`). Drives `Virtualization.framework` directly. +- **Linux**: `CHVirtualMachineManager` / `CHVirtualMachineInstance` (`CH*` files plus `CHProcess`, `VirtiofsdProcess`, `Vsock+Linux`, all `#if os(Linux)`). One `cloud-hypervisor` subprocess per VM, REST-on-UDS control plane via the standalone [`CloudHypervisor`](./Sources/CloudHypervisor) Swift package, virtio-blk / virtio-fs (one `virtiofsd` per share) / TAP / vsock for the data plane. Same `Vminitd` guest contract as VZ — only the host-side VMM differs. + +The `CloudHypervisor` library is a thin NIO-based HTTP/1.1-over-UDS client targeting cloud-hypervisor's REST API. It compiles on both platforms (so it can be unit-tested on macOS without a real cloud-hypervisor binary), but is only consumed by the Linux backend at runtime. + +**Sandbox env vars.** `CHProcess` and `VirtiofsdProcess` default to the upstream-secure spawn flags. Per-component opt-outs: + +- `CONTAINERIZATION_NO_CH_SECCOMP=1` — launch cloud-hypervisor with `--seccomp false`. +- `CONTAINERIZATION_NO_VIRTIOFSD_SANDBOX=1` — launch virtiofsd with `--sandbox none`. + +Both flags emit a one-line `logger.warning` at start so a relaxed-sandbox VM is loud in the host log. The legacy alias `CONTAINERIZATION_RELAXED_SANDBOX=1` continues to flip both at once. These are required inside apple/container's `--virtualization` dev container, where the host seccomp profile SIGSYS-kills both binaries; `make linux-integration` sets the legacy alias automatically. Leave them unset in production deployments where the host policy lets CH/virtiofsd run unmolested. + +### Library targets (`Sources/`) + +These are independently consumable Swift modules. Keep their dependencies narrow: + +- `Containerization` — the top-level orchestration layer (`LinuxContainer`, `LinuxPod`, `VMConfiguration`, `Vminitd` gRPC client wrapper, mounts, networking, sockets, image unpacking). Hosts both the macOS (VZ) and Linux (CH) VMM backends behind `#if os(...)`. +- `CloudHypervisor` — standalone NIO-based HTTP/1.1-over-UDS client targeting cloud-hypervisor's REST API. Cross-platform (compiles on macOS for unit tests; consumed at runtime only by the Linux side of `Containerization`). +- `ContainerizationOCI` — OCI image spec types, registry client (push/pull/auth), local OCI layout, content store. Used host-side for image management. +- `ContainerizationEXT4` — pure-Swift ext4 reader/formatter; used to build container rootfs blocks (`bin/initfs.ext4`). +- `ContainerizationArchive` — Swift wrapper around vendored libarchive headers (`Sources/ContainerizationArchive/CArchive`, refreshable via `make update-libarchive-source`). Links system `libarchive`, `lzma`, `bz2`, `z`, plus zstd via SwiftPM. +- `ContainerizationNetlink` — netlink socket bindings (used by vminitd for in-guest network configuration). +- `ContainerizationOS` — POSIX/Darwin/Linux platform shims (`Command`, `Terminal`, `Socket`, signal handling, mount syscalls, keychain). Cross-platform. +- `ContainerizationIO` — small NIO-flavored stream/reader utilities. +- `ContainerizationExtras`, `ContainerizationError`, `CShim` — shared helpers and a tiny C bridge. + +`Sources/Integration/` is the macOS-only `containerization-integration` binary (the integration test runner; it is not a `testTarget`, it's an `executableTarget` that's invoked by `make integration`). Unit `testTarget`s live under `Tests/`. + +### vminitd internals (`vminitd/Sources/`) + +- `VminitdCore/Server+GRPC.swift` is the bulk of the guest agent — it implements every RPC declared in `SandboxContext.proto`. +- `VminitdCore/Runc/` plus `RuncProcess.swift` / `ManagedContainer.swift` / `ManagedProcess.swift` shell out to `runc` for actual container execution. `ProcessSupervisor` reaps and dispatches exit events. +- `Cgroup/` handles cgroup v2 setup. `LCShim/` and `CVersion/` are small C bridges (the latter injects `GIT_COMMIT`/`GIT_TAG`/`BUILD_TIME` at compile time). +- `vmexec` runs a single container process inside the guest namespace and is what `vminitd` execs to launch container workloads. + +## Conventions + +- **License headers are required** on every Swift file. `make check-licenses` runs hawkeye against `scripts/license-header.txt`. New files: run `make update-licenses` (or `make fmt`) before committing. +- **Formatting**: `.swift-format` (line length 180, 4-space indent). The lint config (`.swift-format-nolint`) is what CI enforces. `NeverForceUnwrap`, `NeverUseForceTry`, and `NeverUseImplicitlyUnwrappedOptionals` are all on — don't introduce `!` / `try!`. +- **Package isolation**: prefer adding code to the smallest applicable module. Don't pull `Containerization` into `ContainerizationOCI` or similar — the leaf modules are intentionally light so they can be consumed standalone. +- **`SandboxContext.proto` is excluded from the `Containerization` target** (see `Package.swift`). The generated `.pb.swift` / `.grpc.swift` files are checked in. +- **Squash-and-merge**: PRs land as a single commit, so the PR title/body becomes the commit message — write it accordingly. Commits must be signed (per `CONTRIBUTING.md`). + +## Requirements + +Apple silicon Mac, macOS 26, Xcode 26. Swift toolchain version is pinned in `.swift-version` (currently `6.3.0`) and installed via Swiftly during `make cross-prep`. Older macOS releases are not supported. diff --git a/Makefile b/Makefile index e45835eb..9c99a9e2 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,14 @@ KERNEL_CANDIDATES := bin/vmlinuz-x86_64 bin/vmlinux-x86_64 else KERNEL_CANDIDATES := bin/vmlinux-$(KERNEL_ARCH) endif +# In-repo KVM-capable kernel built by `make -C kernel` (vmlinuz for x86_64 bzImage, +# vmlinux for arm64 Image). linux-integration requires this; the kata-fetched +# kernel under bin/ does not enable KVM. +ifeq ($(KERNEL_ARCH),x86_64) +LINUX_INTEGRATION_KERNEL := kernel/vmlinuz-x86_64 +else +LINUX_INTEGRATION_KERNEL := kernel/vmlinux-$(KERNEL_ARCH) +endif ifeq ($(UNAME_S),Darwin) SWIFT ?= /usr/bin/swift else @@ -44,15 +52,37 @@ LIBARCHIVE_UPSTREAM_VERSION := v3.7.7 LIBARCHIVE_LOCAL_DIR := workdir/libarchive KATA_BINARY_PACKAGE := https://github.com/kata-containers/kata-containers/releases/download/3.17.0/kata-static-3.17.0-arm64.tar.xz +CLOUD_HYPERVISOR_URL := https://github.com/cloud-hypervisor/cloud-hypervisor/releases/download/v52.0/cloud-hypervisor-static-aarch64 +# SHA256 of the v52.0 aarch64 static binary (verified locally from the +# upstream release artifact). Bump alongside CLOUD_HYPERVISOR_URL. +CLOUD_HYPERVISOR_SHA256 := bf004ddc1a148f47caa87ac49a783b8dbd6bf9bc27abe522ed197df7b982d3b1 SWIFT_VERSION := $(shell cat $(ROOT_DIR)/.swift-version) SWIFT_SDK_URL := $(shell grep '^SWIFT_SDK_URL' vminitd/Makefile | head -1 | sed 's/.*:= *//') SWIFT_SDK_CHECKSUM := $(shell grep '^SWIFT_SDK_CHECKSUM' vminitd/Makefile | head -1 | sed 's/.*:= *//') LINUX_DEV_IMAGE := containerization-dev:$(SWIFT_VERSION) +# Literal `,` for use inside $(call ...) arguments — bare commas are +# treated as the call's argument separator and split the value early. +comma := , + # Run a command inside a Linux dev container. # Requires 'container' (https://github.com/apple/container). # Automatically builds the dev image if it doesn't exist. +# +# Bind-mounts $(ROOT_DIR)/.local/integration-cache → the dev container's +# appRoot (`~/.local/share/com.apple.containerization`) so cctl-populated +# imageStore content (e.g. `vminit:latest` from `make init`, plus images +# pulled by the integration suite like alpine) persists across `container +# run` invocations. Without this, every `make linux-integration` re-pulls +# alpine and re-imports vminit, which dominates per-suite ramp-up. The +# macOS path gets this for free because $HOME persists. +# +# $(1): bash command to run inside the container. +# $(2): optional extra flags for `container run` (empty by default). Use this +# for linux-integration to pass `--kernel kernel/vmlinux-` so +# /dev/kvm is exposed in the dev container's Linux VM (the kata kernel +# fetched by `make fetch-default-kernel` does not enable KVM). define linux_run @if ! command -v container > /dev/null 2>&1; then \ echo "Error: 'container' CLI not found. Install from https://github.com/apple/container"; \ @@ -62,7 +92,11 @@ define linux_run echo "Building Linux dev container image..."; \ $(MAKE) linux-image; \ fi - @container run --memory 8gb --cpus 4 -v $(ROOT_DIR):/workspace -w /workspace $(LINUX_DEV_IMAGE) \ + @mkdir -p $(ROOT_DIR)/.local/integration-cache + @container run --rm $(2) --memory 16gb --cpus 8 --virtualization \ + -v $(ROOT_DIR):/workspace \ + -v $(ROOT_DIR)/.local/integration-cache:/root/.local/share/com.apple.containerization \ + -w /workspace $(LINUX_DEV_IMAGE) \ bash -c "$(1)" endef @@ -93,14 +127,141 @@ linux-image: linux-build: LIBC ?= musl linux-build: ifeq ($(LIBC),all) - $(call linux_run,make containerization && make -C vminitd LIBC=glibc && make -C vminitd LIBC=musl) + $(call linux_run,make containerization && make -C vminitd LIBC=glibc && make -C vminitd LIBC=musl && make init) else - $(call linux_run,make containerization && make -C vminitd LIBC=$(LIBC)) + $(call linux_run,make containerization && make -C vminitd LIBC=$(LIBC) && make init) endif .PHONY: linux-test linux-test: $(call linux_run,swift test $(SWIFT_CONFIGURATION)) + +.PHONY: build-cloud-hypervisor +# Build cloud-hypervisor from the patched source at .local/cloud-hypervisor and +# install it to bin/cloud-hypervisor. Runs inside the Linux dev container so the +# resulting binary is aarch64-linux-gnu and can run nested-virt under +# `container run --virtualization`. Installs build deps + rustup the first +# time. Forces HOME=/root since the container inherits the host HOME otherwise, +# which breaks rustup's $HOME/.cargo path. +# +# Prerequisite: clone cloud-hypervisor into .local/cloud-hypervisor (any +# revision compatible with the v52.0 REST surface this repo targets). There +# is no fetch target — pin the revision deliberately. Example: +# git clone -b v52.0 https://github.com/cloud-hypervisor/cloud-hypervisor \ +# .local/cloud-hypervisor +build-cloud-hypervisor: +ifeq (,$(wildcard .local/cloud-hypervisor/Cargo.toml)) + @echo "missing .local/cloud-hypervisor source checkout." >&2 + @echo "clone the cloud-hypervisor repo into .local/cloud-hypervisor before running this target, e.g.:" >&2 + @echo " git clone -b v52.0 https://github.com/cloud-hypervisor/cloud-hypervisor .local/cloud-hypervisor" >&2 + @exit 1 +endif + $(call linux_run,export HOME=/root && if ! command -v curl >/dev/null 2>&1; then apt-get update && apt-get install -y --no-install-recommends curl ca-certificates build-essential pkg-config libssl-dev; fi && if [ ! -x /root/.cargo/bin/cargo ]; then curl --proto =https --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable --profile minimal; fi && . /root/.cargo/env && cd .local/cloud-hypervisor && cargo build --release --bin cloud-hypervisor && cp target/release/cloud-hypervisor /workspace/bin/cloud-hypervisor && chmod +x /workspace/bin/cloud-hypervisor) + +.PHONY: build-virtiofsd +# Build virtiofsd from the source at .local/virtiofsd and install it to +# bin/virtiofsd. Runs inside the Linux dev container so the resulting +# binary is aarch64-linux-gnu and matches the cloud-hypervisor binary +# built by `make build-cloud-hypervisor`. +# +# Prerequisite: clone virtiofsd into .local/virtiofsd (any revision the +# scripts/patches/virtiofsd-skip-cap-drop-with-sandbox-none.patch applies +# cleanly to). There is no fetch target — pin the revision deliberately: +# git clone https://gitlab.com/virtio-fs/virtiofsd .local/virtiofsd +# +# virtiofsd has two hard build deps that aren't in the base dev image: +# * libcap-ng-dev — capng crate is unconditional in [dependencies]. +# * libseccomp-dev — Cargo.toml has `default = ["seccomp"]` and +# `[[bin]] required-features = ["seccomp"]`, and libseccomp-sys is a +# -sys crate that links against the system library via pkg-config. +# Both are required even though we run with `--sandbox none` (capng is +# called for capability-drop at startup, before any sandbox setup). +# +# Before building, applies the patch at +# scripts/patches/virtiofsd-skip-cap-drop-with-sandbox-none.patch (see +# that file for rationale). Idempotent: skips if already applied via +# git apply --reverse --check. +# +# Sentinel for the apt-get block is libcap-ng + libseccomp via pkg-config +# (not `command -v curl`) so this target works correctly even after +# `build-cloud-hypervisor` has already installed curl in the same dev +# container. +build-virtiofsd: +ifeq (,$(wildcard .local/virtiofsd/Cargo.toml)) + @echo "missing .local/virtiofsd source checkout." >&2 + @echo "clone the virtiofsd repo into .local/virtiofsd before running this target, e.g.:" >&2 + @echo " git clone https://gitlab.com/virtio-fs/virtiofsd .local/virtiofsd" >&2 + @exit 1 +endif + $(call linux_run,export HOME=/root && \ + if ! pkg-config --exists libcap-ng libseccomp 2>/dev/null; then \ + apt-get update && apt-get install -y --no-install-recommends \ + curl ca-certificates build-essential pkg-config libssl-dev \ + libcap-ng-dev libseccomp-dev; \ + fi && \ + if [ ! -x /root/.cargo/bin/cargo ]; then \ + curl --proto =https --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable --profile minimal; \ + fi && \ + . /root/.cargo/env && \ + cd /workspace/.local/virtiofsd && \ + if git apply --check /workspace/scripts/patches/virtiofsd-skip-cap-drop-with-sandbox-none.patch 2>/dev/null; then \ + git apply /workspace/scripts/patches/virtiofsd-skip-cap-drop-with-sandbox-none.patch && \ + echo "applied virtiofsd cap-drop patch"; \ + elif git apply --reverse --check /workspace/scripts/patches/virtiofsd-skip-cap-drop-with-sandbox-none.patch 2>/dev/null; then \ + echo "virtiofsd cap-drop patch already applied"; \ + else \ + echo "ERROR: virtiofsd cap-drop patch does not apply cleanly" >&2; \ + exit 1; \ + fi && \ + cargo build --release && \ + cp target/release/virtiofsd /workspace/bin/virtiofsd && \ + chmod +x /workspace/bin/virtiofsd) + +.PHONY: linux-integration +linux-integration: +ifeq (,$(wildcard bin/cloud-hypervisor)) + @echo "missing bin/cloud-hypervisor; run 'make fetch-cloud-hypervisor' first" + @exit 1 +endif +ifeq (,$(wildcard bin/virtiofsd)) + @echo "missing bin/virtiofsd; run 'make build-virtiofsd' first" + @exit 1 +endif +ifeq (,$(wildcard $(LINUX_INTEGRATION_KERNEL))) + @echo "missing $(LINUX_INTEGRATION_KERNEL); run 'make -C kernel' first to build a KVM-capable kernel" + @exit 1 +endif +ifeq (,$(wildcard bin/containerization-integration)) + @echo "missing bin/containerization-integration; run 'make linux-build' first" + @exit 1 +endif +ifeq (,$(wildcard bin/initfs.ext4)) + @echo "missing bin/initfs.ext4; run 'make init' first (this also seeds the persistent imageStore at .local/integration-cache)" + @exit 1 +endif + $(call linux_run,CONTAINERIZATION_RELAXED_SANDBOX=1 ./bin/containerization-integration --kernel ./$(LINUX_INTEGRATION_KERNEL) --ch-binary ./bin/cloud-hypervisor --virtiofsd-binary ./bin/virtiofsd --max-concurrency 1,--kernel $(LINUX_INTEGRATION_KERNEL)) + +# Builds the x86_64 deployment tarball. +# +# Cross-compiles cctl, vminitd, cloud-hypervisor, and virtiofsd to +# x86_64-linux-musl inside the aarch64 Linux dev container (using the +# musl cross toolchain + static C deps installed by the dev image), +# packs an initfs.ext4 with the x86_64 vminitd inside, and emits +# bin/containerization-x86_64-.tar.gz. +# +# Depends on linux-image so that Dockerfile / build-musl-x86_64-deps.sh +# changes are picked up automatically. `container build` is cheap when +# layers are cached, so the no-change path is a few seconds of overhead. +# +# Prereqs: +# * .local/cloud-hypervisor and .local/virtiofsd source checkouts +# (see build-cloud-hypervisor / build-virtiofsd for clone URLs). +# * kernel/vmlinuz-x86_64 (preferred) or kernel/vmlinux-x86_64 present. +# Build via `make -C kernel TARGET_ARCH=x86_64` (or `make -C kernel x86_64`). +# The script fails hard if neither is present. +.PHONY: dist-x86_64 +dist-x86_64: linux-image + $(call linux_run,./scripts/build-dist-x86_64.sh) endif .PHONY: all @@ -120,9 +281,8 @@ containerization: @echo Copying containerization binaries... @mkdir -p bin @install "$(BUILD_BIN_DIR)/cctl" ./bin/ -ifeq ($(UNAME_S),Darwin) @install "$(BUILD_BIN_DIR)/containerization-integration" ./bin/ - +ifeq ($(UNAME_S),Darwin) @echo Signing containerization binaries... @codesign --force --sign - --timestamp=none --entitlements=signing/vz.entitlements bin/cctl @codesign --force --sign - --timestamp=none --entitlements=signing/vz.entitlements bin/containerization-integration @@ -199,6 +359,20 @@ ifeq (,$(wildcard bin/vmlinux-$(KERNEL_ARCH))) @cp .local/vmlinux-$(KERNEL_ARCH) bin/vmlinux-$(KERNEL_ARCH) endif +.PHONY: fetch-cloud-hypervisor +fetch-cloud-hypervisor: + @mkdir -p bin + @curl -SsL -o bin/cloud-hypervisor $(CLOUD_HYPERVISOR_URL) + @actual=$$(shasum -a 256 bin/cloud-hypervisor | awk '{print $$1}'); \ + if [ "$$actual" != "$(CLOUD_HYPERVISOR_SHA256)" ]; then \ + echo "ERROR: cloud-hypervisor checksum mismatch" >&2; \ + echo " expected: $(CLOUD_HYPERVISOR_SHA256)" >&2; \ + echo " actual: $$actual" >&2; \ + rm -f bin/cloud-hypervisor; \ + exit 1; \ + fi + @chmod +x bin/cloud-hypervisor + .PHONY: check check: swift-fmt-check check-licenses diff --git a/Package.swift b/Package.swift index df498005..04d84119 100644 --- a/Package.swift +++ b/Package.swift @@ -34,6 +34,7 @@ let package = Package( .library(name: "ContainerizationExtras", targets: ["ContainerizationExtras"]), .library(name: "ContainerizationArchive", targets: ["ContainerizationArchive"]), .library(name: "VminitdCore", targets: ["VminitdCore", "Cgroup", "LCShim"]), + .library(name: "CloudHypervisor", targets: ["CloudHypervisor"]), .executable(name: "cctl", targets: ["cctl"]), ], dependencies: [ @@ -65,12 +66,15 @@ let package = Package( .product(name: "GRPCNIOTransportHTTP2", package: "grpc-swift-nio-transport"), .product(name: "GRPCProtobuf", package: "grpc-swift-protobuf"), .product(name: "_NIOFileSystem", package: "swift-nio"), + "CloudHypervisor", "ContainerizationArchive", "ContainerizationOCI", "ContainerizationOS", "ContainerizationIO", "ContainerizationExtras", "ContainerizationEXT4", + "ContainerizationNetlink", + "CShim", ], exclude: [ "../Containerization/SandboxContext/SandboxContext.proto" @@ -91,7 +95,7 @@ let package = Package( ), .testTarget( name: "ContainerizationUnitTests", - dependencies: ["Containerization"], + dependencies: ["Containerization", "CloudHypervisor"], path: "Tests/ContainerizationTests", resources: [ .copy("ImageTests/Resources/scratch.tar"), @@ -260,6 +264,27 @@ let package = Package( .target( name: "CShim" ), + .target( + name: "CloudHypervisor", + dependencies: [ + .product(name: "AsyncHTTPClient", package: "async-http-client"), + .product(name: "Logging", package: "swift-log"), + .product(name: "NIOCore", package: "swift-nio"), + .product(name: "NIOPosix", package: "swift-nio"), + .product(name: "NIOHTTP1", package: "swift-nio"), + .product(name: "NIOConcurrencyHelpers", package: "swift-nio"), + ] + ), + .testTarget( + name: "CloudHypervisorTests", + dependencies: [ + "CloudHypervisor", + .product(name: "NIOCore", package: "swift-nio"), + .product(name: "NIOPosix", package: "swift-nio"), + .product(name: "NIOHTTP1", package: "swift-nio"), + .product(name: "NIOConcurrencyHelpers", package: "swift-nio"), + ] + ), .target( name: "LCShim", path: "vminitd/Sources/LCShim" @@ -297,7 +322,6 @@ let package = Package( ] ) -#if os(macOS) package.targets.append( .executableTarget( name: "containerization-integration", @@ -311,4 +335,3 @@ package.targets.append( path: "Sources/Integration" ) ) -#endif diff --git a/README.md b/README.md index 7e81eb17..3bf63980 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,22 @@ Containerization executes each Linux container inside of its own lightweight vir The API allows the runtime environment to be configured and containerized processes to be launched. `vminitd` provides I/O, signals, and events to the calling process when a process is run. +## Backends + +Containerization abstracts the VMM behind the `VirtualMachineManager` / +`VirtualMachineInstance` protocols and ships two implementations: + +- **macOS — Virtualization.framework** (`VZVirtualMachineManager`). The shipping path on Apple silicon. Uses Apple's `Virtualization` framework directly; no extra binaries required. +- **Linux — cloud-hypervisor + KVM** (`CHVirtualMachineManager`). One `cloud-hypervisor` subprocess per VM, controlled over its REST-on-UDS API by the standalone [`CloudHypervisor`](./Sources/CloudHypervisor) Swift package. Block storage uses virtio-blk, shared directories use virtio-fs (one `virtiofsd` per share), networking uses TAP, and the guest agent is reached over cloud-hypervisor's hybrid vsock — same `vminitd` contract as the macOS path, so guest-side semantics are unchanged. + +The Linux backend requires: + +- `cloud-hypervisor` and `virtiofsd` on the host. Both are looked up on `PATH` by default; `CHVirtualMachineManager.init` accepts explicit URLs to override. `virtiofsd` is resolved lazily — a VM that uses only block-device mounts can run without it installed at all. Recent stable releases of each are recommended (smoke testing pins specific versions). +- KVM access (`/dev/kvm` readable + writable by the calling user). +- Pre-staged TAP / bridge / NAT plumbing if the container needs networking. `TAPInterface` consumes an existing TAP device by name; bringing it up, attaching it to a bridge, and configuring NAT or routing is the caller's responsibility. + +The integration test suite (`make linux-integration`) runs inside an apple/container Linux VM with nested virt enabled (`container run --virtualization`). The kata kernel fetched by `make fetch-default-kernel` does not enable KVM, so the integration suite uses the in-repo kernel at `kernel/vmlinux-arm64` (or `kernel/vmlinuz-x86_64` on x86_64 hosts) instead — build it with `make -C kernel` before invoking `make linux-integration`. On Linux the suite runs only the cross-platform scenarios that don't depend on macOS-only types; the full suite remains macOS-only for now. + ## Requirements To build the Containerization package, you need: diff --git a/Sources/CShim/cz_tap.c b/Sources/CShim/cz_tap.c new file mode 100644 index 00000000..9f4621f5 --- /dev/null +++ b/Sources/CShim/cz_tap.c @@ -0,0 +1,78 @@ +/* + * Copyright © 2026 Apple Inc. and the Containerization project authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if defined(__linux__) + +#include "cz_tap.h" + +#include +#include +#include /* struct ifreq, IFNAMSIZ */ +#include +#include +#include + +/* + * Avoid — the Static Linux SDK (musl) used to cross-compile + * vminitd ships from musl but not the linux kernel UAPI headers. + * The TUN ioctl number and flags are stable Linux ABI; redeclare locally. + * + * TUNSETIFF = _IOW('T', 202, int): + * dir=IOC_WRITE(1)<<30 | size(4)<<16 | type('T'=0x54)<<8 | nr(202=0xCA) + * = 0x400454CA + * Architecture-independent (Linux's ioctl encoding is the same on x86/arm). + */ +#ifndef TUNSETIFF +#define TUNSETIFF 0x400454CAu +#endif +#ifndef IFF_TAP +#define IFF_TAP 0x0002 +#endif +#ifndef IFF_NO_PI +#define IFF_NO_PI 0x1000 +#endif + +int cz_tap_create(const char *requested_name, char *out_name, size_t out_name_len) { + if (out_name == NULL || out_name_len < IFNAMSIZ) { + return -EINVAL; + } + + int fd = open("/dev/net/tun", O_RDWR | O_CLOEXEC); + if (fd < 0) { + return -errno; + } + + struct ifreq ifr; + memset(&ifr, 0, sizeof(ifr)); + ifr.ifr_flags = IFF_TAP | IFF_NO_PI; + if (requested_name != NULL && requested_name[0] != '\0') { + strncpy(ifr.ifr_name, requested_name, IFNAMSIZ - 1); + } + + if (ioctl(fd, TUNSETIFF, &ifr) < 0) { + int saved = errno; + close(fd); + return -saved; + } + + /* Copy out the resolved name. ifr.ifr_name is always NUL-terminated + * within IFNAMSIZ by the kernel. */ + memset(out_name, 0, out_name_len); + strncpy(out_name, ifr.ifr_name, IFNAMSIZ - 1); + return fd; +} + +#endif /* __linux__ */ diff --git a/Sources/CShim/include/cz_tap.h b/Sources/CShim/include/cz_tap.h new file mode 100644 index 00000000..e293b7fb --- /dev/null +++ b/Sources/CShim/include/cz_tap.h @@ -0,0 +1,39 @@ +/* + * Copyright © 2026 Apple Inc. and the Containerization project authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __CZ_TAP_H +#define __CZ_TAP_H + +#include + +/* + * Open /dev/net/tun, ioctl(TUNSETIFF) with IFF_TAP|IFF_NO_PI, and write the + * resolved interface name into `out_name` (must be at least 16 bytes). + * + * If `requested_name` is non-NULL and non-empty, it is the desired name; the + * kernel may rename it on collision (rare). If NULL or empty, the kernel + * picks a name like "tap%d". + * + * Returns the open fd on success, -errno on failure. + * + * Linux-only — the implementation in cz_tap.c is gated on __linux__. The + * declaration is left unconditional so Swift's clang importer can see it + * regardless of whose target's preprocessor defines reach the modulemap. + * On non-Linux targets the symbol is not provided; do not call. + */ +int cz_tap_create(const char *requested_name, char *out_name, size_t out_name_len); + +#endif /* __CZ_TAP_H */ diff --git a/Sources/CloudHypervisor/Client.swift b/Sources/CloudHypervisor/Client.swift new file mode 100644 index 00000000..2c4a67a2 --- /dev/null +++ b/Sources/CloudHypervisor/Client.swift @@ -0,0 +1,169 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import Foundation +import Logging +import NIOCore +import NIOHTTP1 +import NIOPosix + +extension CloudHypervisor { + /// A high-level client for Cloud Hypervisor's REST API over a Unix Domain Socket. + /// + /// Use ``init(socketPath:eventLoopGroup:logger:)`` to construct a client, then + /// call endpoint-specific methods (added as extensions in `Endpoints/`). + /// + /// The internal `get(_:)` / `put(_:)` / `put(_:body:)` helpers are used by + /// endpoint extensions in A8-A10 and are intentionally not public. + public final class Client: Sendable { + private let http: HTTPOverUDSClient + private let group: any EventLoopGroup + private let ownsGroup: Bool + private let encoder: JSONEncoder + private let decoder: JSONDecoder + + /// Create a client that communicates with Cloud Hypervisor over the given socket. + /// + /// - Parameters: + /// - socketPath: A `file://` URL whose `.path` points to the socket. + /// - eventLoopGroup: The NIO event loop group to use. When `nil` the client + /// creates and owns its own group. Callers wanting deterministic + /// resource release should pass a group they manage and call + /// ``shutdown()`` themselves; the deinit fallback shuts down + /// asynchronously and may outlive the `Client` instance briefly. + /// - logger: Logger for transport-level diagnostics. + /// - requestTimeout: Per-request deadline. A request that does not + /// complete within this window fails with + /// ``CloudHypervisor/Error/transport(_:)``. Defaults to 30 seconds. + /// - Throws: ``CloudHypervisor/Error/invalidSocketPath(_:)`` when `socketPath` + /// is not a `file://` URL. + public init( + socketPath: URL, + eventLoopGroup: (any EventLoopGroup)? = nil, + logger: Logger = Logger(label: "CloudHypervisor.Client"), + requestTimeout: TimeAmount = .seconds(30) + ) throws { + guard socketPath.isFileURL else { + throw CloudHypervisor.Error.invalidSocketPath(socketPath.absoluteString) + } + if let eventLoopGroup { + self.ownsGroup = false + self.group = eventLoopGroup + } else { + self.ownsGroup = true + self.group = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount) + } + self.http = HTTPOverUDSClient( + socketPath: socketPath.path, + group: self.group, + logger: logger, + requestTimeout: requestTimeout + ) + self.encoder = JSONEncoder() + self.decoder = JSONDecoder() + } + + /// Drain the underlying `AsyncHTTPClient`, and shut down the NIO + /// event-loop group when this client owns it. Idempotent. Prefer + /// calling this explicitly over relying on the deinit fallback — + /// `shutdown()` waits for in-flight I/O to drain. + /// + /// Callers that pass in a shared `eventLoopGroup` MUST call this + /// before tearing down that group. AsyncHTTPClient parks deferred + /// connection-close work on the group's event loops after each + /// response returns; shutting the group down before that work + /// runs trips NIO's "Cannot schedule tasks on an EventLoop that + /// has already shut down" warning (and a forced crash in future + /// NIO releases). + public func shutdown() async throws { + try await http.shutdown() + if ownsGroup { + try await group.shutdownGracefully() + } + } + + deinit { + // Use the async-dispatched shutdown rather than + // `syncShutdownGracefully()`. The sync variant blocks the calling + // thread until every event loop drains, which deadlocks if deinit + // happens to run on one of the group's event loop threads (e.g. + // the last release came from inside a NIO callback). The + // callback-based variant schedules shutdown on its own queue and + // returns immediately — at the cost of giving up any signal that + // shutdown completed. Callers who need that signal should call + // `shutdown()` explicitly before letting the client deinit. + if ownsGroup { + group.shutdownGracefully(queue: .global()) { _ in } + } + } + + // MARK: - Internal request dispatch helpers + // + // Endpoint extensions (A8/A9/A10) call these to build their public API. + // They are internal (not public) because all public surface lives in those + // extensions. + + /// GET `path`, decode the response body as `Response`. + func get(_ path: String) async throws -> Response { + try await sendAndDecode(method: .GET, path: path, body: nil) + } + + /// PUT `path` with no body, discard the response. + func put(_ path: String) async throws { + try await sendVoid(method: .PUT, path: path, body: nil) + } + + /// PUT `path` with a JSON-encoded body, discard the response. + func put(_ path: String, body: Body) async throws { + let data = try encoder.encode(body) + try await sendVoid(method: .PUT, path: path, body: data) + } + + /// PUT `path` with a JSON-encoded body, decode the response as `Response`. + func put( + _ path: String, + body: Body + ) async throws -> Response { + let data = try encoder.encode(body) + return try await sendAndDecode(method: .PUT, path: path, body: data) + } + + // MARK: - Private machinery + + private func sendAndDecode( + method: HTTPMethod, + path: String, + body: Data? + ) async throws -> Response { + let resp = try await http.send(method: method, uri: path, body: body) + guard (200..<300).contains(Int(resp.status.code)) else { + throw CloudHypervisor.Error.http(status: resp.status, body: resp.body) + } + do { + return try decoder.decode(Response.self, from: resp.body) + } catch { + throw CloudHypervisor.Error.decoding(error, body: resp.body) + } + } + + private func sendVoid(method: HTTPMethod, path: String, body: Data?) async throws { + let resp = try await http.send(method: method, uri: path, body: body) + guard (200..<300).contains(Int(resp.status.code)) else { + throw CloudHypervisor.Error.http(status: resp.status, body: resp.body) + } + } + } +} diff --git a/Sources/CloudHypervisor/CloudHypervisor.swift b/Sources/CloudHypervisor/CloudHypervisor.swift new file mode 100644 index 00000000..d7231180 --- /dev/null +++ b/Sources/CloudHypervisor/CloudHypervisor.swift @@ -0,0 +1,17 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +public enum CloudHypervisor {} diff --git a/Sources/CloudHypervisor/Endpoints/Client+Hotplug.swift b/Sources/CloudHypervisor/Endpoints/Client+Hotplug.swift new file mode 100644 index 00000000..4eeb52e9 --- /dev/null +++ b/Sources/CloudHypervisor/Endpoints/Client+Hotplug.swift @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import Foundation + +extension CloudHypervisor.Client { + /// Hotplug a virtio-blk disk device into a running VM. + /// + /// Maps to `PUT /api/v1/vm.add-disk` in the Cloud Hypervisor REST API. + public func vmAddDisk(_ config: CloudHypervisor.DiskConfig) async throws -> CloudHypervisor.PciDeviceInfo { + try await put("/api/v1/vm.add-disk", body: config) + } + + /// Hotplug a virtio-fs filesystem device into a running VM. + /// + /// Maps to `PUT /api/v1/vm.add-fs` in the Cloud Hypervisor REST API. + public func vmAddFs(_ config: CloudHypervisor.FsConfig) async throws -> CloudHypervisor.PciDeviceInfo { + try await put("/api/v1/vm.add-fs", body: config) + } + + /// Hotplug a virtio-net network device into a running VM. + /// + /// Maps to `PUT /api/v1/vm.add-net` in the Cloud Hypervisor REST API. + public func vmAddNet(_ config: CloudHypervisor.NetConfig) async throws -> CloudHypervisor.PciDeviceInfo { + try await put("/api/v1/vm.add-net", body: config) + } + + /// Hotplug a virtio-vsock device into a running VM. + /// + /// Maps to `PUT /api/v1/vm.add-vsock` in the Cloud Hypervisor REST API. + public func vmAddVsock(_ config: CloudHypervisor.VsockConfig) async throws -> CloudHypervisor.PciDeviceInfo { + try await put("/api/v1/vm.add-vsock", body: config) + } + + /// Remove a hotplugged device from a running VM by its identifier. + /// + /// Maps to `PUT /api/v1/vm.remove-device` in the Cloud Hypervisor REST API. + public func vmRemoveDevice(id: String) async throws { + struct Request: Encodable, Sendable { let id: String } + try await put("/api/v1/vm.remove-device", body: Request(id: id)) + } +} diff --git a/Sources/CloudHypervisor/Endpoints/Client+VM.swift b/Sources/CloudHypervisor/Endpoints/Client+VM.swift new file mode 100644 index 00000000..b1f79986 --- /dev/null +++ b/Sources/CloudHypervisor/Endpoints/Client+VM.swift @@ -0,0 +1,61 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import Foundation + +extension CloudHypervisor.Client { + /// Create a VM with the given configuration. + /// + /// Maps to `PUT /api/v1/vm.create` in the Cloud Hypervisor REST API. + public func vmCreate(_ config: CloudHypervisor.VmConfig) async throws { + try await put("/api/v1/vm.create", body: config) + } + + /// Boot the VM (transition from Created → Running). + /// + /// Maps to `PUT /api/v1/vm.boot` in the Cloud Hypervisor REST API. + public func vmBoot() async throws { + try await put("/api/v1/vm.boot") + } + + /// Shut down the VM. + /// + /// Maps to `PUT /api/v1/vm.shutdown` in the Cloud Hypervisor REST API. + public func vmShutdown() async throws { + try await put("/api/v1/vm.shutdown") + } + + /// Retrieve runtime information about the VM. + /// + /// Maps to `GET /api/v1/vm.info` in the Cloud Hypervisor REST API. + public func vmInfo() async throws -> CloudHypervisor.VmInfo { + try await get("/api/v1/vm.info") + } + + /// Pause the running VM. + /// + /// Maps to `PUT /api/v1/vm.pause` in the Cloud Hypervisor REST API. + public func vmPause() async throws { + try await put("/api/v1/vm.pause") + } + + /// Resume a paused VM. + /// + /// Maps to `PUT /api/v1/vm.resume` in the Cloud Hypervisor REST API. + public func vmResume() async throws { + try await put("/api/v1/vm.resume") + } +} diff --git a/Sources/CloudHypervisor/Endpoints/Client+VMM.swift b/Sources/CloudHypervisor/Endpoints/Client+VMM.swift new file mode 100644 index 00000000..fb956639 --- /dev/null +++ b/Sources/CloudHypervisor/Endpoints/Client+VMM.swift @@ -0,0 +1,40 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import Foundation + +extension CloudHypervisor.Client { + /// Ping the Cloud Hypervisor VMM process and return its version information. + /// + /// Maps to `GET /api/v1/vmm.ping` in the Cloud Hypervisor REST API. + public func vmmPing() async throws -> CloudHypervisor.VmmPingResponse { + try await get("/api/v1/vmm.ping") + } + + /// Request the Cloud Hypervisor VMM process to shut down gracefully. + /// + /// Maps to `PUT /api/v1/vmm.shutdown` in the Cloud Hypervisor REST API. + public func vmmShutdown() async throws { + try await put("/api/v1/vmm.shutdown") + } + + /// Retrieve information about the Cloud Hypervisor VMM process. + /// + /// Maps to `GET /api/v1/vmm.info` in the Cloud Hypervisor REST API. + public func vmmInfo() async throws -> CloudHypervisor.VmmInfo { + try await get("/api/v1/vmm.info") + } +} diff --git a/Sources/CloudHypervisor/Errors.swift b/Sources/CloudHypervisor/Errors.swift new file mode 100644 index 00000000..942e1bfe --- /dev/null +++ b/Sources/CloudHypervisor/Errors.swift @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import Foundation +import NIOHTTP1 + +extension CloudHypervisor { + public enum Error: Swift.Error, Sendable { + case transport(any Swift.Error) + case http(status: HTTPResponseStatus, body: Data) + case decoding(any Swift.Error, body: Data) + case invalidSocketPath(String) + } +} diff --git a/Sources/CloudHypervisor/HTTPOverUDS.swift b/Sources/CloudHypervisor/HTTPOverUDS.swift new file mode 100644 index 00000000..96ecf44f --- /dev/null +++ b/Sources/CloudHypervisor/HTTPOverUDS.swift @@ -0,0 +1,202 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import AsyncHTTPClient +import Foundation +import Logging +import NIOConcurrencyHelpers +import NIOCore +import NIOHTTP1 + +// MARK: - HTTPResponse + +/// An HTTP response received from Cloud Hypervisor's REST API. +struct HTTPResponse: Sendable { + let status: HTTPResponseStatus + let headers: HTTPHeaders + let body: Data +} + +// MARK: - HTTPOverUDSClient + +/// A minimal HTTP/1.1 client that speaks over a Unix Domain Socket. Backed +/// by `AsyncHTTPClient` so connection lifecycle, timeout handling, and the +/// head/body/end write race we used to manage manually all live in the +/// library rather than in this file. +/// +/// AHC selects UDS via the `http+unix://` URL scheme (the supplied +/// `URL(httpURLWithSocketPath:uri:)` initializer does the percent-encoding). +/// Each `HTTPOverUDSClient` owns a fresh `HTTPClient` configured with +/// `eventLoopGroupProvider: .shared(group)` so the underlying NIO group is +/// the caller's to shut down — `httpClient.shutdown` only releases the +/// client's own state. +final class HTTPOverUDSClient: Sendable { + private let socketPath: String + private let httpClient: HTTPClient + private let logger: Logger + private let requestTimeout: TimeAmount + // One-shot flag tracking whether shutdown has been initiated, so + // explicit `shutdown()` is idempotent and `deinit` skips its fallback + // when an explicit shutdown already drained the HTTPClient. + private let didShutdown: NIOLockedValueBox + + init( + socketPath: String, + group: any EventLoopGroup, + logger: Logger, + requestTimeout: TimeAmount = .seconds(30) + ) { + self.socketPath = socketPath + self.httpClient = HTTPClient( + eventLoopGroupProvider: .shared(group), + configuration: .init() + ) + self.logger = logger + self.requestTimeout = requestTimeout + self.didShutdown = NIOLockedValueBox(false) + } + + /// Drain the underlying HTTPClient and wait for in-flight I/O to + /// finish. Idempotent — safe to call multiple times. + /// + /// MUST be called before the shared event-loop group is torn down. + /// AsyncHTTPClient leaves deferred connection-cleanup work parked on + /// the group's event loops after a response returns; if the group is + /// shut down first, that deferred work fails to schedule and SwiftNIO + /// prints "Cannot schedule tasks on an EventLoop that has already + /// shut down" (and will upgrade to a forced crash in future NIO + /// releases). + func shutdown() async throws { + let already = didShutdown.withLockedValue { state -> Bool in + if state { return true } + state = true + return false + } + if already { return } + try await httpClient.shutdown() + } + + /// Send an HTTP request and return the response. + /// + /// Translates AHC errors → ``CloudHypervisor/Error/transport(_:)`` so + /// callers see a uniform error type regardless of failure mode. + func send( + method: HTTPMethod, + uri: String, + body: Data?, + headers: HTTPHeaders = [:] + ) async throws -> HTTPResponse { + // AHC handles the percent-encoding. nil only on a path that can't + // be encoded — surface it the same way the public Client init does. + guard let url = URL(httpURLWithSocketPath: socketPath, uri: uri) else { + throw CloudHypervisor.Error.invalidSocketPath(socketPath) + } + + var request = HTTPClientRequest(url: url.absoluteString) + request.method = method + + // Preserve all caller-supplied headers verbatim. + for (name, value) in headers { + request.headers.replaceOrAdd(name: name, value: value) + } + + // `Connection: close` is preserved from the previous transport. CH + // accepts both close and keep-alive, but close is the safer default + // until we have explicit smoke coverage of long-lived per-VM + // keep-alive behavior. Each request goes to a different per-VM UDS + // anyway so there's nothing to pool. + request.headers.replaceOrAdd(name: "Connection", value: "close") + + // Body framing. CH's HTTP parser rejects body-less PUTs unless the + // request carries `Content-Length: 0` instead of falling back to + // chunked transfer encoding. + // + // How AHC actually frames the request is subtle: + // `RequestValidation.setTransportFraming` strips any manually-set + // `Content-Length` and re-derives framing from the body's known + // length. Assigning `.bytes(ByteBuffer())` (rather than leaving + // body nil) sets `bodyLength == .known(0)`, which AHC then frames + // as `Content-Length: 0` for PUT/POST per RFC 7230 §3.3.2. Leaving + // body nil would surface as `bodyLength == .unknown`, and AHC may + // emit chunked framing or no framing at all, which CH rejects. + // The explicit `Content-Length: 0` header set below is documentation + // of intent — AHC removes it before deriving framing — but the + // wire shape is determined by the empty body assignment. + // + // Regression test: ClientTests.bodylessPUTSendsContentLengthZero. + if let body, !body.isEmpty { + if request.headers["Content-Type"].isEmpty { + request.headers.add(name: "Content-Type", value: "application/json") + } + request.body = .bytes(ByteBuffer(bytes: body)) + } else { + request.headers.replaceOrAdd(name: "Content-Length", value: "0") + request.body = .bytes(ByteBuffer()) + } + + let deadline = NIODeadline.now() + requestTimeout + logger.debug("HTTPOverUDSClient: \(method) \(uri) → \(socketPath)") + + do { + let response = try await httpClient.execute( + request, + deadline: deadline, + logger: logger + ) + + // 16 MiB is far larger than any CH response we expect — vm.info, + // the largest, measures in low-KB even for many-disk VMs. The + // cap exists so a wedged server can't OOM us. + // + // Use `readableBytesView` + the Sequence-based Data init rather + // than `Data(buffer: ByteBuffer)`: the latter requires + // `NIOFoundationCompat`, which the Linux musl build doesn't + // pull in via Foundation by default. + let bodyBuffer = try await response.body.collect(upTo: 1 << 24) + let bodyData = Data(bodyBuffer.readableBytesView) + + logger.debug("HTTPOverUDSClient: \(method) \(uri) ← \(response.status.code)") + return HTTPResponse( + status: response.status, + headers: response.headers, + body: bodyData + ) + } catch let error as CloudHypervisor.Error { + throw error + } catch { + throw CloudHypervisor.Error.transport(error) + } + } + + deinit { + // Fire the callback-based shutdown only when `shutdown()` wasn't + // already called. The sync variant would deadlock if deinit + // happened to run on one of the HTTPClient's own event loops + // (commit fe1c95cf); the callback variant returns immediately at + // the cost of any completion signal. If explicit shutdown + // already ran, the HTTPClient is drained and a second call would + // just return `alreadyShutdown` — but it can still try to + // schedule the callback on the (now-dead) event loop, which is + // exactly the failure mode this whole flag guards against. + let already = didShutdown.withLockedValue { state -> Bool in + if state { return true } + state = true + return false + } + guard !already else { return } + httpClient.shutdown { _ in } + } +} diff --git a/Sources/CloudHypervisor/README.md b/Sources/CloudHypervisor/README.md new file mode 100644 index 00000000..9f8d3971 --- /dev/null +++ b/Sources/CloudHypervisor/README.md @@ -0,0 +1,95 @@ +# CloudHypervisor + +A standalone Swift library for driving the [cloud-hypervisor](https://github.com/cloud-hypervisor/cloud-hypervisor) REST API over a Unix domain socket. The package compiles on both macOS and Linux, though `cloud-hypervisor` itself only runs on Linux. + +## Dependencies + +- [swift-nio](https://github.com/apple/swift-nio): `NIOCore`, `NIOPosix`, `NIOHTTP1`, `NIOConcurrencyHelpers` +- [swift-log](https://github.com/apple/swift-log): `Logging` + +There are no transitive dependencies on any other `containerization` library types. + +## Usage + +```swift +import CloudHypervisor + +let client = try CloudHypervisor.Client( + socketPath: URL(filePath: "/tmp/ch-foo/api.sock") +) + +try await client.vmmPing() +try await client.vmCreate(VmConfig(/* ... */)) +try await client.vmBoot() +``` + +### Full example with shared event loop group + +```swift +import CloudHypervisor +import NIOPosix + +let group = MultiThreadedEventLoopGroup(numberOfThreads: 2) +defer { try? group.syncShutdownGracefully() } + +let client = try CloudHypervisor.Client( + socketPath: URL(filePath: "/run/ch/vm0.sock"), + eventLoopGroup: group +) + +let info = try await client.vmInfo() +print(info.state) +``` + +## Supported Endpoints (v1) + +### VMM + +- `vmmPing() -> VmmPingResponse` — verify the VMM process is alive +- `vmmShutdown()` — shut down the VMM process +- `vmmInfo() -> VmmInfo` — query VMM-level metadata + +### VM Lifecycle + +- `vmCreate(_ config: VmConfig)` — define a new VM +- `vmBoot()` — start the VM +- `vmShutdown()` — gracefully shut down the VM +- `vmInfo() -> VmInfo` — query VM state and configuration +- `vmPause()` — pause a running VM +- `vmResume()` — resume a paused VM + +### Hotplug + +- `vmAddDisk(_ config: DiskConfig) -> PciDeviceInfo` — hot-add a block device +- `vmAddFs(_ config: FsConfig) -> PciDeviceInfo` — hot-add a virtio-fs share +- `vmAddNet(_ config: NetConfig) -> PciDeviceInfo` — hot-add a network device +- `vmAddVsock(_ config: VsockConfig) -> PciDeviceInfo` — hot-add a vsock device +- `vmRemoveDevice(id: String)` — hot-remove a device by ID + +## Minimum Supported cloud-hypervisor Version + +The package targets the `/api/v1/` REST namespace. It is tested against **cloud-hypervisor v40** and later. Earlier releases may be missing endpoints or use incompatible JSON schemas. + +## Error Model + +All failures are reported through `CloudHypervisor.Error`: + +- `.transport(any Swift.Error)` — a network or NIO-level failure before the HTTP response was received +- `.http(status:body:)` — the server responded with a non-2xx HTTP status; `body` contains the raw response bytes +- `.decoding(any Swift.Error, body:)` — the response had a 2xx status but JSON decoding failed; `body` is the raw bytes for diagnostics +- `.invalidSocketPath(String)` — the URL passed to `Client.init` is not a `file://` URL + +Non-2xx responses always produce `.http`, never a decode error, so callers can distinguish protocol-level errors from unexpected payloads. + +## Concurrency + +`Client` is `Sendable` and all endpoint methods are `async throws`. Each call opens a fresh TCP-over-UDS connection to cloud-hypervisor and closes it when the response is complete. + +By default the client creates and owns a `MultiThreadedEventLoopGroup` and shuts it down in `deinit`. If you already have an event loop group (e.g. from NIO or another library), pass it via the `eventLoopGroup:` parameter — in that case the client does **not** shut the group down on `deinit`, leaving lifecycle management to the caller. + +## Non-Goals (v1) + +- Not a high-level VM orchestration layer — for that, use the `Containerization` library. +- Not exhaustive coverage of cloud-hypervisor's full OpenAPI surface — only the 14 endpoints listed above are implemented; additional endpoints can be added incrementally. +- No connection pooling — a fresh connection is opened per request, which is appropriate for low-volume control-plane use. +- No streaming response bodies — response payloads are buffered in memory before decoding. diff --git a/Sources/CloudHypervisor/Types/DeviceConfigs.swift b/Sources/CloudHypervisor/Types/DeviceConfigs.swift new file mode 100644 index 00000000..e54877d5 --- /dev/null +++ b/Sources/CloudHypervisor/Types/DeviceConfigs.swift @@ -0,0 +1,242 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +extension CloudHypervisor { + // MARK: - ImageType + + /// On-disk format of a `DiskConfig`'s backing file. When omitted on the + /// wire, cloud-hypervisor defaults to `Unknown` and rejects writes to + /// the disk (logging "Attempting to write to sector 0 on a disk without + /// specifying image_type"); always set this explicitly. + /// + /// Raw values match the Rust `block::ImageType` enum variants used in + /// CH's JSON serialization (PascalCase) — these differ from the + /// lowercase tokens accepted on the `--disk` CLI flag. + public enum ImageType: String, Sendable, Codable, Equatable { + case raw = "Raw" + case qcow2 = "Qcow2" + case fixedVhd = "FixedVhd" + case vhdx = "Vhdx" + case unknown = "Unknown" + } + + // MARK: - DiskConfig + + /// Virtio-blk disk configuration. + /// + /// Maps to `DiskConfig` in the Cloud Hypervisor OpenAPI spec. + public struct DiskConfig: Sendable, Codable, Equatable { + /// Path to the disk image file. + public var path: String + /// Open the disk in read-only mode. + public var readonly: Bool? + /// Use O_DIRECT for disk I/O. + public var direct: Bool? + /// Enable IOMMU for this device. + public var iommu: Bool? + /// Optional device identifier. + public var id: String? + /// PCI segment to attach the device to. + public var pciSegment: UInt16? + /// On-disk format of the backing file. + public var imageType: ImageType? + + public init( + path: String, + readonly: Bool? = nil, + direct: Bool? = nil, + iommu: Bool? = nil, + id: String? = nil, + pciSegment: UInt16? = nil, + imageType: ImageType? = nil + ) { + self.path = path + self.readonly = readonly + self.direct = direct + self.iommu = iommu + self.id = id + self.pciSegment = pciSegment + self.imageType = imageType + } + + enum CodingKeys: String, CodingKey { + case path + case readonly + case direct + case iommu + case id + case pciSegment = "pci_segment" + case imageType = "image_type" + } + } + + // MARK: - NetConfig + + /// Virtio-net network device configuration. + /// + /// Maps to `NetConfig` in the Cloud Hypervisor OpenAPI spec. + public struct NetConfig: Sendable, Codable, Equatable { + /// TAP device name on the host. + public var tap: String? + /// IPv4 address for the device. + public var ip: String? + /// IPv4 subnet mask. + public var mask: String? + /// MAC address for the device. + public var mac: String? + /// Maximum transmission unit. + public var mtu: Int? + /// Number of virtio queues. + public var numQueues: Int? + /// Size of each virtio queue. + public var queueSize: Int? + /// Optional device identifier. + public var id: String? + + public init( + tap: String? = nil, + ip: String? = nil, + mask: String? = nil, + mac: String? = nil, + mtu: Int? = nil, + numQueues: Int? = nil, + queueSize: Int? = nil, + id: String? = nil + ) { + self.tap = tap + self.ip = ip + self.mask = mask + self.mac = mac + self.mtu = mtu + self.numQueues = numQueues + self.queueSize = queueSize + self.id = id + } + + enum CodingKeys: String, CodingKey { + case tap + case ip + case mask + case mac + case mtu + case numQueues = "num_queues" + case queueSize = "queue_size" + case id + } + } + + // MARK: - FsConfig + + /// Virtio-fs filesystem device configuration. + /// + /// Maps to `FsConfig` in the Cloud Hypervisor OpenAPI spec. + public struct FsConfig: Sendable, Codable, Equatable { + /// Filesystem tag used by the guest to mount. + public var tag: String + /// Path to the virtiofsd Unix socket. + public var socket: String + /// Number of virtio queues. + public var numQueues: Int? + /// Size of each virtio queue. + public var queueSize: Int? + /// Optional device identifier. + public var id: String? + /// PCI segment to attach the device to. + public var pciSegment: UInt16? + + public init( + tag: String, + socket: String, + numQueues: Int? = nil, + queueSize: Int? = nil, + id: String? = nil, + pciSegment: UInt16? = nil + ) { + self.tag = tag + self.socket = socket + self.numQueues = numQueues + self.queueSize = queueSize + self.id = id + self.pciSegment = pciSegment + } + + enum CodingKeys: String, CodingKey { + case tag + case socket + case numQueues = "num_queues" + case queueSize = "queue_size" + case id + case pciSegment = "pci_segment" + } + } + + // MARK: - VsockConfig + + /// Virtio-vsock configuration. + /// + /// Maps to `VsockConfig` in the Cloud Hypervisor OpenAPI spec. + public struct VsockConfig: Sendable, Codable, Equatable { + /// Context ID (CID) for the vsock device. + public var cid: UInt32 + /// Path to the vsock Unix socket on the host. + public var socket: String + /// Enable IOMMU for this device. + public var iommu: Bool? + /// Optional device identifier. + public var id: String? + + public init( + cid: UInt32, + socket: String, + iommu: Bool? = nil, + id: String? = nil + ) { + self.cid = cid + self.socket = socket + self.iommu = iommu + self.id = id + } + + enum CodingKeys: String, CodingKey { + case cid + case socket + case iommu + case id + } + } + + // MARK: - PciDeviceInfo + + /// PCI device identifier returned by Cloud Hypervisor after device add. + /// + /// Maps to `PciDeviceInfo` in the Cloud Hypervisor OpenAPI spec. + public struct PciDeviceInfo: Sendable, Codable, Equatable { + /// Device identifier string. + public var id: String + /// PCI Bus:Device.Function address (e.g. `"0000:00:03.0"`). + public var bdf: String + + public init(id: String, bdf: String) { + self.id = id + self.bdf = bdf + } + + enum CodingKeys: String, CodingKey { + case id + case bdf + } + } +} diff --git a/Sources/CloudHypervisor/Types/VmConfig.swift b/Sources/CloudHypervisor/Types/VmConfig.swift new file mode 100644 index 00000000..429b7db9 --- /dev/null +++ b/Sources/CloudHypervisor/Types/VmConfig.swift @@ -0,0 +1,187 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +extension CloudHypervisor { + // MARK: - VmConfig + + /// Top-level VM boot / create payload. + /// + /// Maps to `VmConfig` in the Cloud Hypervisor OpenAPI spec. + public struct VmConfig: Sendable, Codable, Equatable { + public var cpus: CpusConfig + public var memory: MemoryConfig + public var payload: PayloadConfig + public var disks: [DiskConfig]? + public var net: [NetConfig]? + public var fs: [FsConfig]? + public var vsock: VsockConfig? + public var console: ConsoleConfig + public var serial: ConsoleConfig + + public init( + cpus: CpusConfig, + memory: MemoryConfig, + payload: PayloadConfig, + disks: [DiskConfig]? = nil, + net: [NetConfig]? = nil, + fs: [FsConfig]? = nil, + vsock: VsockConfig? = nil, + console: ConsoleConfig, + serial: ConsoleConfig + ) { + self.cpus = cpus + self.memory = memory + self.payload = payload + self.disks = disks + self.net = net + self.fs = fs + self.vsock = vsock + self.console = console + self.serial = serial + } + + enum CodingKeys: String, CodingKey { + case cpus + case memory + case payload + case disks + case net + case fs + case vsock + case console + case serial + } + } + + // MARK: - CpusConfig + + /// CPU configuration for a VM. + /// + /// Maps to `CpusConfig` in the Cloud Hypervisor OpenAPI spec. + public struct CpusConfig: Sendable, Codable, Equatable { + /// Number of vCPUs to boot with. + public var bootVcpus: Int + /// Maximum number of vCPUs (for hotplug). + public var maxVcpus: Int + + public init(bootVcpus: Int, maxVcpus: Int) { + self.bootVcpus = bootVcpus + self.maxVcpus = maxVcpus + } + + enum CodingKeys: String, CodingKey { + case bootVcpus = "boot_vcpus" + case maxVcpus = "max_vcpus" + } + } + + // MARK: - MemoryConfig + + /// Memory configuration for a VM. + /// + /// Maps to `MemoryConfig` in the Cloud Hypervisor OpenAPI spec. + public struct MemoryConfig: Sendable, Codable, Equatable { + /// RAM size in bytes. + public var size: UInt64 + /// Hotplug memory size in bytes. + public var hotplugSize: UInt64? + /// Enable memory merging (KSM). + public var mergeable: Bool? + /// Use a shared memory mapping (`MAP_SHARED`). Required when any + /// vhost-user device (e.g. virtio-fs / virtiofsd) is attached — + /// CH otherwise rejects `vm.boot` with "Using vhost-user requires + /// using shared memory or huge pages". + public var shared: Bool? + + public init(size: UInt64, hotplugSize: UInt64? = nil, mergeable: Bool? = nil, shared: Bool? = nil) { + self.size = size + self.hotplugSize = hotplugSize + self.mergeable = mergeable + self.shared = shared + } + + enum CodingKeys: String, CodingKey { + case size + case hotplugSize = "hotplug_size" + case mergeable + case shared + } + } + + // MARK: - PayloadConfig + + /// Kernel / initramfs / cmdline payload for a VM. + /// + /// Maps to `PayloadConfig` in the Cloud Hypervisor OpenAPI spec. + public struct PayloadConfig: Sendable, Codable, Equatable { + /// Path to the uncompressed kernel image (vmlinux). + public var kernel: String + /// Optional initramfs path. + public var initramfs: String? + /// Optional kernel command line. + public var cmdline: String? + + public init(kernel: String, initramfs: String? = nil, cmdline: String? = nil) { + self.kernel = kernel + self.initramfs = initramfs + self.cmdline = cmdline + } + + enum CodingKeys: String, CodingKey { + case kernel + case initramfs + case cmdline + } + } + + // MARK: - ConsoleConfig + + /// Console / serial device configuration. + /// + /// Maps to `ConsoleConfig` in the Cloud Hypervisor OpenAPI spec. + public struct ConsoleConfig: Sendable, Codable, Equatable { + /// Console I/O mode. + /// + /// CH's OpenAPI spec uses these capitalized strings literally. + public enum Mode: String, Codable, Sendable { + case Off + case Pty + case Tty + case File + case Socket + case Null + } + + public var mode: Mode + /// Path to the output file when `mode == .File`. + public var file: String? + /// Path to the Unix socket when `mode == .Socket`. + public var socket: String? + + public init(mode: Mode, file: String? = nil, socket: String? = nil) { + self.mode = mode + self.file = file + self.socket = socket + } + + enum CodingKeys: String, CodingKey { + case mode + case file + case socket + } + } + +} diff --git a/Sources/CloudHypervisor/Types/VmInfo.swift b/Sources/CloudHypervisor/Types/VmInfo.swift new file mode 100644 index 00000000..f37f63e2 --- /dev/null +++ b/Sources/CloudHypervisor/Types/VmInfo.swift @@ -0,0 +1,122 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +extension CloudHypervisor { + // MARK: - VmState + + /// Lifecycle state of a Cloud Hypervisor VM. + /// + /// Maps to `VmState` in the Cloud Hypervisor OpenAPI spec. + /// The raw values match CH's literal strings exactly (capitalized). + public enum VmState: String, Sendable, Codable, Equatable { + case Created + case Running + case Shutdown + case Paused + case BreakPoint + } + + // MARK: - VmInfo + + /// Response body for `GET /vm.info`. + /// + /// Maps to `VmInfo` in the Cloud Hypervisor OpenAPI spec. + /// + /// Note: the `device_tree` map (`[String: VmInfoDeviceNode]`) from the + /// upstream OpenAPI spec is omitted in this v1 implementation — no current + /// endpoint consumers require it. Add when needed. + public struct VmInfo: Sendable, Codable, Equatable { + /// The boot configuration used for this VM. + public var config: VmConfig + /// Current lifecycle state. + public var state: VmState + /// Actual memory size in bytes as reported by the VMM, if available. + public var memoryActualSize: UInt64? + + public init(config: VmConfig, state: VmState, memoryActualSize: UInt64? = nil) { + self.config = config + self.state = state + self.memoryActualSize = memoryActualSize + } + + enum CodingKeys: String, CodingKey { + case config + case state + case memoryActualSize = "memory_actual_size" + } + } + + // MARK: - VmmPingResponse + + /// Response body for `GET /vmm.ping`. + /// + /// Maps to `VmmPingResponse` in the Cloud Hypervisor OpenAPI spec. + public struct VmmPingResponse: Sendable, Codable, Equatable { + /// Cloud Hypervisor version string (e.g. `"v40.0"`). + public var version: String + /// PID of the VMM process, if provided. + public var pid: Int? + /// List of compiled-in feature flags, if provided. + public var features: [String]? + /// Build-time version string, if provided. + public var buildVersion: String? + + public init(version: String, pid: Int? = nil, features: [String]? = nil, buildVersion: String? = nil) { + self.version = version + self.pid = pid + self.features = features + self.buildVersion = buildVersion + } + + enum CodingKeys: String, CodingKey { + case version + case pid + case features + case buildVersion = "build_version" + } + } + + // MARK: - VmmInfo + + /// Response body for `GET /vmm.info`. + /// + /// Maps to a subset of the `VmmInfo` schema in the Cloud Hypervisor OpenAPI + /// spec. Only the fields needed by v1 consumers are included (YAGNI). + public struct VmmInfo: Sendable, Codable, Equatable { + /// Cloud Hypervisor version string (e.g. `"v40.0"`). + public var version: String + /// PID of the VMM process, if provided. + public var pid: Int? + /// Build-time version string, if provided. + public var buildVersion: String? + /// The currently-running VM's boot configuration, if a VM exists. + public var config: VmConfig? + + public init(version: String, pid: Int? = nil, buildVersion: String? = nil, config: VmConfig? = nil) { + self.version = version + self.pid = pid + self.buildVersion = buildVersion + self.config = config + } + + enum CodingKeys: String, CodingKey { + case version + case pid + case buildVersion = "build_version" + case config + } + } +} diff --git a/Sources/Containerization/BridgeManager.swift b/Sources/Containerization/BridgeManager.swift new file mode 100644 index 00000000..27250374 --- /dev/null +++ b/Sources/Containerization/BridgeManager.swift @@ -0,0 +1,359 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +#if os(Linux) +import ContainerizationError +import ContainerizationExtras +import ContainerizationNetlink +import Foundation +import Logging + +#if canImport(Musl) +import Musl +#elseif canImport(Glibc) +import Glibc +#endif + +/// Linux-only host plumbing for a container bridge network. +/// +/// `create()` is idempotent: it brings the bridge to a known state (created +/// if absent, configured if already present), records what it changed in +/// `/run/containerization/bridge-.state`, and `delete()` reverses +/// only what was recorded. +/// +/// **NAT is opt-in.** With the default (`enableNAT: false`) `create()` only +/// brings up the bridge link and assigns the gateway IP — it does NOT touch +/// `ip_forward`, does NOT program iptables, and does NOT pick an egress +/// interface. Containers attached to the bridge can talk to each other and +/// to the host, but not to the outside world. Pass `enableNAT: true` to +/// also enable IPv4 forwarding and program a scoped MASQUERADE/FORWARD +/// pair (`-i -o `); the bridge becomes a NAT exit and the +/// host now routes guest traffic. +/// +/// Concurrent `create()`/`delete()` calls (e.g. from two `cctl run` +/// processes) serialize via `flock(LOCK_EX)` on +/// `/run/containerization/bridge-.lock`. +/// +/// Requires root (or `CAP_NET_ADMIN` plus, when NAT is enabled, the ability +/// to write `/proc/sys/...` and invoke `iptables`). +public struct BridgeManager: Sendable { + public let name: String + public let subnet: CIDRv4 + public let gateway: IPv4Address + public let mtu: UInt32 + public let egressInterface: String? + public let enableNAT: Bool + private let log: Logger + + /// - Parameters: + /// - name: bridge interface name (e.g. `cz0`). + /// - subnet: subnet to assign on the bridge. + /// - gateway: host-side IP on the bridge. Defaults to `subnet.gateway` + /// (= `subnet.lower + 1`). + /// - mtu: bridge MTU. Default 1500. + /// - egressInterface: explicit egress iface for MASQUERADE. nil = + /// auto-detect via `/proc/net/route` at `create()` time. Only used + /// when `enableNAT` is true. + /// - enableNAT: when true, program iptables MASQUERADE+FORWARD and + /// enable `net.ipv4.ip_forward`. Default false — the bridge is + /// created without external connectivity, leaving host firewall + /// policy untouched. + /// - logger: optional logger. Defaults to a `bridge`-labeled logger. + public init( + name: String, + subnet: CIDRv4, + gateway: IPv4Address? = nil, + mtu: UInt32 = 1500, + egressInterface: String? = nil, + enableNAT: Bool = false, + logger: Logger? = nil + ) { + self.name = Self.validateInterfaceName(name) + self.subnet = subnet + self.gateway = gateway ?? subnet.gateway + self.mtu = mtu + self.egressInterface = egressInterface.map(Self.validateInterfaceName) + self.enableNAT = enableNAT + self.log = logger ?? Logger(label: "com.apple.containerization.bridge") + } + + /// Reject obviously-bogus interface names before they hit netlink or + /// `iptables`. This is a defense-in-depth check; the kernel and + /// `iptables` themselves will also reject pathological inputs, but doing + /// it here surfaces the error in a callable Swift API rather than as a + /// netlink rc or iptables exit. Asserts (rather than throws) — these + /// constraints are static, so a violation is a programming error. + private static func validateInterfaceName(_ name: String) -> String { + // IFNAMSIZ on Linux is 16 (15 usable + NUL). iptables itself caps + // at 15. Names with `/`, whitespace, or NUL are kernel-rejected. + precondition(!name.isEmpty, "interface name must be non-empty") + precondition(name.utf8.count <= 15, "interface name '\(name)' exceeds IFNAMSIZ-1 (15)") + precondition( + !name.contains(where: { $0.isWhitespace || $0 == "/" || $0 == "\0" || $0 == ":" }), + "interface name '\(name)' contains invalid characters" + ) + return name + } + + /// Idempotent create. + public func create() throws { + try Self.ensureStateDirectory() + let lock = try FileLock(path: Self.lockPath(for: name)) + try lock.withExclusive { + try createLocked() + } + } + + /// Idempotent delete. No-op when the bridge does not exist. + public func delete() throws { + try Self.ensureStateDirectory() + let lock = try FileLock(path: Self.lockPath(for: name)) + try lock.withExclusive { + try deleteLocked() + } + } + + private func createLocked() throws { + let session = NetlinkSession(socket: try DefaultNetlinkSocket(), log: log) + let stateURL = URL(fileURLWithPath: Self.statePath(for: name)) + + // Preserve `prevIpForward` across re-runs: a second NAT-enabled + // create() call would otherwise read the value the FIRST run left + // behind ("1") and clobber the original prior state, so delete() + // couldn't restore. + let priorState: BridgeState? = (try? Data(contentsOf: stateURL)) + .flatMap { try? BridgeState.decode($0) } + + // 1. Bridge link. + do { + try session.linkAddBridge(name: name) + log.info("created bridge \(name)") + } catch { + // EEXIST is fine; treat any error as "maybe it exists" and probe. + // `linkGet` throws ENODEV when the iface is absent (rather than + // returning an empty array), so coalesce both shapes to "absent". + let existing = (try? session.linkGet(interface: name)) ?? [] + if existing.isEmpty { + throw ContainerizationError( + .internalError, + message: "linkAddBridge \(name) failed and bridge does not exist: \(error)" + ) + } + log.debug("bridge \(name) already exists") + } + + // 2. Address (gateway/prefix) on the bridge. + let cidr = try CIDRv4(gateway, prefix: subnet.prefix) + do { + try session.addressAdd(interface: name, ipv4Address: cidr) + } catch { + // EEXIST tolerated; netlink layer doesn't expose errno cleanly, + // so log and continue. linkSet/up below will fail visibly if the + // bridge state is actually broken. + log.debug("addressAdd \(cidr) on \(name) returned \(error) (likely already set)") + } + + // 3. Up + MTU. + try session.linkSet(interface: name, up: true, mtu: mtu) + + // NAT is opt-in but sticky: once enabled by a previous create(), + // subsequent create() calls without --enable-nat leave the existing + // rules and ip_forward state in place. Otherwise `cctl run` + // (defaults to NAT off) called after `cctl bridge create + // --enable-nat` would silently disable the NAT the user explicitly + // turned on. delete() always reverses whatever the state file + // records. + let effectiveNAT = enableNAT || (priorState?.natEnabled ?? false) + guard effectiveNAT else { + let state = BridgeState(natEnabled: false) + try state.encode().write(to: stateURL) + log.info("bridge \(name) ready (subnet \(subnet), NAT disabled)") + return + } + + // 4. ip_forward: read what's currently on the host, decide what to + // record. If we already have a NAT-enabled state file from a prior + // create(), keep its `prevIpForward` (it's the *original* prior + // value); otherwise record what we just read. + let currentIpForward = (try? Self.readSysctl("net/ipv4/ip_forward")) ?? "0" + let prevIpForward = (priorState?.natEnabled == true ? priorState?.prevIpForward : nil) ?? currentIpForward + if currentIpForward != "1" { + try Self.writeSysctl("net/ipv4/ip_forward", value: "1") + } + + // 5. Egress iface — explicit override or auto-detect. + let egress: String + if let explicit = egressInterface { + egress = explicit + } else if let detected = HostDefaultRoute.currentEgress() { + egress = detected + } else { + throw ContainerizationError( + .invalidArgument, + message: "no default route on host; pass egressInterface explicitly" + ) + } + + // 6. Record state BEFORE iptables. If a later iptables -A fails, + // delete() still has authority to clean up partial rules; if we + // deferred the write until after, a mid-failure would orphan rules + // with no record. + let state = BridgeState( + natEnabled: true, + prevIpForward: prevIpForward, + egressInterface: egress + ) + try state.encode().write(to: stateURL) + + // 7. iptables rules — idempotent. The FORWARD rule is scoped to + // `-i -o ` so the host doesn't become an + // unrestricted router for guest traffic across every host iface + // (e.g. a VPN or a sibling bridge). + try IptablesRules.ensure( + table: "nat", + args: [ + "POSTROUTING", "-s", subnet.description, "!", "-o", name, "-j", "MASQUERADE", + ]) + try IptablesRules.ensure(args: [ + "FORWARD", "-i", name, "-o", egress, "-j", "ACCEPT", + ]) + try IptablesRules.ensure(args: [ + "FORWARD", "-i", egress, "-o", name, "-m", "conntrack", "--ctstate", "RELATED,ESTABLISHED", "-j", "ACCEPT", + ]) + + log.info("bridge \(name) ready (subnet \(subnet), egress \(egress), NAT enabled)") + } + + private func deleteLocked() throws { + let stateURL = URL(fileURLWithPath: Self.statePath(for: name)) + let state: BridgeState? = (try? Data(contentsOf: stateURL)) + .flatMap { try? BridgeState.decode($0) } + + // 1. iptables — only if a prior create() with NAT enabled left state + // we own. The rules are keyed off subnet, bridge name, and the + // recorded egress iface, so removal is precise even when the + // host has rules from other tools. + if let state, state.natEnabled, let egress = state.egressInterface { + log.debug("removing iptables rules for bridge \(name) (egress \(egress))") + IptablesRules.remove( + table: "nat", + args: [ + "POSTROUTING", "-s", subnet.description, "!", "-o", name, "-j", "MASQUERADE", + ]) + IptablesRules.remove(args: [ + "FORWARD", "-i", name, "-o", egress, "-j", "ACCEPT", + ]) + IptablesRules.remove(args: [ + "FORWARD", "-i", egress, "-o", name, "-m", "conntrack", "--ctstate", "RELATED,ESTABLISHED", "-j", "ACCEPT", + ]) + } + + // 2. Bridge link. + let session = NetlinkSession(socket: try DefaultNetlinkSocket(), log: log) + // Refuse to delete anything that isn't actually a bridge — the + // kernel exposes `/sys/class/net//bridge` only for links of + // kind=bridge, so its presence is an authoritative kind check + // without parsing IFLA_LINKINFO. This guards against `cctl bridge + // delete --name eth0` (or docker0, etc.) taking down host links. + let sysfsBridge = "/sys/class/net/\(name)/bridge" + let isBridge = FileManager.default.fileExists(atPath: sysfsBridge) + let exists = !((try? session.linkGet(interface: name)) ?? []).isEmpty + if exists && !isBridge { + throw ContainerizationError( + .invalidArgument, + message: "refusing to delete \(name): exists but is not a bridge interface" + ) + } + do { + try session.linkDel(name: name) + log.info("deleted bridge \(name)") + } catch { + // ENODEV-like: nothing to do. + log.debug("linkDel \(name) returned \(error) (likely already absent)") + } + + // 3. Restore ip_forward only if create()-with-NAT set it from 0. + if state?.natEnabled == true, state?.prevIpForward == "0" { + try? Self.writeSysctl("net/ipv4/ip_forward", value: "0") + } + + // 4. Remove state file. + try? FileManager.default.removeItem(at: stateURL) + } + + // MARK: - Paths / sysctl helpers + + private static let stateDir = "/run/containerization" + + private static func statePath(for name: String) -> String { + "\(stateDir)/bridge-\(name).state" + } + + private static func lockPath(for name: String) -> String { + "\(stateDir)/bridge-\(name).lock" + } + + private static func ensureStateDirectory() throws { + try FileManager.default.createDirectory( + atPath: stateDir, + withIntermediateDirectories: true, + attributes: [.posixPermissions: 0o755] + ) + } + + private static func readSysctl(_ path: String) throws -> String { + let url = URL(fileURLWithPath: "/proc/sys/\(path)") + return try String(contentsOf: url, encoding: .utf8) + .trimmingCharacters(in: .whitespacesAndNewlines) + } + + private static func writeSysctl(_ path: String, value: String) throws { + let url = URL(fileURLWithPath: "/proc/sys/\(path)") + try Data((value + "\n").utf8).write(to: url) + } +} + +/// `flock(2)` wrapper. Held for the duration of a closure. +struct FileLock { + let fd: Int32 + + init(path: String) throws { + let f = open(path, O_RDWR | O_CREAT | O_CLOEXEC, 0o600) + guard f >= 0 else { + throw ContainerizationError( + .internalError, + message: "open \(path) failed: errno=\(errno)" + ) + } + self.fd = f + } + + func withExclusive(_ body: () throws -> T) throws -> T { + guard flock(fd, LOCK_EX) == 0 else { + close(fd) + throw ContainerizationError( + .internalError, + message: "flock LOCK_EX failed: errno=\(errno)" + ) + } + defer { + _ = flock(fd, LOCK_UN) + close(fd) + } + return try body() + } +} +#endif diff --git a/Sources/Containerization/BridgeStateFile.swift b/Sources/Containerization/BridgeStateFile.swift new file mode 100644 index 00000000..34dbd09e --- /dev/null +++ b/Sources/Containerization/BridgeStateFile.swift @@ -0,0 +1,75 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import Foundation + +/// On-disk record of state `BridgeManager.create()` modified, used by +/// `delete()` to restore the host. Stored at +/// `/run/containerization/bridge-.state` (tmpfs — gone after host +/// reboot, which is fine because reboot already clears `ip_forward` and +/// the bridge link itself). +struct BridgeState: Codable, Equatable { + /// Whether `create()` programmed NAT (iptables MASQUERADE/FORWARD + + /// `ip_forward`). When `false`, the only thing `create()` did was bring + /// the bridge up — `delete()` only needs to remove the link, not roll + /// back NAT. State files predating this field are decoded as + /// `natEnabled = true` for back-compat. + let natEnabled: Bool + + /// Value of `/proc/sys/net/ipv4/ip_forward` read at the *first* + /// `create()` call. Preserved across re-runs so `delete()` can restore + /// the host's true original value. Only set when `natEnabled`. + let prevIpForward: String? + + /// Egress interface that `create()` used in the iptables rules — passed + /// explicitly by the caller, or auto-detected from `/proc/net/route`. + /// Only set when `natEnabled`. Recorded for debug / observability and + /// to scope the FORWARD rule's `-o` clause; rule removal is keyed off + /// subnet, bridge name, and egress. + let egressInterface: String? + + init(natEnabled: Bool, prevIpForward: String? = nil, egressInterface: String? = nil) { + self.natEnabled = natEnabled + self.prevIpForward = prevIpForward + self.egressInterface = egressInterface + } + + enum CodingKeys: String, CodingKey { + case natEnabled + case prevIpForward + case egressInterface + } + + init(from decoder: Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + // Default natEnabled to true when missing so files written by older + // versions (which always programmed NAT) still describe themselves + // accurately — delete() will roll back ip_forward / iptables. + self.natEnabled = try container.decodeIfPresent(Bool.self, forKey: .natEnabled) ?? true + self.prevIpForward = try container.decodeIfPresent(String.self, forKey: .prevIpForward) + self.egressInterface = try container.decodeIfPresent(String.self, forKey: .egressInterface) + } + + func encode() throws -> Data { + let encoder = JSONEncoder() + encoder.outputFormatting = [.prettyPrinted, .sortedKeys] + return try encoder.encode(self) + } + + static func decode(_ data: Data) throws -> BridgeState { + try JSONDecoder().decode(BridgeState.self, from: data) + } +} diff --git a/Sources/Containerization/CHHotplugProvider.swift b/Sources/Containerization/CHHotplugProvider.swift new file mode 100644 index 00000000..9036da50 --- /dev/null +++ b/Sources/Containerization/CHHotplugProvider.swift @@ -0,0 +1,422 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +#if os(Linux) +import CloudHypervisor +import ContainerizationError +import ContainerizationExtras +import Foundation +import Logging +import NIOHTTP1 +import Synchronization + +/// Hotplug provider for the cloud-hypervisor backend. +/// +/// Handles both block (`vm.add-disk`) and virtiofs (`vm.add-fs`, with one +/// `virtiofsd` per unique source-hash tag) hotplug, plus the matching +/// `vm.remove-device` teardown. Owns the per-VM mount registry so +/// `CHVirtualMachineInstance.mounts` can forward to it. +final class CHHotplugProvider: HotplugProvider { + struct HotplugRecord: Sendable { + let chDeviceId: String + let kind: Kind + + enum Kind: Sendable { + case block(letter: Character) + case virtiofs(tag: String) + } + } + + struct VirtiofsdTagState: Sendable { + var process: VirtiofsdProcess + var refcount: Int + var chDeviceId: String + } + + private let client: CloudHypervisor.Client + private let workDir: URL + private let virtiofsdBinaryOverride: URL? + private let allocator: any AddressAllocator + private let _mounts: Mutex<[String: [AttachedFilesystem]]> + private let _records: Mutex<[String: [HotplugRecord]]> + private let _tags: Mutex<[String: VirtiofsdTagState]> + /// Serializes per-tag virtiofsd spawn so a concurrent hotplug for the + /// same tag can't race the existence-check / process-registration window + /// (TOCTOU → orphaned virtiofsd). Held across awaits, so it must be an + /// `AsyncLock` rather than the sync `Mutex` that protects `_tags`. + private let spawnLock: AsyncLock + private let logger: Logger? + + init( + client: CloudHypervisor.Client, + workDir: URL, + virtiofsdBinary: URL?, + allocator: any AddressAllocator, + initialMounts: [String: [AttachedFilesystem]], + logger: Logger? + ) { + self.client = client + self.workDir = workDir + self.virtiofsdBinaryOverride = virtiofsdBinary + self.allocator = allocator + self._mounts = Mutex(initialMounts) + self._records = Mutex([:]) + self._tags = Mutex([:]) + self.spawnLock = AsyncLock() + self.logger = logger + } + + // MARK: - Read accessors + + var mounts: [String: [AttachedFilesystem]] { + _mounts.withLock { $0 } + } + + func withMountRegistry( + _ body: (inout sending [String: [AttachedFilesystem]]) throws -> sending T + ) rethrows -> T { + try _mounts.withLock(body) + } + + // MARK: - HotplugProvider conformance + + func hotplug(_ block: Mount, id: String) async throws -> AttachedFilesystem { + guard case .virtioblk = block.runtimeOptions else { + throw ContainerizationError(.invalidArgument, message: "hotplug requires a virtio-blk mount") + } + + let letter = try allocator.allocate() + let chId = "blk-\(id)-\(letter)" + let disk = CloudHypervisor.DiskConfig( + path: block.source, + readonly: block.options.contains("ro"), + id: chId, + imageType: .raw + ) + + let pci: CloudHypervisor.PciDeviceInfo + do { + pci = try await chCall { try await self.client.vmAddDisk(disk) } + } catch { + try? allocator.release(letter) + throw error + } + + let attached = AttachedFilesystem( + type: block.type, + source: "/dev/vd\(letter)", + destination: block.destination, + options: block.options + ) + + _records.withLock { + $0[id, default: []].append(HotplugRecord(chDeviceId: pci.id, kind: .block(letter: letter))) + } + return attached + } + + func registerMounts(id: String, rootfs: AttachedFilesystem, additionalMounts: [Mount]) throws { + var attached: [AttachedFilesystem] = [rootfs] + for mount in additionalMounts { + attached.append(try AttachedFilesystem(mount: mount, allocator: allocator)) + } + _mounts.withLock { + $0[id, default: []].append(contentsOf: attached) + } + } + + func releaseHotplug(id: String) async throws { + let popped: [HotplugRecord] = _records.withLock { records in + let all = records[id] ?? [] + let blocks = all.filter { record in + if case .block = record.kind { return true } + return false + } + let remaining = all.filter { record in + if case .block = record.kind { return false } + return true + } + if remaining.isEmpty { + records.removeValue(forKey: id) + } else { + records[id] = remaining + } + return blocks + } + + for rec in popped { + do { + try await chCall { try await self.client.vmRemoveDevice(id: rec.chDeviceId) } + } catch { + logger?.warning("vmRemoveDevice failed for \(rec.chDeviceId): \(error)") + } + if case .block(let letter) = rec.kind { + try? allocator.release(letter) + } + } + + // Drop block-derived AttachedFilesystem entries for `id`. Block entries + // are the ones whose source was rewritten to "/dev/vd" by + // `hotplug(_:)` (or by AttachedFilesystem(mount:allocator:) for an + // additionalMount of type virtio-blk). + _mounts.withLock { state in + guard var perID = state[id] else { return } + perID.removeAll { $0.source.hasPrefix("/dev/vd") } + if perID.isEmpty { + state.removeValue(forKey: id) + } else { + state[id] = perID + } + } + } + + func hotplugVirtioFS(_ mounts: [Mount], id: String) async throws { + let virtiofs = mounts.filter { + if case .virtiofs = $0.runtimeOptions { return true } + return false + } + guard !virtiofs.isEmpty else { return } + + // Group by tag (source-hash). Multiple Mounts to the same source dir + // share a tag and a single virtiofsd. + var byTag: [String: [Mount]] = [:] + for mount in virtiofs { + let tag = try hashFilePath(path: mount.source) + byTag[tag, default: []].append(mount) + } + + for (tag, group) in byTag { + // Hold spawnLock across the existence check and the spawn / + // _tags write so two concurrent calls for the same tag can't + // both decide alreadyRunning=false and double-spawn virtiofsd + // (the second write would clobber the first in `_tags`, + // orphaning that process). + try await spawnLock.withLock { _ in + // Build per-container AttachedFilesystem entries up front. + // These depend only on Mount + allocator and don't need the + // chDeviceId, so surfacing any error here keeps the + // transactional shape: nothing irreversible has happened + // yet, no virtiofsd has spawned, no _tags entry written. + var attached: [AttachedFilesystem] = [] + for mount in group { + attached.append(try AttachedFilesystem(mount: mount, allocator: self.allocator)) + } + + let chDeviceId: String + + // Refcount-bump path. If a virtiofsd already serves this + // tag, increment refcount and use the cached deviceId. + let cachedDeviceId: String? = self._tags.withLock { tags in + if var state = tags[tag] { + state.refcount += 1 + tags[tag] = state + return state.chDeviceId + } + return nil + } + + if let cached = cachedDeviceId { + chDeviceId = cached + } else { + // First-spawn path. Walk: spawn → vmAddFs → commit _tags, + // with rollback at every step so a partial failure can't + // leave a virtiofsd running unrecorded. + let socket = chVirtiofsSocketURL(workDir: self.workDir, tag: tag) + let readonly = group.allSatisfy { $0.options.contains("ro") } + guard let source = group.first?.source else { return } + let virtiofsdBinary = try CHVirtualMachineManager.resolveBinary( + self.virtiofsdBinaryOverride, + name: "virtiofsd" + ) + + let process = VirtiofsdProcess( + config: .init( + binary: virtiofsdBinary, + socketPath: socket, + sharedDir: URL(fileURLWithPath: source), + readonly: readonly + ), + logger: self.logger + ) + + try await process.start() + + let fsConfig = CloudHypervisor.FsConfig( + tag: tag, + socket: socket.path, + id: "fs-\(tag)" + ) + let pci: CloudHypervisor.PciDeviceInfo + do { + pci = try await chCall { try await self.client.vmAddFs(fsConfig) } + } catch { + await process.terminate(graceSeconds: 5) + try? FileManager.default.removeItem(at: socket) + throw error + } + + self._tags.withLock { + $0[tag] = VirtiofsdTagState(process: process, refcount: 1, chDeviceId: pci.id) + } + chDeviceId = pci.id + } + + // Bookkeeping. Both writes are non-throwing closures, and + // `attached` was built up front, so once we reach here + // nothing can fail between the refcount/spawn commit above + // and the per-container record below — the orphan window + // (tag committed, record missing) is closed. + self._records.withLock { + $0[id, default: []].append(HotplugRecord(chDeviceId: chDeviceId, kind: .virtiofs(tag: tag))) + } + self._mounts.withLock { + $0[id, default: []].append(contentsOf: attached) + } + } + } + } + + func releaseVirtioFS(id: String) async throws { + let popped: [HotplugRecord] = _records.withLock { records in + let all = records[id] ?? [] + let fs = all.filter { record in + if case .virtiofs = record.kind { return true } + return false + } + let remaining = all.filter { record in + if case .virtiofs = record.kind { return false } + return true + } + if remaining.isEmpty { + records.removeValue(forKey: id) + } else { + records[id] = remaining + } + return fs + } + + var processesToStop: [(VirtiofsdProcess, String, String)] = [] // (process, tag, chDeviceId) + for rec in popped { + guard case .virtiofs(let tag) = rec.kind else { continue } + _tags.withLock { tags in + guard var state = tags[tag] else { return } + state.refcount -= 1 + if state.refcount <= 0 { + tags.removeValue(forKey: tag) + processesToStop.append((state.process, tag, state.chDeviceId)) + } else { + tags[tag] = state + } + } + } + + for (process, tag, chDeviceId) in processesToStop { + do { + try await chCall { try await self.client.vmRemoveDevice(id: chDeviceId) } + } catch { + logger?.warning("vmRemoveDevice failed for \(chDeviceId): \(error)") + } + await process.terminate(graceSeconds: 5) + let socket = chVirtiofsSocketURL(workDir: workDir, tag: tag) + try? FileManager.default.removeItem(at: socket) + } + + // Drop virtiofs AttachedFilesystem entries for `id`. AttachedFilesystem + // sets `type = mount.type` which for a `.virtiofs` mount is "virtiofs". + _mounts.withLock { state in + guard var perID = state[id] else { return } + perID.removeAll { $0.type == "virtiofs" } + if perID.isEmpty { + state.removeValue(forKey: id) + } else { + state[id] = perID + } + } + } + + // MARK: - Boot-time + shutdown hooks (used by CHVirtualMachineInstance) + + /// Record a virtiofsd that was started as part of `start()`'s initial + /// `VmConfig.fs` (rather than a runtime `vm.add-fs`). The `chDeviceId` + /// is the user-supplied `FsConfig.id` (which `vm.remove-device` keys on). + /// `ownerIds` are the container ids that count toward this tag's refcount; + /// each gets a `HotplugRecord` so `releaseVirtioFS(id:)` walks them + /// uniformly. + func recordBootTimeVirtiofs( + tag: String, + process: VirtiofsdProcess, + chDeviceId: String, + ownerIds: [String] + ) { + _tags.withLock { + $0[tag] = VirtiofsdTagState(process: process, refcount: ownerIds.count, chDeviceId: chDeviceId) + } + _records.withLock { records in + for id in ownerIds { + records[id, default: []].append(HotplugRecord(chDeviceId: chDeviceId, kind: .virtiofs(tag: tag))) + } + } + } + + /// Called from `CHVirtualMachineInstance.stop()` to terminate any + /// virtiofsd subprocesses still alive. The CH side teardown is handled by + /// `chProcess.terminate()`. + func shutdown() async { + let processes = _tags.withLock { tags -> [VirtiofsdProcess] in + let all = tags.values.map(\.process) + tags.removeAll() + return all + } + _records.withLock { $0.removeAll() } + + for process in processes { + await process.terminate(graceSeconds: 5) + } + } +} + +// MARK: - Error translation + +/// Wraps a closure that may throw `CloudHypervisor.Error`, translating it into +/// `ContainerizationError` per spec §6 so callers of the public API only see +/// `ContainerizationError`. +func chCall(_ block: @Sendable () async throws -> T) async throws -> T { + do { + return try await block() + } catch let error as CloudHypervisor.Error { + switch error { + case .http(let status, let body): + let bodyStr = String(data: body, encoding: .utf8) ?? "" + if status == .notFound { + throw ContainerizationError(.notFound, message: "cloud-hypervisor 404: \(bodyStr)") + } + if status == .badRequest { + throw ContainerizationError(.invalidArgument, message: "cloud-hypervisor 400: \(bodyStr)") + } + throw ContainerizationError( + .internalError, + message: "cloud-hypervisor HTTP \(status.code): \(bodyStr)" + ) + case .transport(let underlying): + throw ContainerizationError(.internalError, message: "cloud-hypervisor transport error", cause: underlying) + case .decoding(let underlying, _): + throw ContainerizationError(.internalError, message: "cloud-hypervisor response decode error", cause: underlying) + case .invalidSocketPath(let path): + throw ContainerizationError(.invalidArgument, message: "invalid cloud-hypervisor socket path: \(path)") + } + } +} +#endif diff --git a/Sources/Containerization/CHInstanceExtension.swift b/Sources/Containerization/CHInstanceExtension.swift new file mode 100644 index 00000000..80121476 --- /dev/null +++ b/Sources/Containerization/CHInstanceExtension.swift @@ -0,0 +1,46 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +#if os(Linux) +import CloudHypervisor + +/// Extension hook for `CHVirtualMachineInstance` lifecycle. Append conforming +/// types to `Configuration.extensions` to participate in VM setup and +/// teardown without subclassing. +/// +/// All methods have no-op defaults so a conforming type only needs to +/// implement the hooks it actually cares about. +public protocol CHInstanceExtension: Sendable { + /// Mutate the cloud-hypervisor `VmConfig` before the VM is created. + /// Called by `start()` after the base config is built but before + /// `vm.create` is dispatched to the VMM. + func configureCH(_ config: inout CloudHypervisor.VmConfig) throws + + /// Called once the VM has been created and booted but before + /// `start()` returns to the caller. + func didCreate(_ instance: CHVirtualMachineInstance) throws + + /// Called from `stop()` before the VM is shut down. Errors are + /// best-effort — `stop()` swallows them. + func willStop(_ instance: CHVirtualMachineInstance) async throws +} + +extension CHInstanceExtension { + public func configureCH(_ config: inout CloudHypervisor.VmConfig) throws {} + public func didCreate(_ instance: CHVirtualMachineInstance) throws {} + public func willStop(_ instance: CHVirtualMachineInstance) async throws {} +} +#endif diff --git a/Sources/Containerization/CHInterface.swift b/Sources/Containerization/CHInterface.swift new file mode 100644 index 00000000..459209f2 --- /dev/null +++ b/Sources/Containerization/CHInterface.swift @@ -0,0 +1,68 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +#if os(Linux) +import CloudHypervisor +import ContainerizationExtras + +/// An `Interface` specialization that can produce a `CloudHypervisor.NetConfig` +/// describing how the cloud-hypervisor VMM should attach the device. +public protocol CHInterface { + func chNetConfig() throws -> CloudHypervisor.NetConfig +} + +/// A TAP-backed network interface for the cloud-hypervisor backend. +/// +/// IP configuration on the guest side is delegated to `vminitd` (matching the +/// macOS path). `chNetConfig()` therefore leaves CH's `ip`/`mask` fields nil — +/// those would assign an address to the host end of the TAP, which we do not +/// use. Bringing up the TAP and any bridge/NAT plumbing is the caller's +/// responsibility. +public struct TAPInterface: CHInterface, Interface, Sendable { + public let tapName: String + public let ipv4Address: CIDRv4 + public let ipv4Gateway: IPv4Address? + public let macAddress: MACAddress? + public let mtu: UInt32 + + public init( + tapName: String, + ipv4Address: CIDRv4, + ipv4Gateway: IPv4Address? = nil, + macAddress: MACAddress? = nil, + mtu: UInt32 = 1500 + ) { + self.tapName = tapName + self.ipv4Address = ipv4Address + self.ipv4Gateway = ipv4Gateway + self.macAddress = macAddress + self.mtu = mtu + } + + public func chNetConfig() throws -> CloudHypervisor.NetConfig { + CloudHypervisor.NetConfig( + tap: tapName, + ip: nil, + mask: nil, + mac: macAddress?.description, + mtu: Int(mtu), + numQueues: nil, + queueSize: nil, + id: nil + ) + } +} +#endif diff --git a/Sources/Containerization/CHProcess.swift b/Sources/Containerization/CHProcess.swift new file mode 100644 index 00000000..fd1c6b18 --- /dev/null +++ b/Sources/Containerization/CHProcess.swift @@ -0,0 +1,211 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +#if os(Linux) +import ContainerizationError +import ContainerizationExtras +import ContainerizationOS +import Foundation +import Logging +import Synchronization + +#if canImport(Musl) +import Musl +#elseif canImport(Glibc) +import Glibc +#endif + +/// A managed `cloud-hypervisor` subprocess. +/// +/// Owns spawning the binary with `--api-socket `, attaching stdout/stderr +/// per the supplied `BootLog`, and tearing it down with a SIGTERM/SIGKILL ladder. +/// One `CHProcess` per VM. Not safe to call `start()` more than once. +final class CHProcess: Sendable { + struct Config: Sendable { + let binary: URL + let apiSocketPath: URL + let bootLog: BootLog? + } + + enum ExitReason: Sendable, Equatable { + case exited(Int32) + case signalled(Int32) + case unknown + } + + private struct State { + var command: Command? + var bootLogHandle: FileHandle? + var exitTask: Task? + } + + private let config: Config + private let logger: Logger? + private let state: Mutex + + init(config: Config, logger: Logger?) { + self.config = config + self.logger = logger + self.state = Mutex(State(command: nil, bootLogHandle: nil, exitTask: nil)) + } + + /// Spawn the cloud-hypervisor binary and wait for its API socket to accept + /// connections. Throws `ContainerizationError(.timeout, ...)` if the socket + /// is not connectable within the bounded poll deadline. + func start() async throws { + let logHandle = try Self.openBootLogHandle(config.bootLog) + var arguments = ["--api-socket", config.apiSocketPath.path] + if SandboxOverrides.chSeccompDisabled { + // `--seccomp false`: cloud-hypervisor's default seccomp profile + // SIGSYS-kills the VMM on syscalls it didn't anticipate. Inside + // apple/container's --virtualization dev container the unix-vsock + // muxer's accept(2)/connect(2) interactions on per-port UDS files + // trip the filter and CH dies mid-process-start, surfacing on the + // host as "Stream unexpectedly closed" on the vminitd gRPC channel. + // Opt-in via CONTAINERIZATION_NO_CH_SECCOMP=1; default = secure. + logger?.warning( + "cloud-hypervisor launching with --seccomp false (CONTAINERIZATION_NO_CH_SECCOMP=1) — VMM seccomp filter disabled" + ) + arguments.append(contentsOf: ["--seccomp", "false"]) + } + var command = Command( + config.binary.path, + arguments: arguments, + environment: ChildEnvironment.minimal() + ) + command.stdout = logHandle + command.stderr = logHandle + // Run cloud-hypervisor in its own session. Without setsid, the VMM + // shares the parent process group and inherits SIGINT/SIGQUIT from + // the controlling TTY (e.g. Ctrl-C in `cctl run`), dying alongside + // the parent before our own teardown ladder (terminate → wait) gets + // a chance to run an orderly shutdown. + command.attrs.setsid = true + + do { + try command.start() + } catch { + try? logHandle?.close() + throw error + } + + let exitTask = Task.detached { [command, logger] in + do { + let status = try command.wait() + if status >= 128 { + return .signalled(status - 128) + } + return .exited(status) + } catch { + logger?.error("cloud-hypervisor wait failed: \(error)") + return .unknown + } + } + + state.withLock { + $0.command = command + $0.bootLogHandle = logHandle + $0.exitTask = exitTask + } + + try await waitForAPISocket() + } + + /// Wait for the subprocess to exit. Resolves with the cached `ExitReason` + /// once `wait4` has returned. Safe to call any number of times. + func wait() async -> ExitReason { + guard let task = state.withLock({ $0.exitTask }) else { + return .unknown + } + return await task.value + } + + /// Send SIGTERM, then SIGKILL after `graceSeconds` if the process is still + /// running. Returns once the process has been reaped. + func terminate(graceSeconds: UInt32) async { + guard let command = state.withLock({ $0.command }) else { return } + + _ = command.kill(SIGTERM) + + do { + try await Timeout.run(for: .seconds(Int(graceSeconds))) { + _ = await self.wait() + } + } catch { + logger?.warning("cloud-hypervisor did not exit within \(graceSeconds)s, sending SIGKILL") + _ = command.kill(SIGKILL) + _ = await wait() + } + + state.withLock { + try? $0.bootLogHandle?.close() + $0.bootLogHandle = nil + } + } + + // MARK: - Private helpers + + private static let socketDeadline: Duration = .seconds(2) + private static let socketPollInterval: Duration = .milliseconds(50) + + private func waitForAPISocket() async throws { + let clock = ContinuousClock() + let deadline = clock.now.advanced(by: Self.socketDeadline) + + while clock.now < deadline { + if Self.isAPISocketReady(at: config.apiSocketPath) { + return + } + try? await Task.sleep(for: Self.socketPollInterval) + } + + await terminate(graceSeconds: 5) + throw ContainerizationError( + .timeout, + message: "cloud-hypervisor API socket not connectable at \(config.apiSocketPath.path) within \(Self.socketDeadline)" + ) + } + + private static func isAPISocketReady(at url: URL) -> Bool { + guard let unix = try? UnixType(path: url.path) else { return false } + guard let socket = try? Socket(type: unix) else { return false } + defer { try? socket.close() } + do { + try socket.connect() + return true + } catch { + return false + } + } + + private static func openBootLogHandle(_ bootLog: BootLog?) throws -> FileHandle? { + guard let bootLog else { return nil } + switch bootLog.base { + case .file(let path, let append): + var flags = O_WRONLY | O_CREAT + flags |= append ? O_APPEND : O_TRUNC + let fd = open(path.path, flags, 0o644) + guard fd >= 0 else { + throw POSIXError(POSIXErrorCode(rawValue: errno) ?? .EIO) + } + return FileHandle(fileDescriptor: fd, closeOnDealloc: true) + case .fileHandle(let handle): + return handle + } + } + +} +#endif diff --git a/Sources/Containerization/CHVirtualMachineInstance.swift b/Sources/Containerization/CHVirtualMachineInstance.swift new file mode 100644 index 00000000..56af004b --- /dev/null +++ b/Sources/Containerization/CHVirtualMachineInstance.swift @@ -0,0 +1,753 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +#if os(Linux) +import CloudHypervisor +import ContainerizationError +import ContainerizationExtras +import ContainerizationOS +import Foundation +import Logging +import NIOCore +import NIOPosix +import Synchronization + +#if canImport(Musl) +import Musl +#elseif canImport(Glibc) +import Glibc +#endif + +/// Cloud-hypervisor backed virtual machine instance. +/// +/// One CH subprocess per VM. Connects to the same `Vminitd` guest agent the +/// macOS path uses, so guest-side semantics are unchanged. This file is the +/// D1 scaffold — `start`/`stop`/`dialAgent`/`dial`/`listen` throw +/// `.unsupported` until D3–D5 fill them in. Hotplug methods delegate to +/// `CHHotplugProvider` (stubbed in D0, real in D2). +public final class CHVirtualMachineInstance: Sendable { + public typealias Agent = Vminitd + + /// VM-instance configuration. Mirrors the macOS `VZVirtualMachineInstance.Configuration`, + /// minus rosetta / nested-virt (which are macOS-only concepts). + public struct Configuration: Sendable { + public var cpus: Int + public var memoryInBytes: UInt64 + public var mountsByID: [String: [Mount]] + public var interfaces: [any Interface] + public var kernel: Kernel? + public var initialFilesystem: Mount? + public var bootLog: BootLog? + public var extensions: [any Sendable] = [] + + public init() { + self.cpus = 4 + self.memoryInBytes = 1024 * 1024 * 1024 + self.mountsByID = [:] + self.interfaces = [] + } + } + + /// One boot-time virtio-blk disk. Built deterministically in `init` so + /// `start()`'s `VmConfig.disks` ordering matches the allocator letters. + struct BootDisk: Sendable { + let mount: Mount + let containerId: String? // nil for rootfs + let letter: Character + } + + // MARK: - State + + private let _state: Mutex + public var state: VirtualMachineInstanceState { + _state.withLock { $0 } + } + + public var mounts: [String: [AttachedFilesystem]] { + hotplug.mounts + } + + /// Cloud-hypervisor exposes one virtio-fs device per source-hash tag, so + /// guests must mount each tag separately at `/run/virtiofs/` rather + /// than using a single unified-share device. + public var virtiofsLayout: VirtiofsLayout { .perTag } + + /// Block-letter allocator shared between the boot wiring (already + /// reserved in `init` via `bootDisks`) and runtime hotplug (D2). + let blockAllocator: any AddressAllocator + + /// Boot-time disks in the order their letters were allocated. D3 maps + /// these into `VmConfig.disks`. + let bootDisks: [BootDisk] + + /// Owned resources + let workDir: URL + let config: Configuration + let chProcess: CHProcess + let client: CloudHypervisor.Client + let hotplug: CHHotplugProvider + let virtiofsdBinaryOverride: URL? + let group: any EventLoopGroup + private let ownsGroup: Bool + private let lock: AsyncLock + private let timeSyncer: TimeSyncer + let logger: Logger? + + /// Pre-bound vsock listener pool for stdio. apple/container's + /// `--virtualization` mode hands the cloud-hypervisor child process a + /// snapshotted filesystem view at fork time, so files written under the + /// per-VM workDir AFTER cloud-hypervisor starts are invisible to CH. + /// We work around this by binding a fixed range of `vsock.sock_` + /// listener files BEFORE launching CH; `vm.listen(_:)` then consumes + /// pre-bound entries from this pool instead of binding on demand. + /// Range covers `LinuxContainer.hostVsockPorts` initial value + /// (`0x10000000`) through the next `stdioPoolSize` sequential ports — + /// enough for `[stdin,stdout,stderr] x N` processes per VM. Bump + /// `stdioPoolSize` if you need more concurrent stdio streams than that. + static let stdioPoolBase: UInt32 = 0x1000_0000 + static let stdioPoolSize: Int = 16 + private struct PreboundListener: Sendable { + let port: UInt32 + let listenFd: Int32 + let path: URL + } + private let _stdioPool: Mutex<[UInt32: PreboundListener]> + + public convenience init( + group: (any EventLoopGroup)? = nil, + runtimeRoot: URL, + chBinary: URL, + virtiofsdBinary: URL?, + logger: Logger? = nil, + with: (inout Configuration) throws -> Void + ) throws { + var config = Configuration() + try with(&config) + try self.init( + group: group, + config: config, + runtimeRoot: runtimeRoot, + chBinary: chBinary, + virtiofsdBinary: virtiofsdBinary, + logger: logger + ) + } + + init( + group: (any EventLoopGroup)?, + config: Configuration, + runtimeRoot: URL, + chBinary: URL, + virtiofsdBinary: URL?, + logger: Logger? + ) throws { + // 1. Working directory: per-instance under runtimeRoot. Mode 0o700 + // so the per-VM UDS sockets inside (api.sock, vsock.sock, vfs-*) + // aren't reachable by other local users — the gRPC channel into + // vminitd has no peer authentication, so socket-file perms are + // the trust boundary. + let workDir = runtimeRoot.appendingPathComponent(UUID().uuidString) + try FileManager.default.createDirectory( + at: workDir, + withIntermediateDirectories: true, + attributes: [.posixPermissions: 0o700] + ) + self.workDir = workDir + + // 2. Block allocator + boot inventory. Walks rootfs first, then + // mountsByID sorted by container id, allocating disk letters in + // that order. The same allocator is later handed to the hotplug + // provider so runtime add-disk picks up where boot wiring left off. + let allocator = Character.blockDeviceTagAllocator() + let inventory = try config.bootInventory(allocator: allocator) + self.blockAllocator = allocator + self.bootDisks = inventory.bootDisks + + // 3. EventLoopGroup + if let group { + self.ownsGroup = false + self.group = group + } else { + self.ownsGroup = true + self.group = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount) + } + + // 4. CHProcess + REST client. The api socket lives next to the workDir. + let apiSocket = workDir.appendingPathComponent("api.sock") + self.chProcess = CHProcess( + config: .init( + binary: chBinary, + apiSocketPath: apiSocket, + bootLog: config.bootLog + ), + logger: logger + ) + self.client = try CloudHypervisor.Client( + socketPath: apiSocket, + eventLoopGroup: self.group, + logger: logger ?? Logger(label: "CloudHypervisor.Client") + ) + + // 5. Hotplug provider — owns the mount registry, seeded with the + // boot inventory so registerMounts can append to it. + self.hotplug = CHHotplugProvider( + client: self.client, + workDir: workDir, + virtiofsdBinary: virtiofsdBinary, + allocator: allocator, + initialMounts: inventory.attachments, + logger: logger + ) + + // 6. Misc + self.config = config + self.virtiofsdBinaryOverride = virtiofsdBinary + self.logger = logger + self.lock = .init() + self.timeSyncer = .init(logger: logger) + self._state = Mutex(.stopped) + self._stdioPool = Mutex([:]) + } + + /// Mutate the mount registry. Forwards to the hotplug provider, which + /// owns the registry. Kept on the instance for parity with the macOS + /// path's `withMountRegistry` API. + func withMountRegistry(_ body: (inout sending [String: [AttachedFilesystem]]) throws -> sending T) rethrows -> T { + try hotplug.withMountRegistry(body) + } +} + +// MARK: - VirtualMachineInstance conformance (stubbed; D3–D5 fill these in) + +extension CHVirtualMachineInstance: VirtualMachineInstance { + public func start() async throws { + try await lock.withLock { _ in + guard self.state == .stopped else { + throw ContainerizationError( + .invalidState, + message: "virtual machine is not stopped (\(self.state))" + ) + } + self._state.withLock { $0 = .starting } + + do { + var vmConfig = try await self.buildVmConfig() + for ext in self.config.extensions.compactMap({ $0 as? any CHInstanceExtension }) { + try ext.configureCH(&vmConfig) + } + let finalConfig = vmConfig + + // Pre-bind the stdio vsock listener pool before launching CH. + // CH inherits a fs snapshot at fork time and is blind to + // anything we add to workDir after — see `_stdioPool` doc. + try self.prebindStdioPool() + + try await self.chProcess.start() + + try await chCall { try await self.client.vmCreate(finalConfig) } + try await chCall { try await self.client.vmBoot() } + + let fh = try await self.dialVminitdWithRetries() + let agent = try Vminitd(connection: fh, group: self.group) + await self.timeSyncer.start(context: agent) + + for ext in self.config.extensions.compactMap({ $0 as? any CHInstanceExtension }) { + try ext.didCreate(self) + } + + self._state.withLock { $0 = .running } + } catch { + self.logger?.warning("CH VM start failed; tearing down partial resources: \(error)") + await self.teardownAfterFailedStart() + self._state.withLock { $0 = .stopped } + throw error + } + } + } + + /// Reverse the side effects of any partially-completed `start()`: + /// terminate cloud-hypervisor, kill registered virtiofsd processes, + /// close pre-bound stdio listener fds, remove the workDir, and shut + /// down the owned event-loop group. All steps are best-effort and + /// safe to invoke whether the corresponding `start()` step ran or not. + private func teardownAfterFailedStart() async { + try? await self.timeSyncer.close() + + // chProcess.terminate() is a no-op if `start()` never reached the + // spawn — otherwise SIGTERM / SIGKILL ladder + reap. + await self.chProcess.terminate(graceSeconds: 5) + + // Kills every virtiofsd registered by buildVmConfig (boot-time) or + // by an in-flight hotplug. Empty if neither ran. + await self.hotplug.shutdown() + + // Close pre-bound stdio listener fds the start path opened in + // prebindStdioPool. Files unlink with workDir below. + let leftover = self._stdioPool.withLock { pool -> [PreboundListener] in + let entries = Array(pool.values) + pool.removeAll() + return entries + } + for entry in leftover { + _ = close(entry.listenFd) + } + + try? FileManager.default.removeItem(at: self.workDir) + + // Drain the AHC HTTP client before shutting down the shared + // event-loop group, same rationale as `stop()`: AHC's deferred + // connection cleanup must not outlive the group it's parked on. + try? await self.client.shutdown() + + if self.ownsGroup { + try? await self.group.shutdownGracefully() + } + } + + public func stop() async throws { + try await lock.withLock { _ in + guard self.state == .running else { + throw ContainerizationError(.invalidState, message: "vm is not running") + } + self._state.withLock { $0 = .stopping } + + try? await self.timeSyncer.close() + + for ext in self.config.extensions.compactMap({ $0 as? any CHInstanceExtension }) { + try? await ext.willStop(self) + } + + // Best-effort graceful shutdown via REST. The CH process may + // already be on its way out, so swallow errors from these. + _ = try? await chCall { try await self.client.vmShutdown() } + _ = try? await chCall { try await self.client.vmmShutdown() } + + await self.chProcess.terminate(graceSeconds: 10) + await self.hotplug.shutdown() + + // Drain the AHC HTTP client before tearing down the shared + // event-loop group. AHC parks deferred connection-cleanup + // work on the group's event loops after each response; if we + // shut the group down with that work still pending, NIO + // prints "Cannot schedule tasks on an EventLoop that has + // already shut down" (and will hard-crash in future NIO + // releases). Must run after the last `chCall` above and + // before `group.shutdownGracefully()` below. + try? await self.client.shutdown() + + // Close any listening fds for stdio ports the test never + // consumed. The files themselves are removed when workDir is + // unlinked below. + let leftover = self._stdioPool.withLock { pool -> [PreboundListener] in + let entries = Array(pool.values) + pool.removeAll() + return entries + } + for entry in leftover { + _ = close(entry.listenFd) + } + + if self.ownsGroup { + try? await self.group.shutdownGracefully() + } + + try? FileManager.default.removeItem(at: self.workDir) + + self._state.withLock { $0 = .stopped } + } + } + + public func dialAgent() async throws -> Vminitd { + try await lock.withLock { _ in + try self.requireRunning() + let fh = try await chVsockDial( + baseSocket: self.workDir.appendingPathComponent("vsock.sock"), + port: Vminitd.port + ) + return try Vminitd(connection: fh, group: self.group) + } + } + + public func dial(_ port: UInt32) async throws -> FileHandle { + try await lock.withLock { _ in + try self.requireRunning() + return try await chVsockDial( + baseSocket: self.workDir.appendingPathComponent("vsock.sock"), + port: port + ) + } + } + + /// Reject vsock dials when the VM isn't actually running. Without this, + /// a dial issued after `stop()` (or before `start()` finished) raced + /// against `workDir` removal and surfaced as an opaque "connect: No + /// such file or directory" instead of a clear lifecycle error. + private func requireRunning() throws { + let current = self.state + guard current == .running else { + throw ContainerizationError( + .invalidState, + message: "vm is not running (state=\(current))" + ) + } + } + + public func listen(_ port: UInt32) throws -> VsockListener { + // Consume from the pre-bound pool (see `_stdioPool` doc). + let prebound = _stdioPool.withLock { $0.removeValue(forKey: port) } + guard let prebound else { + throw ContainerizationError( + .invalidArgument, + message: "vsock port \(port) was not pre-bound; only ports " + + "\(Self.stdioPoolBase)..<\(Self.stdioPoolBase + UInt32(Self.stdioPoolSize)) " + + "are available for stdio. Increase CHVirtualMachineInstance.stdioPoolSize " + + "if you need more concurrent stdio streams per VM." + ) + } + let listenFd = prebound.listenFd + let path = prebound.path + logger?.debug("vsock listen consuming pool entry port=\(port) path=\(path.path)") + let listener = VsockListener(port: port) { [path, listenFd, logger] _ in + logger?.debug("vsock listen finishing port=\(port) closing listenFd=\(listenFd)") + _ = close(listenFd) + try? FileManager.default.removeItem(at: path) + } + let acceptLogger = logger + // The accept loop calls a blocking accept() syscall, which is + // inappropriate for Swift's cooperative thread pool: a pool thread + // pinned to accept() can't service other tasks until the syscall + // returns. With even a few leaked accept loops (e.g. when a test's + // setupIO times out and the listener is finished only when the + // 30s timer fires), Task.detached'd accept loops queue behind the + // pinned threads and never run, manifesting as the "vsock acceptLoop + // starting" log being silent and the dial-back never being seen by + // the host. Use libdispatch's global queue instead — it spawns + // OS threads on demand and is the right tool for blocking syscalls. + DispatchQueue.global(qos: .userInitiated).async { [listener, listenFd] in + acceptLogger?.debug("vsock acceptLoop starting port=\(listener.port) listenFd=\(listenFd)") + Self.acceptLoop(listenFd: listenFd, into: listener, logger: acceptLogger) + acceptLogger?.debug("vsock acceptLoop exited port=\(listener.port)") + } + return listener + } + + /// Bind every port in `stdioPoolBase../vsock.sock_`. Must run before + /// `chProcess.start()` so the files end up in CH's snapshot view of + /// the workDir. Files for ports never consumed are removed during + /// `stop()` along with the rest of `workDir`; the listening fds are + /// closed there too. + private func prebindStdioPool() throws { + let base = workDir.appendingPathComponent("vsock.sock") + var pool: [UInt32: PreboundListener] = [:] + pool.reserveCapacity(Self.stdioPoolSize) + do { + for offset in 0.. AttachedFilesystem { + try await hotplug.hotplug(block, id: id) + } + + public func releaseHotplug(id: String) async throws { + try await hotplug.releaseHotplug(id: id) + } + + public func hotplugVirtioFS(_ mounts: [Mount], id: String) async throws { + try await hotplug.hotplugVirtioFS(mounts, id: id) + } + + public func releaseVirtioFS(id: String) async throws { + try await hotplug.releaseVirtioFS(id: id) + } + + public func registerMounts(id: String, rootfs: AttachedFilesystem, additionalMounts: [Mount]) throws { + try hotplug.registerMounts(id: id, rootfs: rootfs, additionalMounts: additionalMounts) + } +} + +// MARK: - VmConfig + vminitd dial helpers + +extension CHVirtualMachineInstance { + /// Build the cloud-hypervisor `VmConfig` from `config`. Spawns one + /// `virtiofsd` per unique boot-time virtiofs source-hash tag and registers + /// each with the hotplug provider so `releaseVirtioFS(id:)` and `stop()` + /// can reclaim them. + private func buildVmConfig() async throws -> CloudHypervisor.VmConfig { + guard let kernel = config.kernel else { + throw ContainerizationError(.invalidArgument, message: "kernel is required for cloud-hypervisor backend") + } + guard let rootfs = config.initialFilesystem else { + throw ContainerizationError(.invalidArgument, message: "initialFilesystem is required for cloud-hypervisor backend") + } + + // Disks: rootfs forced read-only at the device level; container disks + // honor their `ro` option through chDiskConfig. + var disks: [CloudHypervisor.DiskConfig] = [] + for bd in bootDisks { + let chId = bd.containerId.map { "blk-\($0)-\(bd.letter)" } ?? "rootfs" + if var disk = bd.mount.chDiskConfig(id: chId) { + if bd.containerId == nil { + disk.readonly = true + } + disks.append(disk) + } + } + + // Virtiofs: group all .virtiofs mounts in mountsByID by source-hash + // tag, spawn one virtiofsd per tag, build matching FsConfigs. + var byTag: [String: (mounts: [Mount], owners: [String])] = [:] + for cid in config.mountsByID.keys.sorted() { + guard let mounts = config.mountsByID[cid] else { continue } + for mount in mounts { + guard case .virtiofs = mount.runtimeOptions else { continue } + let tag = try hashFilePath(path: mount.source) + var entry = byTag[tag] ?? (mounts: [], owners: []) + entry.mounts.append(mount) + if !entry.owners.contains(cid) { + entry.owners.append(cid) + } + byTag[tag] = entry + } + } + + var fsConfigs: [CloudHypervisor.FsConfig] = [] + // Resolve virtiofsd lazily — only if we actually have any virtiofs + // mounts at boot. A block-only VM doesn't require virtiofsd. + let resolvedVirtiofsdBinary: URL? = + byTag.isEmpty + ? nil + : try CHVirtualMachineManager.resolveBinary(virtiofsdBinaryOverride, name: "virtiofsd") + for (tag, entry) in byTag { + guard let source = entry.mounts.first?.source else { continue } + guard let binary = resolvedVirtiofsdBinary else { continue } + let socket = chVirtiofsSocketURL(workDir: workDir, tag: tag) + let readonly = entry.mounts.allSatisfy { $0.options.contains("ro") } + let chDeviceId = "fs-\(tag)" + + let process = VirtiofsdProcess( + config: .init( + binary: binary, + socketPath: socket, + sharedDir: URL(fileURLWithPath: source), + readonly: readonly + ), + logger: logger + ) + try await process.start() + + hotplug.recordBootTimeVirtiofs( + tag: tag, + process: process, + chDeviceId: chDeviceId, + ownerIds: entry.owners + ) + + fsConfigs.append( + CloudHypervisor.FsConfig( + tag: tag, + socket: socket.path, + id: chDeviceId + ) + ) + } + + let net: [CloudHypervisor.NetConfig] = try config.interfaces.compactMap { + try ($0 as? any CHInterface)?.chNetConfig() + } + + let vsock = CloudHypervisor.VsockConfig( + cid: 3, + socket: workDir.appendingPathComponent("vsock.sock").path + ) + + let payload = CloudHypervisor.PayloadConfig( + kernel: kernel.path.path, + cmdline: kernel.linuxCommandline(initialFilesystem: rootfs) + ) + + return CloudHypervisor.VmConfig( + cpus: .init(bootVcpus: config.cpus, maxVcpus: config.cpus), + // `shared: true` is required as soon as any vhost-user device (e.g. + // virtiofsd) is attached — CH rejects `vm.boot` with "Using + // vhost-user requires using shared memory or huge pages" otherwise. + // We set it unconditionally because virtiofs can be added via + // hotplug after boot (CHHotplugProvider.hotplugVirtioFS), and the + // memory config can't be changed once the VM has booted. The + // MAP_SHARED-backed RAM has negligible runtime impact. + memory: .init( + size: Self.alignMemorySize(config.memoryInBytes), + shared: true + ), + payload: payload, + disks: disks.isEmpty ? nil : disks, + net: net.isEmpty ? nil : net, + fs: fsConfigs.isEmpty ? nil : fsConfigs, + vsock: vsock, + // Kernel cmdline is `console=hvc0`, so userspace (vminitd) writes + // to hvc0 — capture that to the bootlog. We deliberately disable + // the pl011 (`serial`) UART entirely with `.Off`. Any non-Off mode + // makes cloud-hypervisor APPEND `earlycon=pl011,mmio,0x...` to + // the kernel cmdline (see CH device_manager.rs add_serial_device), + // which forces every early-boot printk character through an MMIO + // trap into CH's pl011 emulator and adds ~1.5s to VM boot. We + // don't need pl011 — virtio-console is enough — so just turn it + // off. To diagnose pre-virtio-console boot, switch to `.File` and + // re-add `earlycon=pl011,mmio,0x09000000` to the cmdline. + console: Self.consoleConfig(forBootLog: config.bootLog), + serial: .init(mode: .Off) + ) + } + + /// Round `bytes` up to the nearest 2 MiB boundary. Cloud Hypervisor + /// rejects `vm.boot` with "Memory size is misaligned with default page + /// size or its hugepage size" if the memory size isn't a multiple of the + /// guest's page size; 2 MiB is a multiple of both 4 KiB and 64 KiB pages + /// and the standard hugepage size on aarch64. + private static func alignMemorySize(_ bytes: UInt64) -> UInt64 { + let alignment: UInt64 = 2 * 1024 * 1024 + let remainder = bytes % alignment + return remainder == 0 ? bytes : bytes + (alignment - remainder) + } + + private static func consoleConfig(forBootLog bootLog: BootLog?) -> CloudHypervisor.ConsoleConfig { + guard let bootLog else { return .init(mode: .Null) } + switch bootLog.base { + case .file(let path, _): + return .init(mode: .File, file: path.path) + case .fileHandle: + // Cloud Hypervisor's File mode requires a path. For raw FDs we + // could route through a pipe/relay later; for v1 fall back to + // null to avoid silently dropping logs to a wrong place. + return .init(mode: .Null) + } + } + + /// Bounded retry loop for dialing the vminitd vsock port. Absorbs the + /// short delay between `vm.boot` and the guest agent advertising the + /// CONNECT/OK protocol on the host UDS. Vminitd typically becomes ready + /// within a few hundred ms of `vm.boot` returning, so we poll fast at + /// 10 ms intervals (capped at 50 ms) to avoid burning wall-clock in + /// exponential backoff while the guest is already up. Deadline stays + /// at 60s as a safety net for the cold-cache long tail. + private func dialVminitdWithRetries( + deadline: Duration = .seconds(60), + initialDelay: Duration = .milliseconds(10) + ) async throws -> FileHandle { + let baseSocket = workDir.appendingPathComponent("vsock.sock") + let clock = ContinuousClock() + let stop = clock.now.advanced(by: deadline) + var delay = initialDelay + var lastError: any Error = ContainerizationError(.timeout, message: "could not dial vminitd") + while clock.now < stop { + do { + return try await chVsockDial(baseSocket: baseSocket, port: Vminitd.port) + } catch { + lastError = error + try? await Task.sleep(for: delay) + if delay < .milliseconds(50) { + delay = delay * 2 + } + } + } + throw ContainerizationError(.timeout, message: "could not dial vminitd within \(deadline): \(lastError)") + } + + /// Blocking accept loop driving a `VsockListener`. Runs on a detached + /// task because `accept(2)` blocks. Exits when the listening fd is + /// closed (by `VsockListener.finish()`) or the stream consumer + /// terminates. + private static func acceptLoop(listenFd: Int32, into listener: VsockListener, logger: Logger?) { + while true { + logger?.debug("vsock acceptLoop blocking on accept port=\(listener.port) listenFd=\(listenFd)") + let connFd = accept(listenFd, nil, nil) + if connFd < 0 { + let savedErrno = errno + if savedErrno == EINTR { + continue + } + logger?.debug("vsock acceptLoop accept returned \(connFd) errno=\(savedErrno) port=\(listener.port)") + return + } + logger?.debug("vsock acceptLoop accepted connFd=\(connFd) port=\(listener.port)") + let handle = FileHandle(fileDescriptor: connFd, closeOnDealloc: true) + let result = listener.yield(handle) + if case .terminated = result { + logger?.debug("vsock acceptLoop yield terminated port=\(listener.port)") + try? handle.close() + return + } + logger?.debug("vsock acceptLoop yield enqueued port=\(listener.port)") + } + } +} + +// MARK: - Boot inventory + +extension CHVirtualMachineInstance.Configuration { + /// Walks boot-time mounts in deterministic order (rootfs first, then + /// `mountsByID` sorted by container id, then each container's mounts in + /// input order), allocating disk letters for virtio-blk mounts and seeding + /// the per-container `AttachedFilesystem` registry. + /// + /// The allocator is shared with the runtime hotplug provider, so block + /// hotplug picks up at the next free letter after boot. + func bootInventory( + allocator: any AddressAllocator + ) throws -> (attachments: [String: [AttachedFilesystem]], bootDisks: [CHVirtualMachineInstance.BootDisk]) { + var bootDisks: [CHVirtualMachineInstance.BootDisk] = [] + var attachments: [String: [AttachedFilesystem]] = [:] + + // Rootfs is not part of mountsByID. If it's a block device, it claims + // the first letter (vda) so the kernel cmdline `root=/dev/vda` is right. + if let rootfs = self.initialFilesystem, rootfs.isBlock { + let letter = try allocator.allocate() + bootDisks.append(.init(mount: rootfs, containerId: nil, letter: letter)) + } + + for cid in self.mountsByID.keys.sorted() { + guard let mounts = self.mountsByID[cid] else { continue } + var perContainer: [AttachedFilesystem] = [] + for mount in mounts { + let attached = try AttachedFilesystem(mount: mount, allocator: allocator) + if mount.isBlock, let letter = attached.source.last { + bootDisks.append(.init(mount: mount, containerId: cid, letter: letter)) + } + perContainer.append(attached) + } + attachments[cid] = perContainer + } + + return (attachments, bootDisks) + } +} +#endif diff --git a/Sources/Containerization/CHVirtualMachineManager.swift b/Sources/Containerization/CHVirtualMachineManager.swift new file mode 100644 index 00000000..ba20b5d1 --- /dev/null +++ b/Sources/Containerization/CHVirtualMachineManager.swift @@ -0,0 +1,149 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +#if os(Linux) +import ContainerizationError +import Foundation +import Logging +import NIOCore + +/// VirtualMachineManager backed by `cloud-hypervisor` + KVM on Linux. +/// +/// One subprocess per VM. The manager itself is just a factory: kernel, +/// initial filesystem, host binary paths, and a runtime root (under which +/// each instance gets its own working directory). +public struct CHVirtualMachineManager: VirtualMachineManager { + private let kernel: Kernel + private let initialFilesystem: Mount + private let chBinary: URL + private let virtiofsdBinaryOverride: URL? + private let runtimeRoot: URL + private let group: (any EventLoopGroup)? + private let logger: Logger? + + /// - Parameters: + /// - kernel: The Linux kernel image used for every VM this manager creates. + /// - initialFilesystem: The rootfs `Mount` (typically the `init.ext4` + /// blob produced by `make init`). + /// - chBinary: Path to the `cloud-hypervisor` binary; if nil, looked + /// up on `PATH`. Validated at init time. + /// - virtiofsdBinary: Path to `virtiofsd`; if nil, looked up on `PATH` + /// lazily — only when a virtiofs share is actually used. A VM that + /// boots with only block-device mounts can run without virtiofsd + /// installed at all. + /// - runtimeRoot: Directory under which per-VM working directories are + /// created. Defaults to `/run/containerization/ch`. The directory is + /// created with mode `0o700` so per-VM UDS sockets (api.sock, + /// vsock.sock, vfs-*.sock) inside aren't reachable by other local + /// users. `/run` is tmpfs on every modern Linux distro, so contents + /// don't survive reboot — which is the right lifecycle for VM + /// runtime state. + /// - group: Optional shared NIO `EventLoopGroup`; if nil, each VM + /// spawns its own. + public init( + kernel: Kernel, + initialFilesystem: Mount, + chBinary: URL? = nil, + virtiofsdBinary: URL? = nil, + runtimeRoot: URL? = nil, + group: (any EventLoopGroup)? = nil, + logger: Logger? = nil + ) throws { + self.kernel = kernel + self.initialFilesystem = initialFilesystem + self.chBinary = try Self.resolveBinary(chBinary, name: "cloud-hypervisor") + if let virtiofsdBinary { + // Validate explicit overrides at init time so misconfiguration + // surfaces early. PATH-lookup deferral only applies when no + // override is supplied. + guard FileManager.default.isExecutableFile(atPath: virtiofsdBinary.path) else { + throw ContainerizationError( + .notFound, + message: "virtiofsd not executable at \(virtiofsdBinary.path)" + ) + } + } + self.virtiofsdBinaryOverride = virtiofsdBinary + let runtimeRoot = runtimeRoot ?? URL(fileURLWithPath: "/run/containerization/ch") + try FileManager.default.createDirectory( + at: runtimeRoot, + withIntermediateDirectories: true, + attributes: [.posixPermissions: 0o700] + ) + // createDirectory only sets attributes on directories it creates, so + // explicitly tighten an existing dir if a previous run left it at a + // looser mode. + try? FileManager.default.setAttributes([.posixPermissions: 0o700], ofItemAtPath: runtimeRoot.path) + self.runtimeRoot = runtimeRoot + self.group = group + self.logger = logger + } + + public func create(config: some VMCreationConfig) async throws -> any VirtualMachineInstance { + let vmConfig = config.configuration + + var instanceConfig = CHVirtualMachineInstance.Configuration() + instanceConfig.cpus = vmConfig.cpus + instanceConfig.memoryInBytes = vmConfig.memoryInBytes + instanceConfig.interfaces = vmConfig.interfaces + instanceConfig.mountsByID = vmConfig.mountsByID + instanceConfig.bootLog = vmConfig.bootLog + instanceConfig.extensions = vmConfig.extensions + instanceConfig.kernel = kernel + instanceConfig.initialFilesystem = initialFilesystem + + return try CHVirtualMachineInstance( + group: group, + config: instanceConfig, + runtimeRoot: runtimeRoot, + chBinary: chBinary, + virtiofsdBinary: virtiofsdBinaryOverride, + logger: logger + ) + } + + // MARK: - Binary resolution + + /// Resolve a binary path, accepting an explicit override or falling back to + /// `PATH` lookup. Used both at manager init for `cloud-hypervisor` and + /// lazily by the CH instance / hotplug provider for `virtiofsd` so a + /// block-only VM doesn't require virtiofsd to be installed. + static func resolveBinary(_ override: URL?, name: String) throws -> URL { + if let override { + guard FileManager.default.isExecutableFile(atPath: override.path) else { + throw ContainerizationError( + .notFound, + message: "\(name) not executable at \(override.path)" + ) + } + return override + } + + let path = ProcessInfo.processInfo.environment["PATH"] ?? "/usr/local/bin:/usr/bin:/bin" + for dir in path.split(separator: ":") where !dir.isEmpty { + let candidate = URL(fileURLWithPath: String(dir)).appendingPathComponent(name) + if FileManager.default.isExecutableFile(atPath: candidate.path) { + return candidate + } + } + + throw ContainerizationError( + .notFound, + message: "could not find \(name) on PATH; pass an explicit URL to CHVirtualMachineManager.init" + ) + } +} +#endif diff --git a/Sources/Containerization/ContainerManager.swift b/Sources/Containerization/ContainerManager.swift index 2e2edd7b..1b3a4089 100644 --- a/Sources/Containerization/ContainerManager.swift +++ b/Sources/Containerization/ContainerManager.swift @@ -373,13 +373,6 @@ public struct ContainerManager: Sendable { } } -extension CIDRv4 { - /// The gateway address of the network. - public var gateway: IPv4Address { - IPv4Address(self.lower.value + 1) - } -} - extension CIDRv6 { /// The gateway address of the network. public var gateway: IPv6Address { diff --git a/Sources/Containerization/HostDefaultRoute.swift b/Sources/Containerization/HostDefaultRoute.swift new file mode 100644 index 00000000..e2312079 --- /dev/null +++ b/Sources/Containerization/HostDefaultRoute.swift @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import Foundation + +/// Reads the host's default IPv4 egress interface from `/proc/net/route`. +/// +/// `/proc/net/route` columns (tab-separated): +/// +/// Iface Destination Gateway Flags RefCnt Use Metric Mask MTU Window IRTT +/// +/// Numeric fields are hex with bytes in network order (so `0102A8C0` is +/// `192.168.2.1`). Pure-string parsing keeps this cross-platform-testable +/// even though `/proc/net/route` itself only exists on Linux. +enum HostDefaultRoute { + /// `RTF_GATEWAY` from ``. Set on rows representing a gateway route. + private static let RTF_GATEWAY: UInt32 = 0x0002 + + /// Parse the contents of `/proc/net/route` and return the iface for the + /// default route (destination 0.0.0.0 with `RTF_GATEWAY`). When multiple + /// default routes exist, the one with the lowest metric wins. + static func parseEgress(procNetRoute contents: String) -> String? { + var best: (iface: String, metric: UInt64)? + for (i, line) in contents.split(separator: "\n", omittingEmptySubsequences: true).enumerated() { + if i == 0 { continue } // header + let cols = line.split(separator: "\t", omittingEmptySubsequences: false) + guard cols.count >= 11 else { continue } + let iface = String(cols[0]) + let destination = cols[1] + let flagsHex = cols[3] + let metricStr = cols[6] + + guard destination == "00000000" else { continue } + guard let flags = UInt32(flagsHex, radix: 16), + flags & RTF_GATEWAY != 0 + else { continue } + let metric = UInt64(metricStr) ?? UInt64.max + if let current = best, metric >= current.metric { + continue + } + best = (iface, metric) + } + return best?.iface + } + + /// Read `/proc/net/route` and return the default-route iface, or nil if + /// the file is missing or no default route exists. + static func currentEgress() -> String? { + guard let contents = try? String(contentsOfFile: "/proc/net/route", encoding: .utf8) else { + return nil + } + return parseEgress(procNetRoute: contents) + } +} diff --git a/Sources/Containerization/IptablesRules.swift b/Sources/Containerization/IptablesRules.swift new file mode 100644 index 00000000..fa5c018e --- /dev/null +++ b/Sources/Containerization/IptablesRules.swift @@ -0,0 +1,102 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +#if os(Linux) +import ContainerizationError +import ContainerizationOS +import Foundation + +/// Thin idempotent wrappers around the `iptables` CLI for use by +/// `BridgeManager`. We don't program nftables directly; modern distros +/// ship the `iptables` binary as a shim over nftables and it's universally +/// available. +enum IptablesRules { + /// Add a rule unless it already exists. `args` is the rule body + /// excluding the leading action (`-A`/`-C`/`-D`). + /// + /// Implementation: run `iptables -C ` first; if exit 0, the rule + /// exists already — return. Otherwise run `iptables -A ` and + /// throw on non-zero exit. + static func ensure(table: String? = nil, args: [String]) throws { + let tableArgs = table.map { ["-t", $0] } ?? [] + let check = try run(args: tableArgs + ["-C"] + args) + if check.exit == 0 { return } + let add = try run(args: tableArgs + ["-A"] + args) + if add.exit != 0 { + throw ContainerizationError( + .internalError, + message: """ + iptables -A \(args.joined(separator: " ")) failed (exit \(add.exit))\ + \(add.stderr.isEmpty ? "" : ": \(add.stderr)") + """ + ) + } + } + + /// Best-effort delete. Ignores non-zero exit (rule may not exist). + static func remove(table: String? = nil, args: [String]) { + let tableArgs = table.map { ["-t", $0] } ?? [] + _ = try? run(args: tableArgs + ["-D"] + args) + } + + /// Captured outcome of a single `iptables` invocation. + private struct InvocationResult { + let exit: Int32 + let stderr: String + } + + /// Run `iptables` with the given args, returning the exit status and any + /// stderr the binary emitted. Throws if no `iptables` binary is found. + private static func run(args: [String]) throws -> InvocationResult { + // ContainerizationOS.Command uses execve() under the hood, which + // requires an absolute path. Probe the two paths iptables actually + // ships at on Linux distros — /usr/sbin first (Debian, Ubuntu, + // Fedora, Alpine, RHEL), then /sbin (older / busybox-style). + let candidates = ["/usr/sbin/iptables", "/sbin/iptables"] + // Open /dev/null fresh rather than using FileHandle.nullDevice: + // swift-corelibs-foundation's nullDevice uses a sentinel fd that + // doesn't survive dup2() in Command's child, producing EBADF on exec. + // Capture stderr through a pipe so failures surface with the actual + // iptables error (locked xtables, missing kernel module, conflicting + // rule) instead of an opaque exit code. + let devNullOut = FileHandle(forWritingAtPath: "/dev/null") + let stderrPipe = Pipe() + defer { + try? devNullOut?.close() + try? stderrPipe.fileHandleForReading.close() + } + for path in candidates where FileManager.default.isExecutableFile(atPath: path) { + var cmd = Command(path, arguments: args) + cmd.stdout = devNullOut + cmd.stderr = stderrPipe.fileHandleForWriting + try cmd.start() + // Close the parent's write end so the read end sees EOF when + // iptables exits, even if iptables itself never writes anything. + try? stderrPipe.fileHandleForWriting.close() + let exit = try cmd.wait() + let data = (try? stderrPipe.fileHandleForReading.readToEnd()) ?? Data() + let stderr = + String(data: data, encoding: .utf8)? + .trimmingCharacters(in: .whitespacesAndNewlines) ?? "" + return InvocationResult(exit: exit, stderr: stderr) + } + throw ContainerizationError( + .notFound, + message: "iptables not found at /usr/sbin/iptables or /sbin/iptables; install iptables (or its nftables shim)" + ) + } +} +#endif diff --git a/Sources/Containerization/Kernel+Commandline.swift b/Sources/Containerization/Kernel+Commandline.swift new file mode 100644 index 00000000..a5ff83c4 --- /dev/null +++ b/Sources/Containerization/Kernel+Commandline.swift @@ -0,0 +1,50 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +extension Kernel { + /// Build the `init=/sbin/vminitd` Linux kernel command line for the given + /// rootfs type. Used by both the VZ and cloud-hypervisor backends since + /// the guest's vminitd init contract is identical across VMMs. + func linuxCommandline(initialFilesystem: Mount) -> String { + var args = self.commandLine.kernelArgs + + args.append("init=/sbin/vminitd") + // rootfs is always mounted read-only. + args.append("ro") + + switch initialFilesystem.type { + case "virtiofs": + args.append(contentsOf: [ + "rootfstype=virtiofs", + "root=rootfs", + ]) + case "ext4": + args.append(contentsOf: [ + "rootfstype=ext4", + "root=/dev/vda", + ]) + default: + fatalError("unsupported initfs filesystem \(initialFilesystem.type)") + } + + if self.commandLine.initArgs.count > 0 { + args.append("--") + args.append(contentsOf: self.commandLine.initArgs) + } + + return args.joined(separator: " ") + } +} diff --git a/Sources/Containerization/LinuxBridgedNetwork.swift b/Sources/Containerization/LinuxBridgedNetwork.swift new file mode 100644 index 00000000..de62ab68 --- /dev/null +++ b/Sources/Containerization/LinuxBridgedNetwork.swift @@ -0,0 +1,181 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +#if os(Linux) +import ContainerizationError +import ContainerizationExtras +import ContainerizationNetlink +import Crypto +import Foundation + +/// A `Network` implementation backed by Linux TAP devices, optionally +/// enslaved to a pre-existing bridge. The bridge itself is **not** managed +/// by this type — callers own its creation, teardown, and any NAT/firewall +/// rules. This abstraction only handles per-VM TAP lifecycle and IPv4 +/// address allocation within a configured subnet. +/// +/// Mirrors the `VmnetNetwork` shape on macOS so the two backends are +/// interchangeable from a `LinuxContainer`/`Network` consumer's POV. +/// +/// Requires `CAP_NET_ADMIN` for TAP creation and bridge enslavement. +public struct LinuxBridgedNetwork: Network { + /// The IPv4 subnet from which container interfaces are allocated. + public let subnet: CIDRv4 + /// The default-route gateway for containers attached to this network. + public let ipv4Gateway: IPv4Address + /// Optional bridge name to enslave each created TAP to. + public let bridge: String? + /// MTU applied to every TAP this network creates. + public let mtu: UInt32 + + private var allocator: Allocator + private var taps: [String: TAPDevice] + + /// Per-id rotating IPv4 allocator. Mirrors `VmnetNetwork.Allocator` + /// verbatim: lower bound = `subnet.lower + 2` (gateway = `lower + 1`, + /// network = `lower`), size = `upper - lower - 3` (broadcast = `upper`, + /// also reserved). + struct Allocator: Sendable { + private let addressAllocator: any AddressAllocator + private let cidr: CIDRv4 + private var allocations: [String: UInt32] + + init(cidr: CIDRv4) throws { + self.cidr = cidr + self.allocations = [:] + let span = cidr.upper.value - cidr.lower.value + guard span >= 4 else { + throw ContainerizationError( + .invalidArgument, + message: "subnet \(cidr) has no usable host addresses (need at least 4)" + ) + } + let size = Int(span - 3) + self.addressAllocator = try UInt32.rotatingAllocator( + lower: cidr.lower.value + 2, + size: UInt32(size) + ) + } + + mutating func allocate(_ id: String) throws -> CIDRv4 { + if allocations[id] != nil { + throw ContainerizationError( + .exists, + message: "allocation with id \(id) already exists" + ) + } + let index = try addressAllocator.allocate() + allocations[id] = index + return try CIDRv4(IPv4Address(index), prefix: cidr.prefix) + } + + mutating func release(_ id: String) throws { + if let index = allocations[id] { + try addressAllocator.release(index) + allocations.removeValue(forKey: id) + } + } + } + + /// Create a Linux bridged network. + /// + /// - Parameters: + /// - subnet: The IPv4 subnet to allocate container addresses from. + /// - gateway: Default-route gateway IPv4. If nil, defaults to + /// `subnet.gateway` (= `lower + 1`). + /// - bridge: Existing bridge name to enslave each TAP to, or nil for + /// standalone TAPs. Validated at init time via netlink. + /// - mtu: MTU applied to every created TAP (default 1500). + public init( + subnet: CIDRv4, + gateway: IPv4Address? = nil, + bridge: String? = nil, + mtu: UInt32 = 1500 + ) throws { + self.subnet = subnet + self.ipv4Gateway = gateway ?? subnet.gateway + self.bridge = bridge + self.mtu = mtu + self.allocator = try Allocator(cidr: subnet) + self.taps = [:] + + if let bridge { + // Validate via the public linkGet — empty result or netlink error + // means the bridge does not exist or is unreachable. + let session = try NetlinkSession(socket: DefaultNetlinkSocket()) + do { + let links = try session.linkGet(interface: bridge) + guard !links.isEmpty else { + throw ContainerizationError( + .notFound, + message: "bridge \(bridge) not found" + ) + } + } catch let err as ContainerizationError { + throw err + } catch { + throw ContainerizationError( + .notFound, + message: "bridge \(bridge) not found: \(error)" + ) + } + } + } + + public mutating func createInterface(_ id: String) throws -> Interface? { + let cidr = try allocator.allocate(id) + let tapName = Self.derivedTAPName(forID: id) + + let device: TAPDevice + do { + device = try TAPDevice( + name: tapName, + bridge: bridge, + mtu: mtu, + macAddress: nil + ) + } catch { + // Roll back the allocator so the IP isn't leaked. + try? allocator.release(id) + throw error + } + taps[id] = device + + return TAPInterface( + tapName: device.name, + ipv4Address: cidr, + ipv4Gateway: ipv4Gateway, + macAddress: nil, + mtu: mtu + ) + } + + public mutating func releaseInterface(_ id: String) throws { + if let device = taps.removeValue(forKey: id) { + device.close() + } + try allocator.release(id) + } + + /// Derive a deterministic, IFNAMSIZ-compliant TAP name from a container id. + /// Format: `czt-<10 hex chars>` (14 chars total; IFNAMSIZ-1 = 15). + static func derivedTAPName(forID id: String) -> String { + let hash = SHA256.hash(data: Data(id.utf8)) + let hex = hash.map { String(format: "%02x", $0) }.joined() + return "czt-" + String(hex.prefix(10)) + } +} +#endif diff --git a/Sources/Containerization/LinuxContainer.swift b/Sources/Containerization/LinuxContainer.swift index cfdc5f64..62ffd65f 100644 --- a/Sources/Containerization/LinuxContainer.swift +++ b/Sources/Containerization/LinuxContainer.swift @@ -589,18 +589,57 @@ extension LinuxContainer { try await vm.start() do { + let mountsForAgent = containerMounts try await vm.withAgent { agent in try await agent.standardSetup() - // Mount the unified virtiofs share at /run/virtiofs - // All virtiofs directories appear as subdirectories here - try await agent.mount( - ContainerizationOCI.Mount( - type: "virtiofs", - source: "virtiofs", - destination: "/run/virtiofs", - options: [] - )) + // Mount the unified virtiofs share at /run/virtiofs only + // when at least one of the container's mounts is virtiofs + // — the bind-mount transform below derives its sources + // from /run/virtiofs/{tag}, so the unified share is only + // load-bearing when there are virtiofs mounts. The macOS + // VZ backend always exposes the virtiofs device (even + // with zero shares), but the cloud-hypervisor backend + // only spawns virtiofsd when shares exist; mounting an + // unbacked tag fails with EINVAL. + let hasVirtiofsMount = mountsForAgent.contains { mount in + if case .virtiofs = mount.runtimeOptions { return true } + return false + } + if hasVirtiofsMount { + // VZ exposes ONE virtio-fs device with tag "virtiofs" + // and multiple sources as subdirs (VZMultipleDirectoryShare). + // The CH backend exposes one device per source-hash + // tag instead, so the guest must mount each tag + // separately at /run/virtiofs/. The bind-mount + // transform below uses /run/virtiofs/ in both + // cases, so this branch is only about how /run/virtiofs + // gets populated. + if vm.virtiofsLayout == .perTag { + try await agent.mkdir(path: "/run/virtiofs", all: true, perms: 0o755) + let virtiofsAttachments = (vm.mounts[self.id] ?? []).filter { $0.type == "virtiofs" } + let uniqueTags = Set(virtiofsAttachments.map(\.source)) + for tag in uniqueTags { + let dest = "/run/virtiofs/\(tag)" + try await agent.mkdir(path: dest, all: true, perms: 0o755) + try await agent.mount( + ContainerizationOCI.Mount( + type: "virtiofs", + source: tag, + destination: dest, + options: [] + )) + } + } else { + try await agent.mount( + ContainerizationOCI.Mount( + type: "virtiofs", + source: "virtiofs", + destination: "/run/virtiofs", + options: [] + )) + } + } guard let attachments = vm.mounts[self.id] else { throw ContainerizationError(.notFound, message: "rootfs mount not found") diff --git a/Sources/Containerization/LinuxPod.swift b/Sources/Containerization/LinuxPod.swift index dbe4365e..a324f9c8 100644 --- a/Sources/Containerization/LinuxPod.swift +++ b/Sources/Containerization/LinuxPod.swift @@ -520,6 +520,19 @@ extension LinuxPod { mountsByID[self.id] = podVolumeMounts } + // Capture into an immutable `let` so the value is safely usable + // from the concurrent `withAgent` closure below. The container + // path makes the same decision in LinuxContainer.create — CH + // only attaches a virtiofs device when shares are configured, + // so mounting an unbacked /run/virtiofs would fail with EINVAL + // on the CH backend. + let hasVirtiofsMount = mountsByID.values.contains { mounts in + mounts.contains { mount in + if case .virtiofs = mount.runtimeOptions { return true } + return false + } + } + var vmConfig = VMConfiguration( cpus: self.config.cpus, memoryInBytes: self.config.memoryInBytes, @@ -543,16 +556,41 @@ extension LinuxPod { try await vm.withAgent { agent in try await agent.standardSetup() - // Mount the unified virtiofs share at /run/virtiofs - // All virtiofs directories appear as subdirectories here - try await agent.mkdir(path: "/run/virtiofs", all: true, perms: 0o755) - try await agent.mount( - ContainerizationOCI.Mount( - type: "virtiofs", - source: "virtiofs", - destination: "/run/virtiofs", - options: [] - )) + // Mount the unified virtiofs share at /run/virtiofs only + // when at least one container has a virtiofs mount. VZ + // tolerates the unbacked mount; CH does not. + if hasVirtiofsMount { + try await agent.mkdir(path: "/run/virtiofs", all: true, perms: 0o755) + if vm.virtiofsLayout == .perTag { + // CH backend: one virtio-fs device per source-hash + // tag, so mount each tag separately at + // /run/virtiofs/. See LinuxContainer for the + // VZ vs. CH model split. + var seenTags: Set = [] + for (_, attached) in vm.mounts { + for entry in attached where entry.type == "virtiofs" { + guard seenTags.insert(entry.source).inserted else { continue } + let dest = "/run/virtiofs/\(entry.source)" + try await agent.mkdir(path: dest, all: true, perms: 0o755) + try await agent.mount( + ContainerizationOCI.Mount( + type: "virtiofs", + source: entry.source, + destination: dest, + options: [] + )) + } + } + } else { + try await agent.mount( + ContainerizationOCI.Mount( + type: "virtiofs", + source: "virtiofs", + destination: "/run/virtiofs", + options: [] + )) + } + } // Create pause container if PID namespace sharing is enabled if shareProcessNamespace { diff --git a/Sources/Containerization/LinuxProcess.swift b/Sources/Containerization/LinuxProcess.swift index 0d8300ba..7583a9ba 100644 --- a/Sources/Containerization/LinuxProcess.swift +++ b/Sources/Containerization/LinuxProcess.swift @@ -125,7 +125,7 @@ public final class LinuxProcess: Sendable { extension LinuxProcess { func setupIO(listeners: [VsockListener?]) async throws -> [FileHandle?] { - let handles = try await Timeout.run(seconds: 3) { + let handles = try await Timeout.run(seconds: 30) { try await withThrowingTaskGroup(of: (Int, FileHandle?).self) { group in var results = [FileHandle?](repeating: nil, count: 3) diff --git a/Sources/Containerization/Mount+CH.swift b/Sources/Containerization/Mount+CH.swift new file mode 100644 index 00000000..2e25e2b9 --- /dev/null +++ b/Sources/Containerization/Mount+CH.swift @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import CloudHypervisor +import Foundation + +extension Mount { + /// Returns a `CloudHypervisor.DiskConfig` describing this mount as a virtio-blk + /// device, or `nil` if the mount is not a block device. + /// + /// The caller supplies the device id; cloud-hypervisor uses it both as a + /// stable handle for hotplug-remove and as the udev/sysfs identifier inside + /// the guest. + /// + /// `imageType` defaults to `.raw` because Containerization mounts are + /// always raw block files (ext4 produced by the EXT4 unpacker, NBD URLs, + /// etc.). When cloud-hypervisor doesn't see an `image_type` it falls + /// back to `Unknown` and silently rejects all writes — see CH's + /// `virtio-devices/src/block.rs` "Attempting to write to sector 0 on a + /// disk without specifying image_type" warning. + public func chDiskConfig(id: String) -> CloudHypervisor.DiskConfig? { + guard case .virtioblk = self.runtimeOptions else { + return nil + } + return CloudHypervisor.DiskConfig( + path: self.source, + readonly: self.options.contains("ro"), + direct: nil, + iommu: nil, + id: id, + pciSegment: nil, + imageType: .raw + ) + } + + /// Returns a `CloudHypervisor.FsConfig` describing this mount as a virtio-fs + /// share served by an out-of-process `virtiofsd`, or `nil` if the mount is + /// not a virtiofs share. + /// + /// `tag` is the guest-side mount tag and `socketPath` is the UDS path the + /// virtiofsd subprocess publishes. Both are owned by the caller. + public func chFsConfig(tag: String, socketPath: String, id: String) -> CloudHypervisor.FsConfig? { + guard case .virtiofs = self.runtimeOptions else { + return nil + } + return CloudHypervisor.FsConfig( + tag: tag, + socket: socketPath, + numQueues: nil, + queueSize: nil, + id: id, + pciSegment: nil + ) + } +} + +/// Build the host-side UDS path for a virtiofsd ↔ cloud-hypervisor socket. +/// +/// `tag` is the full source-hash (used as the FUSE tag advertised to the +/// guest); the socket *path* uses only an 8-char prefix because the full +/// path — `/virtiofs-.sock` with a 36-char tag — overshoots +/// Linux's 108-byte `SUN_LEN` limit. 32 bits of disambiguation is more +/// than enough within a single VM (handful of distinct virtiofs sources). +func chVirtiofsSocketURL(workDir: URL, tag: String) -> URL { + let short = String(tag.prefix(8)) + return workDir.appendingPathComponent("vfs-\(short).sock") +} diff --git a/Sources/Containerization/Mount.swift b/Sources/Containerization/Mount.swift index b72436bd..b516deae 100644 --- a/Sources/Containerization/Mount.swift +++ b/Sources/Containerization/Mount.swift @@ -21,6 +21,14 @@ import Foundation import Virtualization #endif +#if os(Linux) +#if canImport(Musl) +import Musl +#elseif canImport(Glibc) +import Glibc +#endif +#endif + /// A filesystem mount exposed to a container. public struct Mount: Sendable { /// The filesystem or mount type. This is the string @@ -127,14 +135,23 @@ public struct Mount: Sendable { ) } - #if os(macOS) /// Clone the Mount to the provided path. /// - /// This uses `clonefile` to provide a copy-on-write copy of the Mount. + /// On macOS this uses `clonefile` (via `FileManager.copyItem`) for a + /// copy-on-write copy when the underlying filesystem supports it. On + /// Linux it tries `ioctl(FICLONE)` first (CoW on btrfs / xfs / bcachefs) + /// and falls back to a `SEEK_DATA`/`SEEK_HOLE` sparse copy that copies + /// only data ranges. This matters for EXT4 images produced by + /// `EXT4+Formatter`, which sparse-allocate via `lseek + 1-byte write` — + /// a non-sparse copy would inflate a ~50 MB alpine rootfs into a + /// fully-allocated 2 GiB clone and exhaust the integration suite's + /// writable layer in ~30 tests. public func clone(to: String) throws -> Self { - let fm = FileManager.default - let src = self.source - try fm.copyItem(atPath: src, toPath: to) + #if os(Linux) + try Self.linuxSparseCopy(from: self.source, to: to) + #else + try FileManager.default.copyItem(atPath: self.source, toPath: to) + #endif return .init( type: self.type, @@ -144,6 +161,121 @@ public struct Mount: Sendable { runtimeOptions: self.runtimeOptions ) } + + #if os(Linux) + /// Copy `src` to `dst`, preferring a CoW reflink (`ioctl(FICLONE)`) and + /// falling back to a SEEK_DATA/SEEK_HOLE sparse copy. The reflink path + /// succeeds on btrfs / xfs (`reflink=1`) / bcachefs; on ext4 / tmpfs / + /// overlayfs it fails fast with EOPNOTSUPP/EXDEV/EINVAL and we walk + /// the hole map instead. The sparse-copy path also handles + /// filesystems that don't support hole-seeking (the very first + /// SEEK_DATA returns EINVAL) by copying the remainder verbatim. Mode + /// bits are preserved from the source. + private static func linuxSparseCopy(from src: String, to dst: String) throws { + // Stable Linux ABI since 3.1 (ext4, tmpfs, overlayfs all support it). + // Re-declared here so the build doesn't depend on whether the + // Glibc/Musl Swift overlay re-exports them. + let SEEK_DATA: Int32 = 3 + let SEEK_HOLE: Int32 = 4 + // _IOW(0x94, 9, int) on every Linux arch we target (x86_64, aarch64). + let FICLONE: CUnsignedLong = 0x4004_9409 + + let srcFd = open(src, O_RDONLY | O_CLOEXEC) + guard srcFd >= 0 else { + throw POSIXError(POSIXErrorCode(rawValue: errno) ?? .EIO) + } + defer { _ = close(srcFd) } + + var st = stat() + guard fstat(srcFd, &st) == 0 else { + throw POSIXError(POSIXErrorCode(rawValue: errno) ?? .EIO) + } + let size = off_t(st.st_size) + let mode = mode_t(st.st_mode & 0o7777) + + let dstFd = open(dst, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, mode) + guard dstFd >= 0 else { + throw POSIXError(POSIXErrorCode(rawValue: errno) ?? .EIO) + } + defer { _ = close(dstFd) } + + // FICLONE atomically replaces dst's contents with a CoW clone of + // src — sets size and contents in one shot, no ftruncate needed + // afterwards. On failure FICLONE guarantees dst is untouched, so + // we can safely fall through to the sparse-copy path. ioctl(2) is + // variadic; type-pun via a fixed-arity function pointer (same + // pattern as ContainerizationOS.Socket). + let ioctlFICLONE: @convention(c) (CInt, CUnsignedLong, CInt) -> CInt = ioctl + if ioctlFICLONE(dstFd, FICLONE, srcFd) == 0 { + return + } + + // Set the destination size up front so any trailing hole survives — + // we only ever pwrite data ranges, never zero-fill. + guard ftruncate(dstFd, size) == 0 else { + throw POSIXError(POSIXErrorCode(rawValue: errno) ?? .EIO) + } + + let bufSize = 1 << 20 // 1 MiB + let buf = UnsafeMutableRawPointer.allocate(byteCount: bufSize, alignment: 16) + defer { buf.deallocate() } + + var pos: off_t = 0 + while pos < size { + let dataStart = lseek(srcFd, pos, SEEK_DATA) + if dataStart < 0 { + // ENXIO: no more data — rest is hole, already covered by ftruncate. + if errno == ENXIO { + break + } + // EINVAL/ENOTSUP: filesystem doesn't support SEEK_DATA. Treat + // the remainder as one big data range and copy it verbatim. + try Self.copyRange(srcFd: srcFd, dstFd: dstFd, start: pos, end: size, buf: buf, bufSize: bufSize) + break + } + + // SEEK_HOLE returns end-of-file when there's no trailing hole. + let dataEnd = lseek(srcFd, dataStart, SEEK_HOLE) + let endOff: off_t = dataEnd < 0 ? size : dataEnd + + try Self.copyRange(srcFd: srcFd, dstFd: dstFd, start: dataStart, end: endOff, buf: buf, bufSize: bufSize) + pos = endOff + } + } + + private static func copyRange( + srcFd: Int32, + dstFd: Int32, + start: off_t, + end: off_t, + buf: UnsafeMutableRawPointer, + bufSize: Int + ) throws { + var off = start + while off < end { + let want = Int(min(off_t(bufSize), end - off)) + let nread = pread(srcFd, buf, want, off) + if nread < 0 { + if errno == EINTR { continue } + throw POSIXError(POSIXErrorCode(rawValue: errno) ?? .EIO) + } + if nread == 0 { + // Source shorter than fstat reported — shouldn't happen, but + // bail rather than spin. + return + } + var written = 0 + while written < nread { + let nwrite = pwrite(dstFd, buf.advanced(by: written), nread - written, off + off_t(written)) + if nwrite < 0 { + if errno == EINTR { continue } + throw POSIXError(POSIXErrorCode(rawValue: errno) ?? .EIO) + } + written += nwrite + } + off += off_t(nread) + } + } #endif } diff --git a/Sources/Containerization/SandboxOverrides.swift b/Sources/Containerization/SandboxOverrides.swift new file mode 100644 index 00000000..b94024e4 --- /dev/null +++ b/Sources/Containerization/SandboxOverrides.swift @@ -0,0 +1,100 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +#if os(Linux) +import Foundation + +/// Per-component opt-ins to weaken the upstream-secure spawn flags for +/// cloud-hypervisor and virtiofsd. Each flag is independent so an operator +/// can target the minimum hardening that needs to come off — e.g. running +/// inside apple/container's `--virtualization` dev container needs both, +/// but a future bare-metal Linux host with a custom seccomp policy might +/// only need one. +/// +/// Read once per process at first reference. Default false (secure). +/// +/// **Legacy alias.** `CONTAINERIZATION_RELAXED_SANDBOX=1` continues to flip +/// every flag here for back-compat with the original combined toggle. New +/// callers should prefer the per-component vars below. +enum SandboxOverrides { + /// When set, cloud-hypervisor is launched with `--seccomp false`. + /// Disables CH's userspace seccomp BPF filter; the kernel's filter + /// (whatever the host policy is) still applies. + /// + /// Env: `CONTAINERIZATION_NO_CH_SECCOMP=1` + static let chSeccompDisabled: Bool = + boolEnv("CONTAINERIZATION_NO_CH_SECCOMP") || legacyRelaxedSandbox + + /// When set, virtiofsd is launched with `--sandbox none`. Disables + /// virtiofsd's userns + pivot_root + seccomp setup. Combined with the + /// vendored cap-drop patch (`scripts/patches/virtiofsd-skip-cap-drop-with-sandbox-none.patch`) + /// at build time, the daemon retains its parent's capabilities — so + /// only enable this when the parent process is the trust boundary you + /// want. + /// + /// Env: `CONTAINERIZATION_NO_VIRTIOFSD_SANDBOX=1` + static let virtiofsdSandboxDisabled: Bool = + boolEnv("CONTAINERIZATION_NO_VIRTIOFSD_SANDBOX") || legacyRelaxedSandbox + + /// True if any override is currently in effect — used by callers that + /// want to log a single banner regardless of which flag is set. + static var anyEnabled: Bool { + chSeccompDisabled || virtiofsdSandboxDisabled + } + + /// Back-compat alias: enables every per-component flag in one shot. + private static let legacyRelaxedSandbox: Bool = + boolEnv("CONTAINERIZATION_RELAXED_SANDBOX") + + private static func boolEnv(_ name: String) -> Bool { + ProcessInfo.processInfo.environment[name] == "1" + } +} + +/// Minimal environment allowlist for child processes we spawn (`CHProcess`, +/// `VirtiofsdProcess`). Inheriting the parent's full env exposes any +/// secrets the calling tool happens to have set (`AWS_*`, `KUBE_*`, +/// `*_TOKEN`, etc.) to a binary that has no use for them. Only the +/// variables below are forwarded — extend this list when a new spawn-time +/// dependency surfaces, and document why. +/// +/// - `PATH`, `HOME`: minimum POSIX hygiene; some libc/setuid paths look at +/// these even for self-contained binaries. +/// - `RUST_LOG`, `RUST_BACKTRACE`: cloud-hypervisor and virtiofsd are Rust +/// binaries; pass these through if the operator has set them so +/// debugging is unimpaired. +enum ChildEnvironment { + /// Construct a minimal environment for the child as `KEY=value` strings + /// suitable for `Command.environment`. Variables not present in the + /// parent env are simply omitted. + static func minimal() -> [String] { + let allowlist = ["PATH", "HOME", "RUST_LOG", "RUST_BACKTRACE"] + let parent = ProcessInfo.processInfo.environment + var entries: [String] = [] + // PATH falls back to a sane default since Command's execve needs an + // absolute path anyway, but child Rust binaries occasionally probe + // PATH for helper tools. + let path = parent["PATH"] ?? "/usr/sbin:/usr/bin:/sbin:/bin" + entries.append("PATH=\(path)") + for key in allowlist where key != "PATH" { + if let value = parent[key] { + entries.append("\(key)=\(value)") + } + } + return entries + } +} +#endif diff --git a/Sources/Containerization/TAPDevice.swift b/Sources/Containerization/TAPDevice.swift new file mode 100644 index 00000000..bae6e5a2 --- /dev/null +++ b/Sources/Containerization/TAPDevice.swift @@ -0,0 +1,155 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +#if os(Linux) +import CShim +import ContainerizationError +import ContainerizationExtras +import ContainerizationNetlink +import Foundation +import Synchronization + +#if canImport(Musl) +import Musl +let osClose = Musl.close +#elseif canImport(Glibc) +import Glibc +let osClose = Glibc.close +#endif + +/// A Linux TAP network device whose kernel interface lives only as long as +/// this `TAPDevice` instance. Created via `/dev/net/tun` + `ioctl(TUNSETIFF)`, +/// optionally enslaved to a pre-existing bridge, with MTU/MAC/UP applied via +/// netlink. The fd is held internally; closing it (explicitly or via deinit) +/// removes the interface from the kernel. +/// +/// `TUNSETPERSIST` is never called, so process death also cleans up the +/// device automatically. Cloud-hypervisor opens the same TAP **by name**; +/// the held fd keeps the interface alive across CH's open/close cycle. +/// +/// Requires `CAP_NET_ADMIN`. +public final class TAPDevice: Sendable { + /// The kernel-resolved interface name. May differ from the `name` + /// parameter passed to `init` if the kernel substituted one (e.g. when + /// `nil` was passed and the kernel picked `tapN`). + public let name: String + + public let mtu: UInt32 + + /// The MAC address as set on init, or nil if the kernel auto-assigned one. + /// Not read back from the kernel. + public let macAddress: MACAddress? + + private let _fd: Mutex + + /// Create a TAP device. + /// + /// - Parameters: + /// - name: Desired interface name. Empty or nil = kernel picks (`tap%d`). + /// Length must be < 16 (`IFNAMSIZ - 1`). + /// - bridge: Name of an existing bridge to enslave the TAP to, or nil. + /// - mtu: MTU in bytes (default 1500). + /// - macAddress: Hardware address to set, or nil to leave kernel default. + public init( + name: String? = nil, + bridge: String? = nil, + mtu: UInt32 = 1500, + macAddress: MACAddress? = nil + ) throws { + if let n = name, n.utf8.count >= 16 { + throw ContainerizationError( + .invalidArgument, + message: "TAP name too long: \(n) (must be < 16 chars)" + ) + } + + // 1. Open + TUNSETIFF via CShim. Returns fd on success, -errno on failure. + var resolved = [CChar](repeating: 0, count: 16) + let fd: Int32 = resolved.withUnsafeMutableBufferPointer { buf in + (name ?? "").withCString { reqPtr in + cz_tap_create(reqPtr, buf.baseAddress, 16) + } + } + guard fd >= 0 else { + throw ContainerizationError( + .internalError, + message: "cz_tap_create failed: errno=\(-fd)" + ) + } + + // From here on, any failure must close `fd` to release the kernel iface. + var fdToClean: Int32? = fd + defer { + if let f = fdToClean { + _ = osClose(f) + } + } + + let resolvedName: String = resolved.withUnsafeBufferPointer { buf in + // String(cString:) is deprecated in newer toolchains. Build the + // String from the NUL-terminated UTF-8 bytes directly. + let bytes = buf.prefix(while: { $0 != 0 }).map { UInt8(bitPattern: $0) } + return String(decoding: bytes, as: UTF8.self) + } + + // 2. Apply MAC and master via netlink (single RTM_NEWLINK). + let session = try NetlinkSession(socket: DefaultNetlinkSocket()) + do { + try session.linkSetAttributes( + interface: resolvedName, + macAddress: macAddress, + master: bridge + ) + } catch { + throw ContainerizationError( + .internalError, + message: "linkSetAttributes failed for \(resolvedName): \(error)" + ) + } + + // 3. Bring UP and set MTU. + do { + try session.linkSet(interface: resolvedName, up: true, mtu: mtu) + } catch { + throw ContainerizationError( + .internalError, + message: "linkSet(up:mtu:) failed for \(resolvedName): \(error)" + ) + } + + // 4. Success — store and clear cleanup. + self.name = resolvedName + self.mtu = mtu + self.macAddress = macAddress + self._fd = Mutex(fd) + fdToClean = nil + } + + /// Close the held fd, removing the interface from the kernel. Idempotent. + public func close() { + _fd.withLock { fd in + if let f = fd { + _ = osClose(f) + fd = nil + } + } + } + + deinit { + close() + } +} +#endif diff --git a/Sources/Containerization/VZVirtualMachineInstance.swift b/Sources/Containerization/VZVirtualMachineInstance.swift index a711b2b5..e27fb43e 100644 --- a/Sources/Containerization/VZVirtualMachineInstance.swift +++ b/Sources/Containerization/VZVirtualMachineInstance.swift @@ -566,38 +566,6 @@ extension VZVirtualMachineInstance.Configuration { } } -extension Kernel { - func linuxCommandline(initialFilesystem: Mount) -> String { - var args = self.commandLine.kernelArgs - - args.append("init=/sbin/vminitd") - // rootfs is always set as ro. - args.append("ro") - - switch initialFilesystem.type { - case "virtiofs": - args.append(contentsOf: [ - "rootfstype=virtiofs", - "root=rootfs", - ]) - case "ext4": - args.append(contentsOf: [ - "rootfstype=ext4", - "root=/dev/vda", - ]) - default: - fatalError("unsupported initfs filesystem \(initialFilesystem.type)") - } - - if self.commandLine.initArgs.count > 0 { - args.append("--") - args.append(contentsOf: self.commandLine.initArgs) - } - - return args.joined(separator: " ") - } -} - public protocol VZInterface { func device() throws -> VZVirtioNetworkDeviceConfiguration } diff --git a/Sources/Containerization/VirtiofsdProcess.swift b/Sources/Containerization/VirtiofsdProcess.swift new file mode 100644 index 00000000..7c73bd10 --- /dev/null +++ b/Sources/Containerization/VirtiofsdProcess.swift @@ -0,0 +1,187 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +#if os(Linux) +import ContainerizationError +import ContainerizationExtras +import ContainerizationOS +import Foundation +import Logging +import Synchronization + +#if canImport(Musl) +import Musl +#elseif canImport(Glibc) +import Glibc +#endif + +/// A managed `virtiofsd` subprocess serving a single shared directory. +/// +/// One `VirtiofsdProcess` per virtio-fs share. Cloud Hypervisor connects to +/// the published UDS via its `FsConfig.socket` field. Lifecycle mirrors +/// `CHProcess`: spawn + wait-for-socket on `start()`, SIGTERM/SIGKILL on +/// `terminate()`. +final class VirtiofsdProcess: Sendable { + struct Config: Sendable { + let binary: URL + let socketPath: URL + let sharedDir: URL + let readonly: Bool + } + + private struct State { + var command: Command? + var exitTask: Task? + } + + private let config: Config + private let logger: Logger? + private let state: Mutex + + init(config: Config, logger: Logger?) { + self.config = config + self.logger = logger + self.state = Mutex(State(command: nil, exitTask: nil)) + } + + /// Spawn virtiofsd and wait for its UDS to accept connections. + func start() async throws { + var arguments = [ + "--socket-path", config.socketPath.path, + "--shared-dir", config.sharedDir.path, + ] + if SandboxOverrides.virtiofsdSandboxDisabled { + // virtiofsd defaults to `--sandbox namespace`, which sets up a + // userns + pivot_root + seccomp filter. Inside apple/container's + // --virtualization dev container the default seccomp profile + // SIGSYS-kills processes that hit unfiltered syscalls (same + // reason CH runs with `--seccomp false`). `--sandbox none` + // skips both userns setup and seccomp; safe inside the per-VM + // dev container only. Opt-in via + // CONTAINERIZATION_NO_VIRTIOFSD_SANDBOX=1. + logger?.warning( + "virtiofsd launching with --sandbox none (CONTAINERIZATION_NO_VIRTIOFSD_SANDBOX=1) — userns/pivot_root/seccomp setup disabled" + ) + arguments.append(contentsOf: ["--sandbox", "none"]) + } + if config.readonly { + arguments.append("--readonly") + } + + var command = Command( + config.binary.path, + arguments: arguments, + environment: ChildEnvironment.minimal() + ) + // Inherit stderr so virtiofsd's startup logs surface in the host's + // log stream rather than vanishing into /dev/null (Command's default). + command.stderr = FileHandle.standardError + // Same rationale as CHProcess: keep virtiofsd out of the parent's + // controlling-TTY signal group so Ctrl-C doesn't kill it before our + // own terminate() ladder runs. + command.attrs.setsid = true + do { + try command.start() + } catch { + throw error + } + + let exitTask = Task.detached { [command, logger] in + do { + _ = try command.wait() + } catch { + logger?.error("virtiofsd wait failed: \(error)") + } + } + + state.withLock { + $0.command = command + $0.exitTask = exitTask + } + + try await waitForSocket() + } + + /// SIGTERM → grace window → SIGKILL. Returns once virtiofsd is reaped. + func terminate(graceSeconds: UInt32) async { + guard let command = state.withLock({ $0.command }) else { return } + + _ = command.kill(SIGTERM) + + do { + try await Timeout.run(for: .seconds(Int(graceSeconds))) { + await self.waitForExit() + } + } catch { + logger?.warning("virtiofsd did not exit within \(graceSeconds)s, sending SIGKILL") + _ = command.kill(SIGKILL) + await waitForExit() + } + } + + // MARK: - Private helpers + + private static let socketDeadline: Duration = .seconds(10) + private static let socketPollInterval: Duration = .milliseconds(50) + + private func waitForExit() async { + guard let task = state.withLock({ $0.exitTask }) else { return } + await task.value + } + + private func waitForSocket() async throws { + let clock = ContinuousClock() + let started = clock.now + let deadline = started.advanced(by: Self.socketDeadline) + + while clock.now < deadline { + if Self.isSocketReady(at: config.socketPath) { + let elapsed = clock.now - started + logger?.debug("virtiofsd socket bound in \(elapsed) at \(config.socketPath.path)") + return + } + try? await Task.sleep(for: Self.socketPollInterval) + } + + // Capture diagnostic state before terminating. + let fm = FileManager.default + let socketExists = fm.fileExists(atPath: config.socketPath.path) + let parentExists = fm.fileExists(atPath: config.socketPath.deletingLastPathComponent().path) + let sharedExists = fm.fileExists(atPath: config.sharedDir.path) + let detail = "socketExists=\(socketExists) parentDirExists=\(parentExists) sharedDirExists=\(sharedExists)" + + await terminate(graceSeconds: 5) + throw ContainerizationError( + .timeout, + message: "virtiofsd socket not connectable at \(config.socketPath.path) within \(Self.socketDeadline) [\(detail)]" + ) + } + + private static func isSocketReady(at url: URL) -> Bool { + // Only check that the socket file exists. Do NOT connect — virtiofsd + // runs in vhost-user mode where the first incoming connection is + // treated as the VMM (cloud-hypervisor); when that connection closes, + // virtiofsd exits. A connect-then-close readiness probe therefore + // kills virtiofsd before CH ever gets to it, leaving CH's vm.boot + // failing with "vhost-user: can't connect to peer: No such file + // or directory". + var st = stat() + guard stat(url.path, &st) == 0 else { return false } + return (st.st_mode & S_IFMT) == S_IFSOCK + } + +} +#endif diff --git a/Sources/Containerization/VirtualMachineInstance.swift b/Sources/Containerization/VirtualMachineInstance.swift index 1b834384..302e97ae 100644 --- a/Sources/Containerization/VirtualMachineInstance.swift +++ b/Sources/Containerization/VirtualMachineInstance.swift @@ -26,6 +26,17 @@ public enum VirtualMachineInstanceState: Sendable { case unknown } +/// How the VMM exposes virtiofs devices to the guest. +/// +/// - `unified`: a single virtio-fs device (tag `virtiofs`) carries all +/// shares as subdirectories (Apple's `VZMultipleDirectoryShare` model). +/// - `perTag`: one virtio-fs device per source-hash tag, each mounted +/// separately in the guest (cloud-hypervisor / virtiofsd model). +public enum VirtiofsLayout: Sendable { + case unified + case perTag +} + /// A live instance of a virtual machine. public protocol VirtualMachineInstance: Sendable { associatedtype Agent: VirtualMachineAgent @@ -34,6 +45,10 @@ public protocol VirtualMachineInstance: Sendable { var state: VirtualMachineInstanceState { get } var mounts: [String: [AttachedFilesystem]] { get } + + /// How this VMM exposes virtiofs devices to the guest. Defaults to + /// `.unified` (the VZ-shaped behavior); CH overrides to `.perTag`. + var virtiofsLayout: VirtiofsLayout { get } /// Dial the Agent. It's up the VirtualMachineInstance to determine /// what port the agent is listening on. func dialAgent() async throws -> Agent @@ -81,6 +96,7 @@ public protocol VirtualMachineInstance: Sendable { } extension VirtualMachineInstance { + public var virtiofsLayout: VirtiofsLayout { .unified } public func pause() async throws { throw ContainerizationError(.unsupported, message: "pause") } diff --git a/Sources/Containerization/Vsock+Linux.swift b/Sources/Containerization/Vsock+Linux.swift new file mode 100644 index 00000000..eacd874d --- /dev/null +++ b/Sources/Containerization/Vsock+Linux.swift @@ -0,0 +1,158 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +#if os(Linux) +import ContainerizationError +import ContainerizationOS +import Foundation + +#if canImport(Musl) +import Musl +#elseif canImport(Glibc) +import Glibc +#endif + +// MARK: - Cloud Hypervisor hybrid vsock host-side helpers +// +// Cloud Hypervisor exposes its vsock device to the host as a Unix-domain +// socket pair, not the kernel AF_VSOCK (this avoids the host needing the +// vhost-vsock kernel module). +// +// - Host → guest dials use the "base" UDS (`VsockConfig.socket`) with a +// one-line `CONNECT \n` request, answered by `OK \n`. After +// that the connection is bridged transparently. +// - Guest → host dials are accepted on per-port UDS files at the +// conventional path `_` that the host pre-creates. +// +// Spec: `docs/vsock.md` in the cloud-hypervisor repository. + +/// Returns the conventional per-port UDS path for guest→host vsock connections, +/// derived by suffixing the base socket path with `_`. +func chVsockListenSocketPath(baseSocket: URL, port: UInt32) -> URL { + URL(fileURLWithPath: "\(baseSocket.path)_\(port)") +} + +/// Bind + listen a fresh AF_UNIX SOCK_STREAM at `path`, unlinking any stale +/// socket file at that path first. Returns the listening fd; ownership is +/// transferred to the caller. +/// +/// The socket file is created with mode `perms` (default `0o600`). The +/// per-VM workDir already restricts access via its own `0o700` mode, but +/// tightening the socket itself is cheap defense-in-depth — vminitd's gRPC +/// surface trusts whoever can `connect(2)` and exposes full container +/// control, so any local-user reach into these sockets is a privilege +/// escalation primitive. +func chVsockBindListener(at path: URL, perms: mode_t = 0o600) throws -> Int32 { + let unix = try UnixType(path: path.path, perms: perms, unlinkExisting: true) + let socket = try Socket(type: unix, closeOnDeinit: false) + do { + try socket.listen() + } catch { + try? socket.close() + throw error + } + return socket.fileDescriptor +} + +/// Dial guest port `port` over the cloud-hypervisor hybrid vsock at +/// `baseSocket`. Returns a `FileHandle` wrapping the connected fd; the +/// FileHandle does **not** close the fd on deinit — ownership of the fd +/// transfers to the caller (typically `Vminitd.init`, which hands it to +/// NIO via `withConnectedSocket`; NIO is then responsible for closing it +/// when the channel is torn down). Callers using the FileHandle directly +/// must close the underlying fd themselves. +func chVsockDial(baseSocket: URL, port: UInt32) async throws -> FileHandle { + try await Task.detached { + try chVsockDialSync(baseSocket: baseSocket, port: port) + }.value +} + +// MARK: - Internals + +private func chVsockDialSync(baseSocket: URL, port: UInt32) throws -> FileHandle { + let unix = try UnixType(path: baseSocket.path) + let socket = try Socket(type: unix, closeOnDeinit: false) + + do { + try socket.connect() + // Bound the bootstrap reply read so a hung cloud-hypervisor muxer + // can't pin this thread forever. CH replies within milliseconds in + // healthy operation; 30 s is well outside that and matches the + // CloudHypervisor REST client default. After bootstrap the fd is + // handed to NIO which puts it in non-blocking mode, where + // SO_RCVTIMEO has no effect — so leaving the timeout in place is + // harmless. + try socket.setTimeout(option: .receive, seconds: 30) + let request = "CONNECT \(port)\n" + _ = try socket.write(data: Data(request.utf8)) + let response = try readLine(fd: socket.fileDescriptor) + // Cloud Hypervisor responds with "OK \n" where + // is the local-side port the muxer allocated for this + // forwarded connection — NOT the peer port we asked for. So we just + // require the response to start with "OK " and parse a UInt32 after. + guard response.hasPrefix("OK "), + UInt32(response.dropFirst(3)) != nil + else { + throw ContainerizationError( + .invalidState, + message: "unexpected vsock CONNECT response: \(response.debugDescription)" + ) + } + return FileHandle(fileDescriptor: socket.fileDescriptor, closeOnDealloc: false) + } catch { + try? socket.close() + throw error + } +} + +/// Read CH's hybrid-vsock `CONNECT` reply line (`OK \n`) one +/// byte at a time. Reads from `fd` until a `\n` is seen or `maxLength` is +/// reached; the returned string excludes the terminating newline. We do +/// this by hand because the fd is still in blocking mode (NIO takes over +/// only after the bootstrap completes) and there's no Foundation / +/// NIO line reader that operates on a raw blocking POSIX fd. +private func readLine(fd: Int32, maxLength: Int = 256) throws -> String { + var bytes: [UInt8] = [] + bytes.reserveCapacity(maxLength) + while bytes.count < maxLength { + var byte: UInt8 = 0 + let n = withUnsafeMutablePointer(to: &byte) { ptr -> ssize_t in + read(fd, ptr, 1) + } + if n == 0 { + break + } + if n < 0 { + let savedErrno = errno + // SO_RCVTIMEO expiry surfaces as EAGAIN / EWOULDBLOCK on a + // blocking socket. Translate to a clear timeout error so callers + // don't have to inspect errno. + if savedErrno == EAGAIN || savedErrno == EWOULDBLOCK { + throw ContainerizationError( + .timeout, + message: "vsock CONNECT response not received within socket receive timeout" + ) + } + throw POSIXError(POSIXErrorCode(rawValue: savedErrno) ?? .EIO) + } + if byte == UInt8(ascii: "\n") { + break + } + bytes.append(byte) + } + return String(decoding: bytes, as: UTF8.self) +} +#endif diff --git a/Sources/Containerization/VsockListener.swift b/Sources/Containerization/VsockListener.swift index 7a7b36fa..0a20d81c 100644 --- a/Sources/Containerization/VsockListener.swift +++ b/Sources/Containerization/VsockListener.swift @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// import Foundation +import Synchronization #if os(macOS) import Virtualization @@ -30,6 +31,7 @@ public final class VsockListener: NSObject, Sendable, AsyncSequence { private let connections: AsyncStream private let cont: AsyncStream.Continuation private let stopListening: @Sendable (_ port: UInt32) throws -> Void + private let finished: Mutex package init(port: UInt32, stopListen: @Sendable @escaping (_ port: UInt32) throws -> Void) { self.port = port @@ -37,13 +39,36 @@ public final class VsockListener: NSObject, Sendable, AsyncSequence { self.connections = stream self.cont = continuation self.stopListening = stopListen + self.finished = Mutex(false) } + /// Idempotent: calling more than once is a no-op. setupIO and the + /// caller-side defer can both call finish without double-closing the + /// listening fd (a double-close would target whatever fd was reallocated + /// in between, hanging the next operation that touched it). public func finish() throws { + let alreadyFinished = self.finished.withLock { state -> Bool in + if state { + return true + } + state = true + return false + } + if alreadyFinished { + return + } self.cont.finish() try self.stopListening(self.port) } + /// Push an accepted connection into the listener's stream. Used by + /// VMM-specific accept loops that don't go through a delegate (the + /// cloud-hypervisor backend on Linux). On macOS the VZ delegate hits + /// `cont.yield(_:)` directly via same-class access. + package func yield(_ handle: FileHandle) -> AsyncStream.Continuation.YieldResult { + cont.yield(handle) + } + public func makeAsyncIterator() -> AsyncStream.AsyncIterator { connections.makeAsyncIterator() } diff --git a/Sources/ContainerizationExtras/CIDRv4.swift b/Sources/ContainerizationExtras/CIDRv4.swift index c1fd86cf..d7823b01 100644 --- a/Sources/ContainerizationExtras/CIDRv4.swift +++ b/Sources/ContainerizationExtras/CIDRv4.swift @@ -112,3 +112,11 @@ extension CIDRv4: Codable { try container.encode(description) } } + +extension CIDRv4 { + /// The gateway address of the network. Conventionally the first usable + /// address in the subnet (`lower + 1`). + public var gateway: IPv4Address { + IPv4Address(self.lower.value + 1) + } +} diff --git a/Sources/ContainerizationNetlink/NetlinkSession.swift b/Sources/ContainerizationNetlink/NetlinkSession.swift index a586fc74..c6438ad1 100644 --- a/Sources/ContainerizationNetlink/NetlinkSession.swift +++ b/Sources/ContainerizationNetlink/NetlinkSession.swift @@ -118,6 +118,215 @@ public struct NetlinkSession { } } + /// Set link attributes (MAC and/or bridge master) on an existing interface. + /// Either argument may be omitted; if both are nil this is a no-op. + /// + /// Sends a single `RTM_NEWLINK` carrying any of: + /// - `IFLA_ADDRESS` — the new hardware address (6 bytes for an Ethernet MAC). + /// - `IFLA_MASTER` — the index of the bridge to enslave the link to. The + /// bridge is identified by name; the index is resolved internally. + /// + /// - Parameters: + /// - interface: The name of the interface to update. + /// - macAddress: If non-nil, the new MAC address. + /// - master: If non-nil, the name of a bridge to enslave the interface to. + public func linkSetAttributes( + interface: String, + macAddress: MACAddress? = nil, + master: String? = nil + ) throws { + if macAddress == nil && master == nil { + return + } + + let interfaceIndex = try getInterfaceIndex(interface) + + var masterIndex: Int32? = nil + if let master { + masterIndex = try getInterfaceIndex(master) + } + + // Build the attribute list. MAC is 6 raw bytes; master is a 4-byte + // integer holding the bridge's interface index. + let macAttr: RTAttribute? = + (macAddress != nil) + ? RTAttribute( + len: UInt16(RTAttribute.size + 6), + type: LinkAttributeType.IFLA_ADDRESS) + : nil + let masterAttr: RTAttribute? = + (masterIndex != nil) + ? RTAttribute( + len: UInt16(RTAttribute.size + MemoryLayout.size), + type: LinkAttributeType.IFLA_MASTER) + : nil + + let requestSize = + NetlinkMessageHeader.size + + InterfaceInfo.size + + (macAttr?.paddedLen ?? 0) + + (masterAttr?.paddedLen ?? 0) + + var requestBuffer = [UInt8](repeating: 0, count: requestSize) + var requestOffset = 0 + + let requestHeader = NetlinkMessageHeader( + len: UInt32(requestBuffer.count), + type: NetlinkType.RTM_NEWLINK, + flags: NetlinkFlags.NLM_F_REQUEST | NetlinkFlags.NLM_F_ACK, + pid: socket.pid) + requestOffset = try requestHeader.appendBuffer(&requestBuffer, offset: requestOffset) + + // No flag changes — passing 0/0 means "do not modify IFF_* flags". + let requestInfo = InterfaceInfo( + family: UInt8(AddressFamily.AF_PACKET), + index: interfaceIndex, + flags: 0, + change: 0) + requestOffset = try requestInfo.appendBuffer(&requestBuffer, offset: requestOffset) + + if let macAttr, let macAddress { + requestOffset = try macAttr.appendBuffer(&requestBuffer, offset: requestOffset) + for byte in macAddress.bytes { + guard let next = requestBuffer.copyIn(as: UInt8.self, value: byte, offset: requestOffset) else { + throw BindError.sendMarshalFailure(type: "RTAttribute", field: "IFLA_ADDRESS") + } + requestOffset = next + } + // Pad attribute payload to 4-byte boundary (NLA_ALIGN). + let payloadLen = 6 + let padded = ((payloadLen + 3) >> 2) << 2 + requestOffset += padded - payloadLen + } + + if let masterAttr, let masterIndex { + requestOffset = try masterAttr.appendBuffer(&requestBuffer, offset: requestOffset) + guard + let next = requestBuffer.copyIn(as: Int32.self, value: masterIndex, offset: requestOffset) + else { + throw BindError.sendMarshalFailure(type: "RTAttribute", field: "IFLA_MASTER") + } + requestOffset = next + } + + guard requestOffset == requestSize else { + throw Error.unexpectedOffset(offset: requestOffset, size: requestSize) + } + + try sendRequest(buffer: &requestBuffer) + let (infos, _) = try parseResponse(infoType: NetlinkType.RTM_NEWLINK) { InterfaceInfo() } + guard infos.count == 0 else { + throw Error.unexpectedResultSet(count: infos.count, expected: 0) + } + } + + /// Create a Linux bridge link via `RTM_NEWLINK` carrying + /// `IFLA_LINKINFO/IFLA_INFO_KIND="bridge"`. + /// + /// Sends `NLM_F_CREATE | NLM_F_EXCL`, so the kernel returns `EEXIST` if a + /// link with the same name already exists. Callers wanting idempotent + /// creation should catch and inspect the thrown error. + public func linkAddBridge(name: String) throws { + let nameBytes = Array(name.utf8) + [0] + let ifnameAttr = RTAttribute( + len: UInt16(RTAttribute.size + nameBytes.count), + type: LinkAttributeType.IFLA_IFNAME) + + let kindBytes = Array("bridge".utf8) + [0] + let kindAttr = RTAttribute( + len: UInt16(RTAttribute.size + kindBytes.count), + type: LinkInfoAttributeType.IFLA_INFO_KIND) + // IFLA_LINKINFO is a nest containing IFLA_INFO_KIND. + let linkInfoAttr = RTAttribute( + len: UInt16(RTAttribute.size + kindAttr.paddedLen), + type: LinkAttributeType.IFLA_LINKINFO) + + let requestSize = + NetlinkMessageHeader.size + + InterfaceInfo.size + + ifnameAttr.paddedLen + + linkInfoAttr.paddedLen + + var requestBuffer = [UInt8](repeating: 0, count: requestSize) + var requestOffset = 0 + + let header = NetlinkMessageHeader( + len: UInt32(requestBuffer.count), + type: NetlinkType.RTM_NEWLINK, + flags: NetlinkFlags.NLM_F_REQUEST | NetlinkFlags.NLM_F_ACK + | NetlinkFlags.NLM_F_CREATE | NetlinkFlags.NLM_F_EXCL, + pid: socket.pid) + requestOffset = try header.appendBuffer(&requestBuffer, offset: requestOffset) + + let info = InterfaceInfo( + family: UInt8(AddressFamily.AF_UNSPEC), + index: 0, + flags: 0, + change: 0) + requestOffset = try info.appendBuffer(&requestBuffer, offset: requestOffset) + + // IFLA_IFNAME + requestOffset = try ifnameAttr.appendBuffer(&requestBuffer, offset: requestOffset) + guard let next = requestBuffer.copyIn(buffer: nameBytes, offset: requestOffset) else { + throw BindError.sendMarshalFailure(type: "RTAttribute", field: "IFLA_IFNAME") + } + // Pad NUL-terminated name to NLA 4-byte boundary. + requestOffset = next + (ifnameAttr.paddedLen - RTAttribute.size - nameBytes.count) + + // IFLA_LINKINFO -> IFLA_INFO_KIND + requestOffset = try linkInfoAttr.appendBuffer(&requestBuffer, offset: requestOffset) + requestOffset = try kindAttr.appendBuffer(&requestBuffer, offset: requestOffset) + guard let after = requestBuffer.copyIn(buffer: kindBytes, offset: requestOffset) else { + throw BindError.sendMarshalFailure(type: "RTAttribute", field: "IFLA_INFO_KIND") + } + requestOffset = after + (kindAttr.paddedLen - RTAttribute.size - kindBytes.count) + + guard requestOffset == requestSize else { + throw Error.unexpectedOffset(offset: requestOffset, size: requestSize) + } + + try sendRequest(buffer: &requestBuffer) + let (infos, _) = try parseResponse(infoType: NetlinkType.RTM_NEWLINK) { InterfaceInfo() } + guard infos.count == 0 else { + throw Error.unexpectedResultSet(count: infos.count, expected: 0) + } + } + + /// Remove a link by name via `RTM_DELLINK`. + /// + /// Throws on netlink error. Callers wanting idempotent removal should + /// catch and inspect the thrown error (e.g. `ENODEV` ⇒ already gone). + public func linkDel(name: String) throws { + let interfaceIndex = try getInterfaceIndex(name) + let requestSize = NetlinkMessageHeader.size + InterfaceInfo.size + var requestBuffer = [UInt8](repeating: 0, count: requestSize) + var requestOffset = 0 + + let header = NetlinkMessageHeader( + len: UInt32(requestBuffer.count), + type: NetlinkType.RTM_DELLINK, + flags: NetlinkFlags.NLM_F_REQUEST | NetlinkFlags.NLM_F_ACK, + pid: socket.pid) + requestOffset = try header.appendBuffer(&requestBuffer, offset: requestOffset) + + let info = InterfaceInfo( + family: UInt8(AddressFamily.AF_UNSPEC), + index: interfaceIndex, + flags: 0, + change: 0) + requestOffset = try info.appendBuffer(&requestBuffer, offset: requestOffset) + + guard requestOffset == requestSize else { + throw Error.unexpectedOffset(offset: requestOffset, size: requestSize) + } + + try sendRequest(buffer: &requestBuffer) + let (infos, _) = try parseResponse(infoType: NetlinkType.RTM_DELLINK) { InterfaceInfo() } + guard infos.count == 0 else { + throw Error.unexpectedResultSet(count: infos.count, expected: 0) + } + } + /// Performs a link get command on an interface. /// Returns information about the interface. /// - Parameter interface: The name of the interface to query. @@ -253,7 +462,7 @@ public struct NetlinkSession { } try sendRequest(buffer: &requestBuffer) - let (infos, _) = try parseResponse(infoType: NetlinkType.RTM_NEWLINK) { AddressInfo() } + let (infos, _) = try parseResponse(infoType: NetlinkType.RTM_NEWADDR) { AddressInfo() } guard infos.count == 0 else { throw Error.unexpectedResultSet(count: infos.count, expected: 0) } @@ -386,7 +595,7 @@ public struct NetlinkSession { } try sendRequest(buffer: &requestBuffer) - let (infos, _) = try parseResponse(infoType: NetlinkType.RTM_NEWLINK) { AddressInfo() } + let (infos, _) = try parseResponse(infoType: NetlinkType.RTM_NEWROUTE) { AddressInfo() } guard infos.count == 0 else { throw Error.unexpectedResultSet(count: infos.count, expected: 0) } @@ -462,7 +671,7 @@ public struct NetlinkSession { } try sendRequest(buffer: &requestBuffer) - let (infos, _) = try parseResponse(infoType: NetlinkType.RTM_NEWLINK) { AddressInfo() } + let (infos, _) = try parseResponse(infoType: NetlinkType.RTM_NEWROUTE) { AddressInfo() } guard infos.count == 0 else { throw Error.unexpectedResultSet(count: infos.count, expected: 0) } diff --git a/Sources/ContainerizationNetlink/Types.swift b/Sources/ContainerizationNetlink/Types.swift index 5c0a0e8c..81d62e2c 100644 --- a/Sources/ContainerizationNetlink/Types.swift +++ b/Sources/ContainerizationNetlink/Types.swift @@ -85,10 +85,17 @@ struct LinkAttributeType { static let IFLA_BROADCAST: UInt16 = 2 static let IFLA_IFNAME: UInt16 = 3 static let IFLA_MTU: UInt16 = 4 + static let IFLA_MASTER: UInt16 = 10 + static let IFLA_LINKINFO: UInt16 = 18 static let IFLA_STATS64: UInt16 = 23 static let IFLA_EXT_MASK: UInt16 = 29 } +/// Nested attribute types inside `IFLA_LINKINFO`. +struct LinkInfoAttributeType { + static let IFLA_INFO_KIND: UInt16 = 1 +} + struct LinkAttributeMaskFilter { static let RTEXT_FILTER_VF: UInt32 = 1 << 0 static let RTEXT_FILTER_SKIP_STATS: UInt32 = 1 << 3 diff --git a/Sources/Integration/ContainerTests.swift b/Sources/Integration/ContainerTests.swift index 498aa3f8..ef26c319 100644 --- a/Sources/Integration/ContainerTests.swift +++ b/Sources/Integration/ContainerTests.swift @@ -648,6 +648,7 @@ extension IntegrationSuite { } } + #if os(macOS) func testNestedVirtualizationEnabled() async throws { let id = "test-nested-virt" @@ -850,6 +851,8 @@ extension IntegrationSuite { } } + #endif + func testContainerStatistics() async throws { let id = "test-container-statistics" @@ -3188,6 +3191,7 @@ extension IntegrationSuite { } } + #if os(macOS) @available(macOS 26.0, *) func testInterfaceMTU() async throws { let id = "test-interface-mtu" @@ -3247,6 +3251,8 @@ extension IntegrationSuite { } } + #endif + func testSingleFileMount() async throws { let id = "test-single-file-mount" @@ -4666,6 +4672,7 @@ extension IntegrationSuite { } } + #if os(macOS) @available(macOS 26.0, *) func testNetworkingDisabled() async throws { let id = "test-networking-disabled" @@ -5215,6 +5222,8 @@ extension IntegrationSuite { } } + #endif + func testSysctl() async throws { let id = "test-container-sysctl" diff --git a/Sources/Integration/NBDServer.swift b/Sources/Integration/NBDServer.swift index aed49ffd..982c8752 100644 --- a/Sources/Integration/NBDServer.swift +++ b/Sources/Integration/NBDServer.swift @@ -20,6 +20,7 @@ import Logging import NIOCore import NIOPosix +#if os(macOS) /// A minimal NBD server for integration testing. /// /// Serves a file-backed block device using the NBD newstyle handshake protocol. @@ -379,3 +380,4 @@ private final class NBDConnectionHandler: ChannelInboundHandler { buf.writeInteger(cookie) } } +#endif diff --git a/Sources/Integration/PodTests.swift b/Sources/Integration/PodTests.swift index 2911e216..949c6d2a 100644 --- a/Sources/Integration/PodTests.swift +++ b/Sources/Integration/PodTests.swift @@ -2091,6 +2091,7 @@ extension IntegrationSuite { } } + #if os(macOS) @available(macOS 26.0, *) func testPodIPv6AddressAdd() async throws { let id = "test-pod-ipv6-address" @@ -2146,6 +2147,7 @@ extension IntegrationSuite { msg: "expected fd00::2 on eth0 inside pod container, got: \(output)") } } + #endif func testPodFilesystemOperation() async throws { let id = "test-pod-filesystem-operation" diff --git a/Sources/Integration/PodVolumeTests.swift b/Sources/Integration/PodVolumeTests.swift index 358ebdaa..aa029ba8 100644 --- a/Sources/Integration/PodVolumeTests.swift +++ b/Sources/Integration/PodVolumeTests.swift @@ -23,6 +23,7 @@ import Foundation import Logging import SystemPackage +#if os(macOS) extension IntegrationSuite { private func cloneRootfsForContainer(_ rootfs: Containerization.Mount, testID: String, containerID: String) throws -> Containerization.Mount { let clonePath = Self.testDir.appending(component: "\(testID)-\(containerID).ext4").absolutePath() @@ -853,3 +854,4 @@ extension IntegrationSuite { } } } +#endif diff --git a/Sources/Integration/Suite.swift b/Sources/Integration/Suite.swift index 718ca783..5b9d3cfa 100644 --- a/Sources/Integration/Suite.swift +++ b/Sources/Integration/Suite.swift @@ -26,6 +26,12 @@ import NIOCore import NIOPosix import Synchronization +#if canImport(Musl) +import Musl +#elseif canImport(Glibc) +import Glibc +#endif + actor UnpackCoordinator { private var inFlight: [String: Task] = [:] @@ -173,6 +179,14 @@ struct IntegrationSuite: AsyncParsableCommand { @Option(name: .shortAndLong, help: "Only run tests whose names contain this string") var filter: String? + #if os(Linux) + @Option(name: .long, help: "Path to cloud-hypervisor binary (Linux only). Defaults to PATH lookup.") + var chBinary: String? + + @Option(name: .long, help: "Path to virtiofsd binary (Linux only). Defaults to PATH lookup.") + var virtiofsdBinary: String? + #endif + static func binPath(name: String) -> URL { URL(fileURLWithPath: FileManager.default.currentDirectoryPath) .appendingPathComponent("bin") @@ -203,8 +217,18 @@ struct IntegrationSuite: AsyncParsableCommand { } }() - var testKernel = Kernel(path: .init(filePath: kernel), platform: .linuxArm) - testKernel.commandLine.addDebug() + let testKernel = Kernel(path: .init(filePath: kernel), platform: .linuxArm) + // Intentionally NOT adding `debug` or `earlycon=pl011,...` here. + // Both look free, but each costs real wall-clock per VM boot: + // * `debug` floods printk through hvc0 (which CH writes to the + // bootlog file). + // * `earlycon=pl011,...` routes every early-boot printk character + // through pl011 MMIO traps into CH's serial emulator. With CH's + // pl011 wired to a file (see CHVirtualMachineInstance.serialConfig) + // each character is a synchronous file write and ~50–80 ms of + // dmesg quantization showed up in measurements — adding ~1.5 s + // to every VM boot before bootconsole hands over to virtio_console. + // Re-add either as a one-shot when actively diagnosing kernel boot. let image = try await Self.fetchImage(reference: reference, store: store) let platform = Platform(arch: "arm64", os: "linux", variant: "v8") @@ -227,29 +251,106 @@ struct IntegrationSuite: AsyncParsableCommand { } } + // Reap any per-test artifacts left over from prior tests. With + // `--max-concurrency 1` (linux-integration default) this runs after + // the previous test has fully completed, so it's race-free; on + // macOS where tests can run in parallel we just keep all files — + // disk usage isn't a concern there. Each per-test bootstrap clones + // a ~2GB rootfs and a ~512MB initfs, so without reaping the dev + // container fills its CoW layer in ~10 tests. + if self.maxConcurrency == 1 { + let preserve = fsPath.absolutePath() + if let entries = try? FileManager.default.contentsOfDirectory( + at: Self.testDir, + includingPropertiesForKeys: nil + ) { + for url in entries where url.absolutePath() != preserve { + try? FileManager.default.removeItem(at: url) + } + } + } + // Clone to test-specific path let clPath = Self.testDir.appending(component: "\(testID).ext4").absolutePath() try? FileManager.default.removeItem(atPath: clPath) let cl = try fs.clone(to: clPath) + // Per-test clone of the init.block. The init.block is supposed to be + // mounted read-only (kernel cmdline + readonly=true on the virtio-blk + // device for both VZ and CH), but sharing the same backing file across + // concurrent CH VMs has surfaced "internalError: mount" cascades on + // Linux/CH after a single test failure — symptomatic of the file + // entering a bad state when one CH instance is killed mid-flight. + // Cloning per test isolates each VM from any cross-test fallout. + let initClonePath = Self.testDir.appending(component: "\(testID).init.block").absolutePath() + try? FileManager.default.removeItem(atPath: initClonePath) + let initfsPerTest = try initfs.clone(to: initClonePath) + // Create bootLog directory and per-container bootLog path let bootlogDirURL = URL(filePath: bootlogDir) try? FileManager.default.createDirectory(at: bootlogDirURL, withIntermediateDirectories: true) let bootlogURL = bootlogDirURL.appendingPathComponent("\(testID).log") + let vmm: any VirtualMachineManager = try Self.makeVMM( + kernel: testKernel, + initialFilesystem: initfsPerTest, + chBinary: Self.chBinaryOverride(for: self), + virtiofsdBinary: Self.virtiofsdBinaryOverride(for: self) + ) + return ( cl, - VZVirtualMachineManager( - kernel: testKernel, - initialFilesystem: initfs, - group: Self.eventLoop - ), + vmm, image, BootLog.file(path: bootlogURL) ) } + private static func chBinaryOverride(for suite: IntegrationSuite) -> String? { + #if os(Linux) + return suite.chBinary + #else + _ = suite + return nil + #endif + } + + private static func virtiofsdBinaryOverride(for suite: IntegrationSuite) -> String? { + #if os(Linux) + return suite.virtiofsdBinary + #else + _ = suite + return nil + #endif + } + + private static func makeVMM( + kernel: Kernel, + initialFilesystem: Containerization.Mount, + chBinary: String?, + virtiofsdBinary: String? + ) throws -> any VirtualMachineManager { + #if os(macOS) + _ = chBinary + _ = virtiofsdBinary + return VZVirtualMachineManager( + kernel: kernel, + initialFilesystem: initialFilesystem, + group: Self.eventLoop + ) + #elseif os(Linux) + return try CHVirtualMachineManager( + kernel: kernel, + initialFilesystem: initialFilesystem, + chBinary: chBinary.map { URL(fileURLWithPath: $0) }, + virtiofsdBinary: virtiofsdBinary.map { URL(fileURLWithPath: $0) }, + group: Self.eventLoop, + logger: log + ) + #endif + } + static func fetchImage(reference: String, store: ImageStore) async throws -> Containerization.Image { do { return try await store.get(reference: reference) @@ -263,17 +364,24 @@ struct IntegrationSuite: AsyncParsableCommand { static func adjustLimits() throws { var limits = rlimit() - guard getrlimit(RLIMIT_NOFILE, &limits) == 0 else { + #if os(Linux) + let resource = __rlimit_resource_t(RLIMIT_NOFILE.rawValue) + #else + let resource = RLIMIT_NOFILE + #endif + + guard getrlimit(resource, &limits) == 0 else { throw POSIXError(.init(rawValue: errno)!) } limits.rlim_cur = 65536 limits.rlim_max = 65536 - guard setrlimit(RLIMIT_NOFILE, &limits) == 0 else { + guard setrlimit(resource, &limits) == 0 else { throw POSIXError(.init(rawValue: errno)!) } } + #if os(macOS) private func macOS26Tests() -> [Test] { if #available(macOS 26.0, *) { return [ @@ -292,6 +400,7 @@ struct IntegrationSuite: AsyncParsableCommand { } return [] } + #endif // Why does this exist? // @@ -309,168 +418,216 @@ struct IntegrationSuite: AsyncParsableCommand { let suiteStarted = Date().timeIntervalSinceReferenceDate log.info("starting integration suite\n") - let tests: [Test] = + let crossPlatformTests: [Test] = [ + // Process basics + Test("process true", testProcessTrue), + Test("process false", testProcessFalse), + Test("process echo hi", testProcessEchoHi), + Test("process no executable", testProcessNoExecutable), + Test("process user", testProcessUser), + Test("process stdin", testProcessStdin), + Test("process home envvar", testProcessHomeEnvvar), + Test("process custom home envvar", testProcessCustomHomeEnvvar), + Test("process tty ensure TERM", testProcessTtyEnvvar), + + // Hostname / hosts + Test("container hostname", testHostname), + Test("container hostname defaults to container id", testHostnameDefaultsToContainerID), + Test("container hosts", testHostsFile), + + // Statistics / cgroups / memory + Test("container statistics", testContainerStatistics), + Test("container cgroup limits", testCgroupLimits), + Test("container memory events OOM kill", testMemoryEventsOOMKill), + + // Console / boot / lifecycle + Test("container no serial console", testNoSerialConsole), + Test("container non-closure constructor", testNonClosureConstructor), + Test("container test large stdio ingest", testLargeStdioOutput), + Test("container bootlog using filehandle", testBootLogFileHandle), + Test("process delete idempotency", testProcessDeleteIdempotency), + Test("multiple execs without delete", testMultipleExecsWithoutDelete), + + // Capabilities + Test("container capabilities sys admin", testCapabilitiesSysAdmin), + Test("container capabilities net admin", testCapabilitiesNetAdmin), + Test("container capabilities OCI default", testCapabilitiesOCIDefault), + Test("container capabilities all capabilities", testCapabilitiesAllCapabilities), + Test("container capabilities file ownership", testCapabilitiesFileOwnership), + + // Stat / Copy + Test("container stat", testStat), + Test("container copy in", testCopyIn), + Test("container copy in file to existing directory", testCopyInFileToExistingDirectory), + Test("container copy in file to missing directory fails", testCopyInFileToMissingDirectoryFails), + Test("container copy in directory over existing file fails", testCopyInDirectoryOverExistingFileFails), + Test("container copy out", testCopyOut), + Test("container copy large file", testCopyLargeFile), + Test("container copy in directory", testCopyInDirectory), + Test("container copy out directory", testCopyOutDirectory), + Test("container copy empty file", testCopyEmptyFile), + Test("container copy empty directory", testCopyEmptyDirectory), + Test("container copy binary file", testCopyBinaryFile), + Test("container copy multiple files", testCopyMultipleFiles), + Test("container copy directory round trip", testCopyDirectoryRoundTrip), + Test("container copy in create parents", testCopyInCreateParents), + Test("container copy file permissions", testCopyFilePermissions), + Test("container copy large directory", testCopyLargeDirectory), + + // Read-only / writable layers + Test("container read-only rootfs", testReadOnlyRootfs), + Test("container read-only rootfs hosts file", testReadOnlyRootfsHostsFileWritten), + Test("container read-only rootfs DNS", testReadOnlyRootfsDNSConfigured), + Test("container writable layer", testWritableLayer), + Test("container writable layer journal writeback", testWritableLayerJournalWriteback), + Test("container writable layer journal ordered", testWritableLayerJournalOrdered), + Test("container writable layer journal data", testWritableLayerJournalData), + Test("container writable layer preserves lower", testWritableLayerPreservesLowerLayer), + Test("container writable layer reads from lower", testWritableLayerReadsFromLower), + Test("container writable layer with ro lower", testWritableLayerWithReadOnlyLower), + Test("container writable layer size", testWritableLayerSize), + Test("container writable layer DNS and hosts", testWritableLayerWithDNSAndHosts), + + // Stdin / stdout / exec + Test("large stdin input", testLargeStdinInput), + Test("exec large stdin input", testExecLargeStdinInput), + Test("exec custom path resolution", testExecCustomPathResolution), + Test("stdin explicit close", testStdinExplicitClose), + Test("stdin binary data", testStdinBinaryData), + Test("stdin multiple chunks", testStdinMultipleChunks), + Test("stdin very large", testStdinVeryLarge), + + // RLimit + Test("container rlimit open files", testRLimitOpenFiles), + Test("container rlimit multiple", testRLimitMultiple), + Test("container rlimit exec", testRLimitExec), + + // useInit + Test("container useInit basic", testUseInitBasic), + Test("container useInit exit code propagation", testUseInitExitCodePropagation), + Test("container useInit signal forwarding", testUseInitSignalForwarding), + Test("container useInit zombie reaping", testUseInitZombieReaping), + Test("container useInit with terminal", testUseInitWithTerminal), + Test("container useInit with stdin", testUseInitWithStdin), + + // Sysctl / security / workingDir + Test("container sysctl", testSysctl), + Test("container sysctl multiple", testSysctlMultiple), + Test("container noNewPrivileges", testNoNewPrivileges), + Test("container noNewPrivileges disabled", testNoNewPrivilegesDisabled), + Test("container noNewPrivileges exec", testNoNewPrivilegesExec), + Test("container workingDir created", testWorkingDirCreated), + Test("container workingDir exec created", testWorkingDirExecCreated), + + // VM resource overhead + Test("container VM resource overhead", testVMResourceOverhead), + + // Pods + Test("pod single container", testPodSingleContainer), + Test("pod multiple containers", testPodMultipleContainers), + Test("pod container output", testPodContainerOutput), + Test("pod concurrent containers", testPodConcurrentContainers), + Test("pod exec in container", testPodExecInContainer), + Test("pod exec in container env", testPodExecInContainerEnv), + Test("pod container hostname", testPodContainerHostname), + Test("pod container hostname defaults to container id", testPodContainerHostnameDefaultsToContainerID), + Test("pod stop container idempotency", testPodStopContainerIdempotency), + Test("pod list containers", testPodListContainers), + Test("pod container statistics", testPodContainerStatistics), + Test("pod memory events OOM kill", testPodMemoryEventsOOMKill), + Test("pod container resource limits", testPodContainerResourceLimits), + Test("pod container filesystem isolation", testPodContainerFilesystemIsolation), + Test("pod container PID namespace isolation", testPodContainerPIDNamespaceIsolation), + Test("pod container independent resource limits", testPodContainerIndependentResourceLimits), + Test("pod shared PID namespace", testPodSharedPIDNamespace), + Test("pod read-only rootfs", testPodReadOnlyRootfs), + Test("pod read-only rootfs DNS", testPodReadOnlyRootfsDNSConfigured), + Test("pod container hosts config", testPodContainerHostsConfig), + Test("pod multiple containers different DNS", testPodMultipleContainersDifferentDNS), + Test("pod multiple containers different hosts", testPodMultipleContainersDifferentHosts), + Test("pod level DNS", testPodLevelDNS), + Test("pod level DNS with container override", testPodLevelDNSWithContainerOverride), + Test("pod level hosts", testPodLevelHosts), + Test("pod level hosts with container override", testPodLevelHostsWithContainerOverride), + Test("pod level hostname", testPodLevelHostname), + Test("pod level hostname with container override", testPodLevelHostnameWithContainerOverride), + Test("pod rlimit open files", testPodRLimitOpenFiles), + Test("pod rlimit exec", testPodRLimitExec), + Test("pod useInit basic", testPodUseInitBasic), + Test("pod useInit exit code propagation", testPodUseInitExitCodePropagation), + Test("pod useInit signal forwarding", testPodUseInitSignalForwarding), + Test("pod useInit multiple containers", testPodUseInitMultipleContainers), + Test("pod useInit with shared PID namespace", testPodUseInitWithSharedPIDNamespace), + Test("pod sysctl", testPodSysctl), + Test("pod sysctl multiple containers", testPodSysctlMultipleContainers), + Test("pod invalid volume reference", testPodInvalidVolumeReference), + Test("pod duplicate volume name", testPodDuplicateVolumeName), + + // Mounts / virtiofs shares (cross-platform: VZ on macOS, virtiofsd on Linux/CH). + Test("container mount", testMounts), + Test("container single file mount", testSingleFileMount), + Test("container single file mount read-only", testSingleFileMountReadOnly), + Test("container single file mount write-back", testSingleFileMountWriteBack), + Test("container single file mount symlink", testSingleFileMountSymlink), + Test("container duplicate virtiofs mount", testDuplicateVirtiofsMount), + Test("container duplicate virtiofs mount via symlink", testDuplicateVirtiofsMountViaSymlink), + Test("container mount sort by depth", testMountsSortedByDepth), + Test("pod single file mount", testPodSingleFileMount), + ] + + #if os(macOS) + let macOSOnlyTests: [Test] = [ - // Containers - Test("process true", testProcessTrue), - Test("process false", testProcessFalse), - Test("process echo hi", testProcessEchoHi), - Test("process no executable", testProcessNoExecutable), - Test("process user", testProcessUser), - Test("process stdin", testProcessStdin), - Test("process home envvar", testProcessHomeEnvvar), - Test("process custom home envvar", testProcessCustomHomeEnvvar), - Test("process tty ensure TERM", testProcessTtyEnvvar), - Test("multiple concurrent processes", testMultipleConcurrentProcesses), - Test("multiple concurrent processes with output stress", testMultipleConcurrentProcessesOutputStress), - Test("container hostname", testHostname), - Test("container hostname defaults to container id", testHostnameDefaultsToContainerID), - Test("container hosts", testHostsFile), - Test("container mount", testMounts), + // ContainerManager-based tests (ContainerManager is macOS-only) Test("container stop idempotency", testContainerStopIdempotency), - Test("nested virt", testNestedVirtualizationEnabled), Test("container manager", testContainerManagerCreate), Test("container reuse", testContainerReuse), Test("container /dev/console", testContainerDevConsole), - Test("container statistics", testContainerStatistics), - Test("container cgroup limits", testCgroupLimits), - Test("container memory events OOM kill", testMemoryEventsOOMKill), - Test("container no serial console", testNoSerialConsole), + + // Nested virtualization (VZ-only feature) + Test("nested virt", testNestedVirtualizationEnabled), + + // Filesystem operations (TODO: promote to cross-platform once verified on CH) + Test("container frozen ext4 clone", testFrozenExt4Clone), + Test("container trim ext4 clone", testTrimExt4Clone), + + // Unix socket forwarding (dynamic vsock listen exceeds CH's prebound stdio pool) Test("unix socket into guest", testUnixSocketIntoGuest), Test("unix socket into guest long container id", testUnixSocketIntoGuestLongContainerID), Test("unix socket into guest symlink", testUnixSocketIntoGuestSymlink), - Test("container non-closure constructor", testNonClosureConstructor), - Test("container test large stdio ingest", testLargeStdioOutput), - Test("process delete idempotency", testProcessDeleteIdempotency), - Test("multiple execs without delete", testMultipleExecsWithoutDelete), - Test("container bootlog using filehandle", testBootLogFileHandle), - Test("container capabilities sys admin", testCapabilitiesSysAdmin), - Test("container capabilities net admin", testCapabilitiesNetAdmin), - Test("container capabilities OCI default", testCapabilitiesOCIDefault), - Test("container capabilities all capabilities", testCapabilitiesAllCapabilities), - Test("container capabilities file ownership", testCapabilitiesFileOwnership), - Test("container stat", testStat), - Test("container copy in", testCopyIn), - Test("container copy in file to existing directory", testCopyInFileToExistingDirectory), - Test("container copy in file to missing directory fails", testCopyInFileToMissingDirectoryFails), - Test("container copy in directory over existing file fails", testCopyInDirectoryOverExistingFileFails), - Test("container copy out", testCopyOut), - Test("container copy large file", testCopyLargeFile), - Test("container copy in directory", testCopyInDirectory), - Test("container copy out directory", testCopyOutDirectory), - Test("container copy empty file", testCopyEmptyFile), - Test("container copy empty directory", testCopyEmptyDirectory), - Test("container copy binary file", testCopyBinaryFile), - Test("container copy multiple files", testCopyMultipleFiles), - Test("container copy directory round trip", testCopyDirectoryRoundTrip), - Test("container copy in create parents", testCopyInCreateParents), - Test("container copy file permissions", testCopyFilePermissions), - Test("container copy large directory", testCopyLargeDirectory), - Test("container read-only rootfs", testReadOnlyRootfs), - Test("container read-only rootfs hosts file", testReadOnlyRootfsHostsFileWritten), - Test("container read-only rootfs DNS", testReadOnlyRootfsDNSConfigured), - Test("container writable layer", testWritableLayer), - Test("container writable layer journal writeback", testWritableLayerJournalWriteback), - Test("container writable layer journal ordered", testWritableLayerJournalOrdered), - Test("container writable layer journal data", testWritableLayerJournalData), - Test("container writable layer preserves lower", testWritableLayerPreservesLowerLayer), - Test("container writable layer reads from lower", testWritableLayerReadsFromLower), - Test("container writable layer with ro lower", testWritableLayerWithReadOnlyLower), - Test("container writable layer size", testWritableLayerSize), - Test("container writable layer DNS and hosts", testWritableLayerWithDNSAndHosts), - Test("container frozen ext4 clone", testFrozenExt4Clone), - Test("container trim ext4 clone", testTrimExt4Clone), - Test("large stdin input", testLargeStdinInput), - Test("exec large stdin input", testExecLargeStdinInput), - Test("exec custom path resolution", testExecCustomPathResolution), - Test("stdin explicit close", testStdinExplicitClose), - Test("stdin binary data", testStdinBinaryData), - Test("stdin multiple chunks", testStdinMultipleChunks), - Test("stdin very large", testStdinVeryLarge), - Test("container single file mount", testSingleFileMount), - Test("container single file mount read-only", testSingleFileMountReadOnly), - Test("container single file mount write-back", testSingleFileMountWriteBack), - Test("container single file mount symlink", testSingleFileMountSymlink), - Test("container rlimit open files", testRLimitOpenFiles), - Test("container rlimit multiple", testRLimitMultiple), - Test("container rlimit exec", testRLimitExec), - Test("container duplicate virtiofs mount", testDuplicateVirtiofsMount), - Test("container duplicate virtiofs mount via symlink", testDuplicateVirtiofsMountViaSymlink), - Test("container useInit basic", testUseInitBasic), - Test("container useInit exit code propagation", testUseInitExitCodePropagation), - Test("container useInit signal forwarding", testUseInitSignalForwarding), - Test("container useInit zombie reaping", testUseInitZombieReaping), - Test("container useInit with terminal", testUseInitWithTerminal), - Test("container useInit with stdin", testUseInitWithStdin), - Test("container sysctl", testSysctl), - Test("container sysctl multiple", testSysctlMultiple), - Test("container noNewPrivileges", testNoNewPrivileges), - Test("container noNewPrivileges disabled", testNoNewPrivilegesDisabled), - Test("container noNewPrivileges exec", testNoNewPrivilegesExec), - Test("container workingDir created", testWorkingDirCreated), - Test("container workingDir exec created", testWorkingDirExecCreated), - Test("container mount sort by depth", testMountsSortedByDepth), - Test("container VM resource overhead", testVMResourceOverhead), + Test("pod unix socket into guest symlink", testPodUnixSocketIntoGuestSymlink), + + // High-concurrency stdio (exceeds CH's prebound stdio pool size) + Test("multiple concurrent processes", testMultipleConcurrentProcesses), + Test("multiple concurrent processes with output stress", testMultipleConcurrentProcessesOutputStress), + + // NBD volumes (test infra is macOS-only) Test("container NBD mount", testContainerNBDMount), Test("container NBD read-only", testContainerNBDReadOnly), Test("container NBD raw block", testContainerNBDRawBlock), Test("container NBD volume identity", testContainerNBDVolumeIdentity), - - // Pods - Test("pod single container", testPodSingleContainer), - Test("pod multiple containers", testPodMultipleContainers), - Test("pod container output", testPodContainerOutput), - Test("pod concurrent containers", testPodConcurrentContainers), - Test("pod exec in container", testPodExecInContainer), - Test("pod exec in container env", testPodExecInContainerEnv), - Test("pod container hostname", testPodContainerHostname), - Test("pod container hostname defaults to container id", testPodContainerHostnameDefaultsToContainerID), - Test("pod stop container idempotency", testPodStopContainerIdempotency), - Test("pod list containers", testPodListContainers), - Test("pod container statistics", testPodContainerStatistics), - Test("pod memory events OOM kill", testPodMemoryEventsOOMKill), - Test("pod container resource limits", testPodContainerResourceLimits), - Test("pod container filesystem isolation", testPodContainerFilesystemIsolation), - Test("pod container PID namespace isolation", testPodContainerPIDNamespaceIsolation), - Test("pod container independent resource limits", testPodContainerIndependentResourceLimits), - Test("pod shared PID namespace", testPodSharedPIDNamespace), - Test("pod read-only rootfs", testPodReadOnlyRootfs), - Test("pod read-only rootfs DNS", testPodReadOnlyRootfsDNSConfigured), - Test("pod single file mount", testPodSingleFileMount), - Test("pod container hosts config", testPodContainerHostsConfig), - Test("pod multiple containers different DNS", testPodMultipleContainersDifferentDNS), - Test("pod multiple containers different hosts", testPodMultipleContainersDifferentHosts), - Test("pod level DNS", testPodLevelDNS), - Test("pod level DNS with container override", testPodLevelDNSWithContainerOverride), - Test("pod level hosts", testPodLevelHosts), - Test("pod level hosts with container override", testPodLevelHostsWithContainerOverride), - Test("pod level hostname", testPodLevelHostname), - Test("pod level hostname with container override", testPodLevelHostnameWithContainerOverride), - Test("pod rlimit open files", testPodRLimitOpenFiles), - Test("pod rlimit exec", testPodRLimitExec), - Test("pod useInit basic", testPodUseInitBasic), - Test("pod useInit exit code propagation", testPodUseInitExitCodePropagation), - Test("pod useInit signal forwarding", testPodUseInitSignalForwarding), - Test("pod useInit multiple containers", testPodUseInitMultipleContainers), - Test("pod useInit with shared PID namespace", testPodUseInitWithSharedPIDNamespace), - Test("pod unix socket into guest symlink", testPodUnixSocketIntoGuestSymlink), - Test("pod sysctl", testPodSysctl), - Test("pod sysctl multiple containers", testPodSysctlMultipleContainers), Test("pod shared NBD volume", testPodSharedNBDVolume), Test("pod multiple NBD volumes", testPodMultipleNBDVolumes), Test("pod unreferenced NBD volume", testPodUnreferencedVolume), Test("pod NBD volume persistence", testPodNBDVolumePersistence), Test("pod NBD concurrent writes", testPodNBDConcurrentWrites), Test("pod NBD volume identity", testPodNBDVolumeIdentity), - Test("pod invalid volume reference", testPodInvalidVolumeReference), - Test("pod duplicate volume name", testPodDuplicateVolumeName), Test("pod filesystem operation", testPodFilesystemOperation), Test("pod shared disk image volume", testPodSharedDiskImageVolume), ] + macOS26Tests() + let tests: [Test] = crossPlatformTests + macOSOnlyTests + #else + let tests: [Test] = crossPlatformTests + #endif let filteredTests: [Test] if let filter { - filteredTests = tests.filter { $0.name.contains(filter) } + // Comma-separated; ANY pattern matching the test name keeps it. + // E.g. `--filter "container mount,pod single file"`. + let patterns = filter.split(separator: ",").map { String($0) } + filteredTests = tests.filter { test in + patterns.contains { test.name.contains($0) } + } log.info("filter '\(filter)' matched \(filteredTests.count)/\(tests.count) tests") } else { filteredTests = tests diff --git a/Sources/cctl/BridgeCommand.swift b/Sources/cctl/BridgeCommand.swift new file mode 100644 index 00000000..2714acc2 --- /dev/null +++ b/Sources/cctl/BridgeCommand.swift @@ -0,0 +1,95 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +#if os(Linux) +import ArgumentParser +import Containerization +import ContainerizationExtras +import Foundation + +extension Application { + struct Bridge: AsyncParsableCommand { + static let configuration = CommandConfiguration( + commandName: "bridge", + abstract: "Manage the host bridge used by `cctl run` for container networking", + subcommands: [Create.self, Delete.self] + ) + + struct Create: AsyncParsableCommand { + static let configuration = CommandConfiguration( + commandName: "create", + abstract: "Create (or reconfigure idempotently) the host bridge + NAT plumbing" + ) + + @Option(name: .long, help: "Bridge interface name") + var name: String = "cz0" + + @Option(name: .long, help: "IPv4 subnet in CIDR form") + var subnet: String = "192.168.64.0/24" + + @Option(name: .long, help: "Host-side IPv4 on the bridge (defaults to subnet.lower+1)") + var gateway: String? + + @Option(name: .long, help: "Egress interface for MASQUERADE (default: auto-detect from default route)") + var egress: String? + + @Option(name: .long, help: "Bridge MTU") + var mtu: UInt32 = 1500 + + @Flag( + name: .customLong("enable-nat"), + help: + "Program iptables MASQUERADE/FORWARD and enable net.ipv4.ip_forward so containers reach the outside network. Off by default — host firewall policy is left untouched." + ) + var enableNAT: Bool = false + + func run() async throws { + let cidr = try CIDRv4(subnet) + let gw = try gateway.map { try IPv4Address($0) } + let mgr = BridgeManager( + name: name, + subnet: cidr, + gateway: gw, + mtu: mtu, + egressInterface: egress, + enableNAT: enableNAT, + logger: log + ) + try mgr.create() + } + } + + struct Delete: AsyncParsableCommand { + static let configuration = CommandConfiguration( + commandName: "delete", + abstract: "Remove the bridge and revert the host plumbing this tool added" + ) + + @Option(name: .long, help: "Bridge interface name") + var name: String = "cz0" + + @Option(name: .long, help: "IPv4 subnet in CIDR form") + var subnet: String = "192.168.64.0/24" + + func run() async throws { + let cidr = try CIDRv4(subnet) + let mgr = BridgeManager(name: name, subnet: cidr, logger: log) + try mgr.delete() + } + } + } +} +#endif diff --git a/Sources/cctl/ImageCommand.swift b/Sources/cctl/ImageCommand.swift index 9c4a6e88..c7e29bef 100644 --- a/Sources/cctl/ImageCommand.swift +++ b/Sources/cctl/ImageCommand.swift @@ -22,7 +22,6 @@ import ContainerizationExtras import ContainerizationOCI import Foundation -#if os(macOS) extension Application { struct Images: AsyncParsableCommand { static let configuration = CommandConfiguration( @@ -276,8 +275,10 @@ extension Application { if let authentication { return try await body(authentication) } + #if os(macOS) let keychain = KeychainHelper(securityDomain: Application.keychainID) authentication = try? keychain.lookup(hostname: host) + #endif return try await body(authentication) } @@ -293,4 +294,3 @@ extension Application { } } } -#endif diff --git a/Sources/cctl/RunCommand.swift b/Sources/cctl/RunCommand.swift index 66cf2081..2624be22 100644 --- a/Sources/cctl/RunCommand.swift +++ b/Sources/cctl/RunCommand.swift @@ -195,3 +195,338 @@ extension Application { } } #endif + +#if os(Linux) +extension Application { + /// Linux-side `cctl run` — boots a container in a cloud-hypervisor VM. + /// + /// Mirrors the macOS `cctl run` UX: `-i / --image` pulls and unpacks the + /// container image into an ext4 rootfs automatically. The Linux-specific + /// surface is `--initfs` (the deployment ships an `initfs.ext4` containing + /// vminitd; macOS resolves the equivalent via the local image store, but + /// on Linux the boot artifact is a path on disk). + struct Run: AsyncParsableCommand { + static let configuration = CommandConfiguration( + commandName: "run", + abstract: "Run a container via cloud-hypervisor" + ) + + @Option(name: [.customLong("image"), .customShort("i")], help: "Image reference to base the container on") + var imageReference: String = "docker.io/library/alpine:3.16" + + @Option(name: .long, help: "id for the container") + var id: String = "cctl" + + @Option(name: [.customLong("cpus"), .customShort("c")], help: "Number of CPUs to allocate") + var cpus: Int = 2 + + @Option(name: [.customLong("memory"), .customShort("m")], help: "Amount of memory in MiB") + var memory: UInt64 = 1024 + + @Option(name: .customLong("fs-size"), help: "The size to create the container rootfs ext4 as (MiB)") + var fsSizeInMB: UInt64 = 2048 + + @Option(name: .customLong("mount"), help: "Directory to share into the container (Example: /foo:/bar)") + var mounts: [String] = [] + + @Option(name: .long, help: "Path to OCI runtime to use for spawning the container") + var ociRuntimePath: String? + + @Flag(name: .long, help: "Make rootfs readonly") + var readOnly: Bool = false + + @Flag(name: .long, help: "Run with an init process for signal forwarding and zombie reaping") + var `init`: Bool = false + + @Option( + name: [.customLong("kernel"), .customShort("k")], + help: "Path to the Linux kernel image", + completion: .file() + ) + var kernel: String + + @Option( + name: .customLong("initfs"), + help: "Path to the ext4 initfs containing vminitd (boots the VM as PID 1)", + completion: .file() + ) + var initfs: String + + @Option( + name: .customLong("bridge"), + help: "Bridge interface name to attach the container TAP to" + ) + var bridge: String = "cz0" + + @Option( + name: .customLong("subnet"), + help: "IPv4 subnet for the container network (CIDR)" + ) + var subnet: String = "192.168.64.0/24" + + @Option( + name: .customLong("gateway"), + help: "Host-side IPv4 on the bridge (defaults to subnet.lower+1)" + ) + var bridgeGateway: String? + + @Option( + name: .customLong("egress"), + help: "Egress interface for outbound NAT (default: auto-detect from default route)" + ) + var egress: String? + + @Flag(name: .customLong("no-network"), help: "Skip all host network setup; container has no interface") + var noNetwork: Bool = false + + @Flag( + name: .customLong("enable-nat"), + help: + "Program iptables MASQUERADE/FORWARD and enable ip_forward so the container can reach external networks. Off by default — the bridge stays internal-only." + ) + var enableNAT: Bool = false + + @Option(name: .customLong("ns"), help: "Nameserver addresses (default: read host /etc/resolv.conf)") + var nameservers: [String] = [] + + @Option( + name: .customLong("ch-binary"), + help: "Path to cloud-hypervisor binary (defaults to PATH lookup)" + ) + var chBinary: String? + + @Option( + name: .customLong("virtiofsd-binary"), + help: "Path to virtiofsd binary (defaults to PATH lookup)" + ) + var virtiofsdBinary: String? + + @Option(name: .long, help: "Current working directory") + var cwd: String = "/" + + @Argument(parsing: .captureForPassthrough) + var arguments: [String] = ["/bin/sh"] + + func run() async throws { + #if arch(arm64) + let kernelPlatform = SystemPlatform.linuxArm + #else + let kernelPlatform = SystemPlatform.linuxAmd + #endif + let imagePlatform = Platform.current + + let kernelObj = Kernel( + path: URL(fileURLWithPath: kernel), + platform: kernelPlatform + ) + + // Wire up the host TTY when there is one. `Terminal.current` walks + // STDERR/STDOUT/STDIN looking for a tty fd and throws if none of + // them is one (e.g. all stdio piped). In that case fall through to + // the non-interactive path so `cctl run /bin/true` still works. + let hostTerminal = try? Terminal.current + if let hostTerminal { + try hostTerminal.setraw() + } + defer { hostTerminal?.tryReset() } + let sigwinchStream = AsyncSignalHandler.create(notify: [SIGWINCH]) + + // Pull the container image and unpack to a per-container ext4 (same + // shape as ContainerManager.unpack on macOS: reuse the existing + // rootfs.ext4 if it's already there, fresh-unpack otherwise). + let imageStore = Application.imageStore + let reference = try Reference.parse(imageReference) + reference.normalize() + let normalizedRef = reference.description + if normalizedRef != imageReference { + print("Reference resolved to \(normalizedRef)") + } + let image = try await imageStore.get(reference: normalizedRef, pull: true) + + let containersRoot = Application.appRoot + .appendingPathComponent("containers") + .appendingPathComponent(id) + try FileManager.default.createDirectory(at: containersRoot, withIntermediateDirectories: true) + let rootfsPath = containersRoot.appendingPathComponent("rootfs.ext4") + + var rootfsMount: Containerization.Mount + do { + let unpacker = EXT4Unpacker(blockSizeInBytes: fsSizeInMB.mib()) + rootfsMount = try await unpacker.unpack(image, for: imagePlatform, at: rootfsPath) + } catch let err as ContainerizationError where err.code == .exists { + rootfsMount = .block( + format: "ext4", + source: rootfsPath.absolutePath(), + destination: "/", + options: [] + ) + } + if readOnly { + rootfsMount.options.append("ro") + } + + let initfsMount = Mount.block( + format: "ext4", + source: initfs, + destination: "/", + options: ["ro"] + ) + + let manager = try CHVirtualMachineManager( + kernel: kernelObj, + initialFilesystem: initfsMount, + chBinary: chBinary.map { URL(fileURLWithPath: $0) }, + virtiofsdBinary: virtiofsdBinary.map { URL(fileURLWithPath: $0) }, + logger: log + ) + + // Seed process config from the image (entrypoint, env, cwd, user), + // then layer user-provided overrides on top — same precedence as + // ContainerManager + macOS Run. + let imageConfig = try await image.config(for: imagePlatform).config + var processConfig = LinuxProcessConfiguration() + if let imageConfig { + processConfig = .init(from: imageConfig) + } + processConfig.arguments = arguments + processConfig.workingDirectory = cwd + if let hostTerminal { + processConfig.setTerminalIO(terminal: hostTerminal) + } + + var interfaces: [any Interface] = [] + var dnsConfig: DNS? = nil + var hostsConfig: Hosts? = nil + + if !noNetwork { + let subnetCIDR = try CIDRv4(subnet) + let gw = try bridgeGateway.map { try IPv4Address($0) } + + let mgr = BridgeManager( + name: bridge, + subnet: subnetCIDR, + gateway: gw, + mtu: 1500, + egressInterface: egress, + enableNAT: enableNAT, + logger: log + ) + try mgr.create() + + var network = try LinuxBridgedNetwork( + subnet: subnetCIDR, + gateway: gw, + bridge: bridge, + mtu: 1500 + ) + if let iface = try network.createInterface(id) { + interfaces.append(iface) + + var h = Hosts.default + h.entries.append( + .init( + ipAddress: iface.ipv4Address.address.description, + hostnames: [id] + )) + hostsConfig = h + + let resolved = + nameservers.isEmpty + ? Self.readHostNameservers() + : nameservers + dnsConfig = DNS(nameservers: resolved) + } + } + + let cpusCount = cpus + let memoryBytes = memory * 1024 * 1024 + let networkInterfaces = interfaces + let useInit = self.`init` + let extraMounts = self.mounts + let runtimePath = self.ociRuntimePath + let dns = dnsConfig + let hosts = hostsConfig + + let container = try LinuxContainer( + id, + rootfs: rootfsMount, + vmm: manager, + logger: log + ) { config in + config.process = processConfig + config.cpus = cpusCount + config.memoryInBytes = memoryBytes + config.interfaces = networkInterfaces + config.useInit = useInit + if let dns { config.dns = dns } + if let hosts { config.hosts = hosts } + + for mount in extraMounts { + let paths = mount.split(separator: ":") + if paths.count != 2 { + throw ContainerizationError( + .invalidArgument, + message: "incorrect mount format detected: \(mount)" + ) + } + config.mounts.append( + Mount.share(source: String(paths[0]), destination: String(paths[1])) + ) + } + + if let runtimePath { + config.ociRuntimePath = runtimePath + config.mounts = LinuxContainer.defaultOCIMounts() + } + } + + try await container.create() + try await container.start() + + // Sync the guest pty winsize to the host on start, and on every + // SIGWINCH while running. Only meaningful when we have a tty. + if let hostTerminal { + try? await container.resize(to: try hostTerminal.size) + } + + let exit = try await withThrowingTaskGroup( + of: Void.self, + returning: ExitStatus.self + ) { group in + if let hostTerminal { + group.addTask { + for await _ in sigwinchStream.signals { + try await container.resize(to: try hostTerminal.size) + } + } + } + let result = try await container.wait() + group.cancelAll() + try await container.stop() + return result + } + + if exit.exitCode != 0 { + throw ExitCode(exit.exitCode) + } + } + + /// Read `nameserver` lines from `/etc/resolv.conf`. Returns + /// `["1.1.1.1"]` if the file is missing or has no entries. + private static func readHostNameservers() -> [String] { + guard let text = try? String(contentsOfFile: "/etc/resolv.conf", encoding: .utf8) else { + return ["1.1.1.1"] + } + let servers = + text + .split(separator: "\n") + .compactMap { line -> String? in + let parts = line.split(separator: " ", maxSplits: 1, omittingEmptySubsequences: true) + guard parts.count == 2, parts[0] == "nameserver" else { return nil } + return String(parts[1]).trimmingCharacters(in: .whitespaces) + } + return servers.isEmpty ? ["1.1.1.1"] : servers + } + } +} +#endif diff --git a/Sources/cctl/cctl.swift b/Sources/cctl/cctl.swift index 2069a99e..e2c54137 100644 --- a/Sources/cctl/cctl.swift +++ b/Sources/cctl/cctl.swift @@ -63,14 +63,14 @@ struct Application: AsyncParsableCommand { version: "2.0.0", subcommands: { var commands: [any ParsableCommand.Type] = [ - Rootfs.self - ] - #if os(macOS) - commands += [ + Rootfs.self, Images.self, - Login.self, Run.self, ] + #if os(macOS) + commands.append(Login.self) + #elseif os(Linux) + commands.append(Bridge.self) #endif return commands }() diff --git a/Tests/CloudHypervisorTests/ClientTests.swift b/Tests/CloudHypervisorTests/ClientTests.swift new file mode 100644 index 00000000..a5678dcd --- /dev/null +++ b/Tests/CloudHypervisorTests/ClientTests.swift @@ -0,0 +1,506 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import Foundation +import NIOPosix +import Testing + +@testable import CloudHypervisor + +@Suite("CloudHypervisor.Client") +struct ClientTests { + private static let group = MultiThreadedEventLoopGroup.singleton + + // MARK: - Init + + @Test("Client init succeeds with file:// URL") + func initSucceeds() async throws { + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + StubResponse.ok() + } + defer { Task { try? await server.shutdown() } } + + let socketURL = URL(filePath: server.socketPath) + let _ = try CloudHypervisor.Client( + socketPath: socketURL, + eventLoopGroup: Self.group + ) + } + + // MARK: - Invalid socket path + + @Test("Client init throws .invalidSocketPath for non-file URL") + func initThrowsForNonFileURL() throws { + let url = try #require(URL(string: "https://example.com")) + #expect(throws: CloudHypervisor.Error.self) { + try CloudHypervisor.Client(socketPath: url, eventLoopGroup: Self.group) + } + } + + // MARK: - Non-2xx response + + @Test("Non-2xx response throws .http with correct status") + func non2xxThrowsHTTPError() async throws { + let body = Data("not found".utf8) + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + StubResponse.status(.notFound, body: body) + } + defer { Task { try? await server.shutdown() } } + + let socketURL = URL(filePath: server.socketPath) + let client = try CloudHypervisor.Client(socketPath: socketURL, eventLoopGroup: Self.group) + + struct Dummy: Decodable, Sendable {} + + do { + let _: Dummy = try await client.get("/api/v1/missing") + Issue.record("Expected .http error but call succeeded") + } catch let err as CloudHypervisor.Error { + guard case .http(let status, let respBody) = err else { + Issue.record("Expected .http, got \(err)") + return + } + #expect(status == .notFound) + #expect(respBody == body) + } catch { + Issue.record("Expected CloudHypervisor.Error but got \(error)") + } + } + + // MARK: - vmmPing + + @Test("vmmPing sends GET /api/v1/vmm.ping and decodes VmmPingResponse") + func vmmPing() async throws { + let expected = CloudHypervisor.VmmPingResponse(version: "v40.0", pid: 12345) + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + (try? StubResponse.json(expected)) ?? StubResponse.ok() + } + defer { Task { try? await server.shutdown() } } + + let client = try CloudHypervisor.Client(socketPath: URL(filePath: server.socketPath), eventLoopGroup: Self.group) + let result = try await client.vmmPing() + + let recorded = server.recordedRequests() + #expect(recorded.count == 1) + #expect(recorded[0].method == .GET) + #expect(recorded[0].uri == "/api/v1/vmm.ping") + #expect(recorded[0].body.isEmpty) + #expect(result.version == "v40.0") + #expect(result.pid == 12345) + } + + // MARK: - vmmShutdown + + @Test("vmmShutdown sends PUT /api/v1/vmm.shutdown and returns without throwing") + func vmmShutdown() async throws { + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + StubResponse.status(.noContent) + } + defer { Task { try? await server.shutdown() } } + + let client = try CloudHypervisor.Client(socketPath: URL(filePath: server.socketPath), eventLoopGroup: Self.group) + try await client.vmmShutdown() + + let recorded = server.recordedRequests() + #expect(recorded.count == 1) + #expect(recorded[0].method == .PUT) + #expect(recorded[0].uri == "/api/v1/vmm.shutdown") + } + + // MARK: - vmmInfo + + @Test("vmmInfo sends GET /api/v1/vmm.info and decodes VmmInfo") + func vmmInfo() async throws { + let expected = CloudHypervisor.VmmInfo(version: "v40.0", pid: 99) + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + (try? StubResponse.json(expected)) ?? StubResponse.ok() + } + defer { Task { try? await server.shutdown() } } + + let client = try CloudHypervisor.Client(socketPath: URL(filePath: server.socketPath), eventLoopGroup: Self.group) + let result = try await client.vmmInfo() + + let recorded = server.recordedRequests() + #expect(recorded.count == 1) + #expect(recorded[0].method == .GET) + #expect(recorded[0].uri == "/api/v1/vmm.info") + #expect(result.version == "v40.0") + } + + // MARK: - vmCreate + + @Test("vmCreate sends PUT /api/v1/vm.create with encoded body") + func vmCreate() async throws { + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + StubResponse.status(.noContent) + } + defer { Task { try? await server.shutdown() } } + + let config = CloudHypervisor.VmConfig( + cpus: .init(bootVcpus: 2, maxVcpus: 4), + memory: .init(size: 512 * 1024 * 1024), + payload: .init(kernel: "/boot/vmlinux"), + console: .init(mode: .Null), + serial: .init(mode: .Tty) + ) + + let client = try CloudHypervisor.Client(socketPath: URL(filePath: server.socketPath), eventLoopGroup: Self.group) + try await client.vmCreate(config) + + let recorded = server.recordedRequests() + #expect(recorded.count == 1) + #expect(recorded[0].method == .PUT) + #expect(recorded[0].uri == "/api/v1/vm.create") + + let decoded = try JSONDecoder().decode(CloudHypervisor.VmConfig.self, from: recorded[0].body) + #expect(decoded == config) + } + + // MARK: - vmBoot + + @Test("vmBoot sends PUT /api/v1/vm.boot with no body") + func vmBoot() async throws { + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + StubResponse.status(.noContent) + } + defer { Task { try? await server.shutdown() } } + + let client = try CloudHypervisor.Client(socketPath: URL(filePath: server.socketPath), eventLoopGroup: Self.group) + try await client.vmBoot() + + let recorded = server.recordedRequests() + #expect(recorded.count == 1) + #expect(recorded[0].method == .PUT) + #expect(recorded[0].uri == "/api/v1/vm.boot") + #expect(recorded[0].body.isEmpty) + } + + // Regression: cloud-hypervisor's HTTP parser rejects body-less PUTs + // unless they carry an explicit `Content-Length: 0`. With the + // AsyncHTTPClient transport, that wire shape is produced by + // assigning `request.body = .bytes(ByteBuffer())` so AHC's + // RequestValidation re-derives framing as `known(0)` per RFC 7230 + // §3.3.2. This test asserts the on-the-wire result rather than how + // it's produced, so any future transport change that drops the + // empty-body framing surfaces here. + @Test("Body-less PUT sends Content-Length: 0 with empty body") + func bodylessPUTSendsContentLengthZero() async throws { + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + StubResponse.status(.noContent) + } + defer { Task { try? await server.shutdown() } } + + let client = try CloudHypervisor.Client( + socketPath: URL(filePath: server.socketPath), + eventLoopGroup: Self.group + ) + try await client.vmBoot() + + let recorded = server.recordedRequests() + #expect(recorded.count == 1) + let req = try #require(recorded.first) + #expect(req.method == .PUT) + #expect(req.uri == "/api/v1/vm.boot") + #expect(req.body.isEmpty) + #expect(req.headers["Content-Length"].first == "0") + } + + // MARK: - vmShutdown + + @Test("vmShutdown sends PUT /api/v1/vm.shutdown with no body") + func vmShutdown() async throws { + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + StubResponse.status(.noContent) + } + defer { Task { try? await server.shutdown() } } + + let client = try CloudHypervisor.Client(socketPath: URL(filePath: server.socketPath), eventLoopGroup: Self.group) + try await client.vmShutdown() + + let recorded = server.recordedRequests() + #expect(recorded.count == 1) + #expect(recorded[0].method == .PUT) + #expect(recorded[0].uri == "/api/v1/vm.shutdown") + #expect(recorded[0].body.isEmpty) + } + + // MARK: - vmInfo + + @Test("vmInfo sends GET /api/v1/vm.info and decodes VmInfo") + func vmInfo() async throws { + let expectedConfig = CloudHypervisor.VmConfig( + cpus: .init(bootVcpus: 1, maxVcpus: 1), + memory: .init(size: 256 * 1024 * 1024), + payload: .init(kernel: "/boot/vmlinux"), + console: .init(mode: .Null), + serial: .init(mode: .Null) + ) + let expected = CloudHypervisor.VmInfo(config: expectedConfig, state: .Running) + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + (try? StubResponse.json(expected)) ?? StubResponse.ok() + } + defer { Task { try? await server.shutdown() } } + + let client = try CloudHypervisor.Client(socketPath: URL(filePath: server.socketPath), eventLoopGroup: Self.group) + let result = try await client.vmInfo() + + let recorded = server.recordedRequests() + #expect(recorded.count == 1) + #expect(recorded[0].method == .GET) + #expect(recorded[0].uri == "/api/v1/vm.info") + #expect(recorded[0].body.isEmpty) + #expect(result == expected) + } + + // MARK: - vmPause + + @Test("vmPause sends PUT /api/v1/vm.pause with no body") + func vmPause() async throws { + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + StubResponse.status(.noContent) + } + defer { Task { try? await server.shutdown() } } + + let client = try CloudHypervisor.Client(socketPath: URL(filePath: server.socketPath), eventLoopGroup: Self.group) + try await client.vmPause() + + let recorded = server.recordedRequests() + #expect(recorded.count == 1) + #expect(recorded[0].method == .PUT) + #expect(recorded[0].uri == "/api/v1/vm.pause") + #expect(recorded[0].body.isEmpty) + } + + // MARK: - vmResume + + @Test("vmResume sends PUT /api/v1/vm.resume with no body") + func vmResume() async throws { + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + StubResponse.status(.noContent) + } + defer { Task { try? await server.shutdown() } } + + let client = try CloudHypervisor.Client(socketPath: URL(filePath: server.socketPath), eventLoopGroup: Self.group) + try await client.vmResume() + + let recorded = server.recordedRequests() + #expect(recorded.count == 1) + #expect(recorded[0].method == .PUT) + #expect(recorded[0].uri == "/api/v1/vm.resume") + #expect(recorded[0].body.isEmpty) + } + + // MARK: - vmAddDisk + + @Test("vmAddDisk sends PUT /api/v1/vm.add-disk and returns PciDeviceInfo") + func vmAddDisk() async throws { + let pciInfo = CloudHypervisor.PciDeviceInfo(id: "_disk0", bdf: "0000:00:01.0") + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + (try? StubResponse.json(pciInfo)) ?? StubResponse.ok() + } + defer { Task { try? await server.shutdown() } } + + let config = CloudHypervisor.DiskConfig(path: "/tmp/disk.img", readonly: true, id: "_disk0") + let client = try CloudHypervisor.Client(socketPath: URL(filePath: server.socketPath), eventLoopGroup: Self.group) + let result = try await client.vmAddDisk(config) + + let recorded = server.recordedRequests() + #expect(recorded.count == 1) + #expect(recorded[0].method == .PUT) + #expect(recorded[0].uri == "/api/v1/vm.add-disk") + + let decoded = try JSONDecoder().decode(CloudHypervisor.DiskConfig.self, from: recorded[0].body) + #expect(decoded == config) + #expect(result == pciInfo) + } + + // MARK: - vmAddFs + + @Test("vmAddFs sends PUT /api/v1/vm.add-fs and returns PciDeviceInfo") + func vmAddFs() async throws { + let pciInfo = CloudHypervisor.PciDeviceInfo(id: "_disk0", bdf: "0000:00:01.0") + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + (try? StubResponse.json(pciInfo)) ?? StubResponse.ok() + } + defer { Task { try? await server.shutdown() } } + + let config = CloudHypervisor.FsConfig(tag: "myfs", socket: "/tmp/virtiofsd.sock", id: "_fs0") + let client = try CloudHypervisor.Client(socketPath: URL(filePath: server.socketPath), eventLoopGroup: Self.group) + let result = try await client.vmAddFs(config) + + let recorded = server.recordedRequests() + #expect(recorded.count == 1) + #expect(recorded[0].method == .PUT) + #expect(recorded[0].uri == "/api/v1/vm.add-fs") + + let decoded = try JSONDecoder().decode(CloudHypervisor.FsConfig.self, from: recorded[0].body) + #expect(decoded == config) + #expect(result == pciInfo) + } + + // MARK: - vmAddNet + + @Test("vmAddNet sends PUT /api/v1/vm.add-net and returns PciDeviceInfo") + func vmAddNet() async throws { + let pciInfo = CloudHypervisor.PciDeviceInfo(id: "_disk0", bdf: "0000:00:01.0") + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + (try? StubResponse.json(pciInfo)) ?? StubResponse.ok() + } + defer { Task { try? await server.shutdown() } } + + let config = CloudHypervisor.NetConfig(tap: "tap0", mac: "AA:BB:CC:DD:EE:FF", id: "_net0") + let client = try CloudHypervisor.Client(socketPath: URL(filePath: server.socketPath), eventLoopGroup: Self.group) + let result = try await client.vmAddNet(config) + + let recorded = server.recordedRequests() + #expect(recorded.count == 1) + #expect(recorded[0].method == .PUT) + #expect(recorded[0].uri == "/api/v1/vm.add-net") + + let decoded = try JSONDecoder().decode(CloudHypervisor.NetConfig.self, from: recorded[0].body) + #expect(decoded == config) + #expect(result == pciInfo) + } + + // MARK: - vmAddVsock + + @Test("vmAddVsock sends PUT /api/v1/vm.add-vsock and returns PciDeviceInfo") + func vmAddVsock() async throws { + let pciInfo = CloudHypervisor.PciDeviceInfo(id: "_disk0", bdf: "0000:00:01.0") + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + (try? StubResponse.json(pciInfo)) ?? StubResponse.ok() + } + defer { Task { try? await server.shutdown() } } + + let config = CloudHypervisor.VsockConfig(cid: 3, socket: "/tmp/vsock.sock", id: "_vsock0") + let client = try CloudHypervisor.Client(socketPath: URL(filePath: server.socketPath), eventLoopGroup: Self.group) + let result = try await client.vmAddVsock(config) + + let recorded = server.recordedRequests() + #expect(recorded.count == 1) + #expect(recorded[0].method == .PUT) + #expect(recorded[0].uri == "/api/v1/vm.add-vsock") + + let decoded = try JSONDecoder().decode(CloudHypervisor.VsockConfig.self, from: recorded[0].body) + #expect(decoded == config) + #expect(result == pciInfo) + } + + // MARK: - vmRemoveDevice + + @Test("vmRemoveDevice sends PUT /api/v1/vm.remove-device with id body") + func vmRemoveDevice() async throws { + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + StubResponse.status(.noContent) + } + defer { Task { try? await server.shutdown() } } + + let client = try CloudHypervisor.Client(socketPath: URL(filePath: server.socketPath), eventLoopGroup: Self.group) + try await client.vmRemoveDevice(id: "_disk0") + + let recorded = server.recordedRequests() + #expect(recorded.count == 1) + #expect(recorded[0].method == .PUT) + #expect(recorded[0].uri == "/api/v1/vm.remove-device") + + struct RemoveRequest: Decodable { let id: String } + let decoded = try JSONDecoder().decode(RemoveRequest.self, from: recorded[0].body) + #expect(decoded.id == "_disk0") + } + + // MARK: - Malformed JSON + + @Test("Malformed JSON on 200 response throws .decoding") + func malformedJSONThrowsDecoding() async throws { + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + StubResponse.ok(Data("not json".utf8)) + } + defer { Task { try? await server.shutdown() } } + + let socketURL = URL(filePath: server.socketPath) + let client = try CloudHypervisor.Client(socketPath: socketURL, eventLoopGroup: Self.group) + + struct Dummy: Decodable, Sendable {} + + do { + let _: Dummy = try await client.get("/api/v1/vmm.info") + Issue.record("Expected .decoding error but call succeeded") + } catch let err as CloudHypervisor.Error { + guard case .decoding = err else { + Issue.record("Expected .decoding, got \(err)") + return + } + // Expected path — decoding error correctly surfaced. + } catch { + Issue.record("Expected CloudHypervisor.Error but got \(error)") + } + } + + // MARK: - Shutdown ordering + + /// Regression: with a caller-supplied group, `Client.shutdown()` must + /// drain the underlying HTTPClient before the caller tears the group + /// down. Without this, AsyncHTTPClient's deferred connection-cleanup + /// runs on the (now-dead) event loops and SwiftNIO prints + /// "Cannot schedule tasks on an EventLoop that has already shut down". + /// The singleton group used by the rest of this suite can't surface + /// the bug because it never shuts down, so we spin up a dedicated + /// group for the client here. The server stays on the singleton so + /// only the client-side AHC channels are at risk when we shut the + /// owned group down — otherwise the server's own pipeline cleanup + /// would race the same group teardown and confound the test. + @Test("Client.shutdown drains HTTPClient before a caller-owned group is torn down") + func shutdownDrainsHTTPClientBeforeGroup() async throws { + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + StubResponse.status(.noContent) + } + defer { Task { try? await server.shutdown() } } + + let clientGroup = MultiThreadedEventLoopGroup(numberOfThreads: 2) + let client = try CloudHypervisor.Client( + socketPath: URL(filePath: server.socketPath), + eventLoopGroup: clientGroup + ) + // Round-trip a real request so AHC actually opens a connection + // and parks its post-response cleanup on `clientGroup`. + try await client.vmmShutdown() + + try await client.shutdown() + // Idempotent — a second call must not throw. + try await client.shutdown() + + // The owned group should now be safe to tear down without NIO + // warnings. + try await clientGroup.shutdownGracefully() + } + + @Test("Client.shutdown also tears down the group when the client owns it") + func shutdownOwnsGroup() async throws { + let server = try await StubHTTPServer(eventLoopGroup: Self.group) { _ in + StubResponse.status(.noContent) + } + defer { Task { try? await server.shutdown() } } + + // No eventLoopGroup → client owns its own. + let client = try CloudHypervisor.Client( + socketPath: URL(filePath: server.socketPath) + ) + try await client.vmmShutdown() + try await client.shutdown() + // Idempotent. + try await client.shutdown() + } +} diff --git a/Tests/CloudHypervisorTests/ErrorsTests.swift b/Tests/CloudHypervisorTests/ErrorsTests.swift new file mode 100644 index 00000000..4674222c --- /dev/null +++ b/Tests/CloudHypervisorTests/ErrorsTests.swift @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import Foundation +import NIOHTTP1 +import Testing + +@testable import CloudHypervisor + +@Suite("CloudHypervisor.Error") +struct ErrorsTests { + @Test("http case carries status and body") + func httpCase() { + let err = CloudHypervisor.Error.http(status: .badRequest, body: Data("nope".utf8)) + guard case .http(let status, let body) = err else { + Issue.record("expected .http") + return + } + #expect(status == .badRequest) + #expect(String(data: body, encoding: .utf8) == "nope") + } +} diff --git a/Tests/CloudHypervisorTests/StubHTTPServer.swift b/Tests/CloudHypervisorTests/StubHTTPServer.swift new file mode 100644 index 00000000..9ea545af --- /dev/null +++ b/Tests/CloudHypervisorTests/StubHTTPServer.swift @@ -0,0 +1,203 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import Foundation +import NIOConcurrencyHelpers +import NIOCore +import NIOHTTP1 +import NIOPosix + +// MARK: - StubRequest / StubResponse + +/// An inbound HTTP request captured by the stub server. +struct StubRequest: Sendable { + let method: HTTPMethod + let uri: String + let body: Data + let headers: HTTPHeaders +} + +/// A canned HTTP response produced by the stub server. +struct StubResponse: Sendable { + let status: HTTPResponseStatus + let body: Data + let headers: HTTPHeaders + + static func ok(_ body: Data = .init()) -> StubResponse { + StubResponse(status: .ok, body: body, headers: [:]) + } + + static func json(_ value: T) throws -> StubResponse { + let data = try JSONEncoder().encode(value) + var headers = HTTPHeaders() + headers.add(name: "Content-Type", value: "application/json") + return StubResponse(status: .ok, body: data, headers: headers) + } + + static func status(_ status: HTTPResponseStatus, body: Data = .init()) -> StubResponse { + StubResponse(status: status, body: body, headers: [:]) + } +} + +// MARK: - StubHTTPServer + +/// An in-process HTTP/1.1 server bound to a Unix Domain Socket, used in tests. +/// +/// Example: +/// ```swift +/// let server = try await StubHTTPServer(eventLoopGroup: group) { req in +/// return StubResponse.ok(Data("{}".utf8)) +/// } +/// defer { Task { try? await server.shutdown() } } +/// ``` +final class StubHTTPServer: Sendable { + /// The path to the Unix Domain Socket this server is bound to. + let socketPath: String + + private let channel: Channel + /// Recorded requests, protected by a lock so the test thread can read safely. + private let requests: NIOLockedValueBox<[StubRequest]> + + init( + eventLoopGroup: any EventLoopGroup, + handler: @escaping @Sendable (StubRequest) -> StubResponse + ) async throws { + let sockPath = FileManager.default.temporaryDirectory + .appendingPathComponent("ch-stub-\(UUID().uuidString).sock") + .path + + let requestsBox = NIOLockedValueBox<[StubRequest]>([]) + + let bootstrap = ServerBootstrap(group: eventLoopGroup) + .serverChannelOption(.backlog, value: 256) + .serverChannelOption(.socketOption(.so_reuseaddr), value: 1) + .childChannelInitializer { channel in + channel.eventLoop.makeCompletedFuture { + try channel.pipeline.syncOperations.configureHTTPServerPipeline( + withPipeliningAssistance: false + ) + try channel.pipeline.syncOperations.addHandler( + StubRequestHandler(userHandler: handler, requests: requestsBox) + ) + } + } + + let boundChannel = + try await bootstrap + .bind(unixDomainSocketPath: sockPath, cleanupExistingSocketFile: true) + .get() + + self.socketPath = sockPath + self.channel = boundChannel + self.requests = requestsBox + } + + /// Stop accepting connections and close the listening socket. + func shutdown() async throws { + try await channel.close().get() + try? FileManager.default.removeItem(atPath: socketPath) + } + + /// Returns all requests recorded so far. + func recordedRequests() -> [StubRequest] { + requests.withLockedValue { $0 } + } +} + +// MARK: - StubRequestHandler + +/// Handles a single inbound HTTP/1.1 request, invokes the user handler, and +/// writes the stub response. +/// +/// All ChannelHandler callbacks run on the channel's event loop, so the mutable +/// inbound-state fields need no external synchronisation. The shared `requests` +/// box is still locked because the test thread reads it from outside the loop. +private final class StubRequestHandler: ChannelInboundHandler, @unchecked Sendable { + typealias InboundIn = HTTPServerRequestPart + typealias OutboundOut = HTTPServerResponsePart + + private let userHandler: @Sendable (StubRequest) -> StubResponse + private let requests: NIOLockedValueBox<[StubRequest]> + + // Mutable inbound state — only touched on the event loop. + private var pendingMethod: HTTPMethod? + private var pendingURI: String? + private var pendingHeaders: HTTPHeaders = [:] + private var pendingBody: [UInt8] = [] + + init( + userHandler: @escaping @Sendable (StubRequest) -> StubResponse, + requests: NIOLockedValueBox<[StubRequest]> + ) { + self.userHandler = userHandler + self.requests = requests + } + + func channelRead(context: ChannelHandlerContext, data: NIOAny) { + switch unwrapInboundIn(data) { + case .head(let head): + pendingMethod = head.method + pendingURI = head.uri + pendingHeaders = head.headers + pendingBody = [] + case .body(var buf): + if let bytes = buf.readBytes(length: buf.readableBytes) { + pendingBody.append(contentsOf: bytes) + } + case .end: + guard let method = pendingMethod, let uri = pendingURI else { + context.close(promise: nil) + return + } + let request = StubRequest( + method: method, + uri: uri, + body: Data(pendingBody), + headers: pendingHeaders + ) + requests.withLockedValue { $0.append(request) } + let stubResp = userHandler(request) + writeResponse(context: context, response: stubResp) + } + } + + private func writeResponse(context: ChannelHandlerContext, response: StubResponse) { + var respHeaders = response.headers + respHeaders.replaceOrAdd(name: "Content-Length", value: "\(response.body.count)") + respHeaders.replaceOrAdd(name: "Connection", value: "close") + + let head = HTTPResponseHead(version: .http1_1, status: response.status, headers: respHeaders) + context.write(wrapOutboundOut(.head(head)), promise: nil) + + if !response.body.isEmpty { + var buf = context.channel.allocator.buffer(capacity: response.body.count) + buf.writeBytes(response.body) + context.write(wrapOutboundOut(.body(.byteBuffer(buf))), promise: nil) + } + + // Use NIOLoopBound to safely capture `context` in a @Sendable closure. + // The bound asserts event-loop access; the close runs on the same loop + // as the flush completion, which is correct. + let boundContext = NIOLoopBound(context, eventLoop: context.eventLoop) + context.writeAndFlush(wrapOutboundOut(.end(nil))).whenComplete { _ in + boundContext.value.close(promise: nil) + } + } + + func errorCaught(context: ChannelHandlerContext, error: any Error) { + context.close(promise: nil) + } +} diff --git a/Tests/CloudHypervisorTests/TypesTests.swift b/Tests/CloudHypervisorTests/TypesTests.swift new file mode 100644 index 00000000..8826643d --- /dev/null +++ b/Tests/CloudHypervisorTests/TypesTests.swift @@ -0,0 +1,328 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import Foundation +import Testing + +@testable import CloudHypervisor + +@Suite("CloudHypervisor types") +struct TypesTests { + @Test("VmConfig round-trips through JSON") + func vmConfigRoundTrip() throws { + let cfg = CloudHypervisor.VmConfig( + cpus: CloudHypervisor.CpusConfig(bootVcpus: 2, maxVcpus: 2), + memory: CloudHypervisor.MemoryConfig(size: UInt64(1) << 30), + payload: CloudHypervisor.PayloadConfig( + kernel: "/path/to/vmlinux", + cmdline: "init=/sbin/vminitd ro" + ), + disks: nil, + net: nil, + fs: nil, + vsock: nil, + console: CloudHypervisor.ConsoleConfig(mode: .Null), + serial: CloudHypervisor.ConsoleConfig(mode: .Null) + ) + let encoder = JSONEncoder() + encoder.outputFormatting = [.sortedKeys] + let data = try encoder.encode(cfg) + let decoded = try JSONDecoder().decode(CloudHypervisor.VmConfig.self, from: data) + #expect(decoded == cfg) + + // Verify snake_case keys are emitted. + let jsonString = try #require(String(data: data, encoding: .utf8)) + #expect(jsonString.contains("\"boot_vcpus\"")) + #expect(jsonString.contains("\"max_vcpus\"")) + } + + @Test("CpusConfig round-trips through JSON") + func cpusConfigRoundTrip() throws { + let cfg = CloudHypervisor.CpusConfig(bootVcpus: 4, maxVcpus: 8) + let data = try JSONEncoder().encode(cfg) + let decoded = try JSONDecoder().decode(CloudHypervisor.CpusConfig.self, from: data) + #expect(decoded == cfg) + } + + @Test("MemoryConfig round-trips through JSON") + func memoryConfigRoundTrip() throws { + let cfg = CloudHypervisor.MemoryConfig(size: UInt64(2) << 30, hotplugSize: UInt64(1) << 30, mergeable: true) + let data = try JSONEncoder().encode(cfg) + let decoded = try JSONDecoder().decode(CloudHypervisor.MemoryConfig.self, from: data) + #expect(decoded == cfg) + } + + @Test("MemoryConfig omits nil optional fields from JSON") + func memoryConfigNilOmission() throws { + let cfg = CloudHypervisor.MemoryConfig(size: UInt64(1) << 30) + let data = try JSONEncoder().encode(cfg) + let jsonString = try #require(String(data: data, encoding: .utf8)) + #expect(!jsonString.contains("\"hotplug_size\"")) + #expect(!jsonString.contains("\"mergeable\"")) + } + + @Test("PayloadConfig round-trips through JSON") + func payloadConfigRoundTrip() throws { + let cfg = CloudHypervisor.PayloadConfig( + kernel: "/boot/vmlinux", + initramfs: "/boot/initrd", + cmdline: "console=ttyS0" + ) + let data = try JSONEncoder().encode(cfg) + let decoded = try JSONDecoder().decode(CloudHypervisor.PayloadConfig.self, from: data) + #expect(decoded == cfg) + } + + @Test("ConsoleConfig round-trips through JSON with capitalized mode strings") + func consoleConfigRoundTrip() throws { + for mode in [ + CloudHypervisor.ConsoleConfig.Mode.Off, + .Pty, + .Tty, + .File, + .Socket, + .Null, + ] { + let cfg = CloudHypervisor.ConsoleConfig(mode: mode) + let data = try JSONEncoder().encode(cfg) + let decoded = try JSONDecoder().decode(CloudHypervisor.ConsoleConfig.self, from: data) + #expect(decoded == cfg) + // CH uses capitalized strings: "Off", "Pty", etc. + let jsonString = try #require(String(data: data, encoding: .utf8)) + #expect(jsonString.contains("\"" + mode.rawValue + "\"")) + } + } + + @Test("DiskConfig round-trips through JSON") + func diskConfigRoundTrip() throws { + let cfg = CloudHypervisor.DiskConfig( + path: "/var/lib/disk.raw", + readonly: true, + direct: false, + iommu: nil, + id: "disk0", + pciSegment: 0 + ) + let encoder = JSONEncoder() + encoder.outputFormatting = [.sortedKeys] + let data = try encoder.encode(cfg) + let decoded = try JSONDecoder().decode(CloudHypervisor.DiskConfig.self, from: data) + #expect(decoded == cfg) + + // Verify snake_case key for pci_segment. + let jsonString = try #require(String(data: data, encoding: .utf8)) + #expect(jsonString.contains("\"pci_segment\"")) + } + + @Test("DiskConfig omits nil optional fields from JSON") + func diskConfigNilOmission() throws { + let cfg = CloudHypervisor.DiskConfig(path: "/var/lib/disk.raw") + let data = try JSONEncoder().encode(cfg) + let jsonString = try #require(String(data: data, encoding: .utf8)) + #expect(!jsonString.contains("\"readonly\"")) + #expect(!jsonString.contains("\"direct\"")) + #expect(!jsonString.contains("\"iommu\"")) + #expect(!jsonString.contains("\"id\"")) + #expect(!jsonString.contains("\"pci_segment\"")) + } + + @Test("NetConfig round-trips through JSON") + func netConfigRoundTrip() throws { + let cfg = CloudHypervisor.NetConfig( + tap: "tap0", + ip: "192.168.0.1", + mask: "255.255.255.0", + mac: "AA:BB:CC:DD:EE:FF", + mtu: 1500, + numQueues: 2, + queueSize: 256, + id: "net0" + ) + let encoder = JSONEncoder() + encoder.outputFormatting = [.sortedKeys] + let data = try encoder.encode(cfg) + let decoded = try JSONDecoder().decode(CloudHypervisor.NetConfig.self, from: data) + #expect(decoded == cfg) + + // Verify snake_case keys. + let jsonString = try #require(String(data: data, encoding: .utf8)) + #expect(jsonString.contains("\"num_queues\"")) + #expect(jsonString.contains("\"queue_size\"")) + } + + @Test("FsConfig round-trips through JSON") + func fsConfigRoundTrip() throws { + let cfg = CloudHypervisor.FsConfig( + tag: "virtiofs0", + socket: "/run/virtiofs.sock", + numQueues: 1, + queueSize: 1024, + id: "fs0", + pciSegment: nil + ) + let data = try JSONEncoder().encode(cfg) + let decoded = try JSONDecoder().decode(CloudHypervisor.FsConfig.self, from: data) + #expect(decoded == cfg) + } + + @Test("VsockConfig round-trips through JSON") + func vsockConfigRoundTrip() throws { + let cfg = CloudHypervisor.VsockConfig( + cid: 3, + socket: "/run/vsock.sock", + iommu: false, + id: "vsock0" + ) + let data = try JSONEncoder().encode(cfg) + let decoded = try JSONDecoder().decode(CloudHypervisor.VsockConfig.self, from: data) + #expect(decoded == cfg) + } + + @Test("PciDeviceInfo round-trips through JSON") + func pciDeviceInfoRoundTrip() throws { + let info = CloudHypervisor.PciDeviceInfo(id: "disk0", bdf: "0000:00:03.0") + let data = try JSONEncoder().encode(info) + let decoded = try JSONDecoder().decode(CloudHypervisor.PciDeviceInfo.self, from: data) + #expect(decoded == info) + } + + // MARK: - VmInfo / VmState + + @Test("VmState round-trips through JSON with CH literal strings") + func vmStateRoundTrip() throws { + for state in [ + CloudHypervisor.VmState.Created, + .Running, + .Shutdown, + .Paused, + .BreakPoint, + ] { + let data = try JSONEncoder().encode(state) + let decoded = try JSONDecoder().decode(CloudHypervisor.VmState.self, from: data) + #expect(decoded == state) + // CH uses the capitalized raw string literals exactly. + let jsonString = try #require(String(data: data, encoding: .utf8)) + #expect(jsonString.contains("\"" + state.rawValue + "\"")) + } + } + + @Test("VmInfo round-trips through JSON") + func vmInfoRoundTrip() throws { + let cfg = CloudHypervisor.VmConfig( + cpus: CloudHypervisor.CpusConfig(bootVcpus: 2, maxVcpus: 2), + memory: CloudHypervisor.MemoryConfig(size: UInt64(1) << 30), + payload: CloudHypervisor.PayloadConfig(kernel: "/boot/vmlinux"), + console: CloudHypervisor.ConsoleConfig(mode: .Null), + serial: CloudHypervisor.ConsoleConfig(mode: .Null) + ) + let info = CloudHypervisor.VmInfo( + config: cfg, + state: .Running, + memoryActualSize: 1_073_741_824 + ) + let encoder = JSONEncoder() + encoder.outputFormatting = [.sortedKeys] + let data = try encoder.encode(info) + let decoded = try JSONDecoder().decode(CloudHypervisor.VmInfo.self, from: data) + #expect(decoded == info) + + // Verify snake_case key is emitted. + let jsonString = try #require(String(data: data, encoding: .utf8)) + #expect(jsonString.contains("\"memory_actual_size\"")) + } + + @Test("VmInfo omits nil optional fields from JSON") + func vmInfoNilOmission() throws { + let cfg = CloudHypervisor.VmConfig( + cpus: CloudHypervisor.CpusConfig(bootVcpus: 1, maxVcpus: 1), + memory: CloudHypervisor.MemoryConfig(size: UInt64(512) << 20), + payload: CloudHypervisor.PayloadConfig(kernel: "/boot/vmlinux"), + console: CloudHypervisor.ConsoleConfig(mode: .Off), + serial: CloudHypervisor.ConsoleConfig(mode: .Off) + ) + let info = CloudHypervisor.VmInfo(config: cfg, state: .Created) + let data = try JSONEncoder().encode(info) + let jsonString = try #require(String(data: data, encoding: .utf8)) + #expect(!jsonString.contains("\"memory_actual_size\"")) + } + + // MARK: - VmmPingResponse + + @Test("VmmPingResponse round-trips through JSON") + func vmmPingResponseRoundTrip() throws { + let ping = CloudHypervisor.VmmPingResponse( + version: "v40.0", + pid: 12345, + features: ["acpi", "kvm"], + buildVersion: "abc123" + ) + let encoder = JSONEncoder() + encoder.outputFormatting = [.sortedKeys] + let data = try encoder.encode(ping) + let decoded = try JSONDecoder().decode(CloudHypervisor.VmmPingResponse.self, from: data) + #expect(decoded == ping) + + let jsonString = try #require(String(data: data, encoding: .utf8)) + #expect(jsonString.contains("\"build_version\"")) + } + + @Test("VmmPingResponse omits nil optional fields from JSON") + func vmmPingResponseNilOmission() throws { + let ping = CloudHypervisor.VmmPingResponse(version: "v40.0") + let data = try JSONEncoder().encode(ping) + let jsonString = try #require(String(data: data, encoding: .utf8)) + #expect(!jsonString.contains("\"pid\"")) + #expect(!jsonString.contains("\"features\"")) + #expect(!jsonString.contains("\"build_version\"")) + } + + // MARK: - VmmInfo + + @Test("VmmInfo round-trips through JSON") + func vmmInfoRoundTrip() throws { + let cfg = CloudHypervisor.VmConfig( + cpus: CloudHypervisor.CpusConfig(bootVcpus: 2, maxVcpus: 2), + memory: CloudHypervisor.MemoryConfig(size: UInt64(1) << 30), + payload: CloudHypervisor.PayloadConfig(kernel: "/boot/vmlinux"), + console: CloudHypervisor.ConsoleConfig(mode: .Null), + serial: CloudHypervisor.ConsoleConfig(mode: .Null) + ) + let vmmInfo = CloudHypervisor.VmmInfo( + version: "v40.0", + pid: 99, + buildVersion: "deadbeef", + config: cfg + ) + let encoder = JSONEncoder() + encoder.outputFormatting = [.sortedKeys] + let data = try encoder.encode(vmmInfo) + let decoded = try JSONDecoder().decode(CloudHypervisor.VmmInfo.self, from: data) + #expect(decoded == vmmInfo) + + let jsonString = try #require(String(data: data, encoding: .utf8)) + #expect(jsonString.contains("\"build_version\"")) + } + + @Test("VmmInfo omits nil optional fields from JSON") + func vmmInfoNilOmission() throws { + let vmmInfo = CloudHypervisor.VmmInfo(version: "v40.0") + let data = try JSONEncoder().encode(vmmInfo) + let jsonString = try #require(String(data: data, encoding: .utf8)) + #expect(!jsonString.contains("\"pid\"")) + #expect(!jsonString.contains("\"build_version\"")) + #expect(!jsonString.contains("\"config\"")) + } +} diff --git a/Tests/ContainerizationTests/BridgeStateFileTests.swift b/Tests/ContainerizationTests/BridgeStateFileTests.swift new file mode 100644 index 00000000..77c4de57 --- /dev/null +++ b/Tests/ContainerizationTests/BridgeStateFileTests.swift @@ -0,0 +1,60 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import Foundation +import Testing + +@testable import Containerization + +@Suite("Bridge state file") +struct BridgeStateFileTests { + @Test("round-trip JSON encode/decode (NAT enabled)") + func roundTripNAT() throws { + let s = BridgeState(natEnabled: true, prevIpForward: "0", egressInterface: "eth0") + let data = try s.encode() + let s2 = try BridgeState.decode(data) + #expect(s2.natEnabled == true) + #expect(s2.prevIpForward == "0") + #expect(s2.egressInterface == "eth0") + } + + @Test("round-trip JSON encode/decode (NAT disabled)") + func roundTripNoNAT() throws { + let s = BridgeState(natEnabled: false) + let data = try s.encode() + let s2 = try BridgeState.decode(data) + #expect(s2.natEnabled == false) + #expect(s2.prevIpForward == nil) + #expect(s2.egressInterface == nil) + } + + @Test("legacy state file without natEnabled defaults to true") + func legacyDefaultsToNATEnabled() throws { + // Pre-flag JSON shape, written by an older version of BridgeManager. + let legacy = #"{"prevIpForward":"0","egressInterface":"eth0"}"# + let s = try BridgeState.decode(Data(legacy.utf8)) + #expect(s.natEnabled == true) + #expect(s.prevIpForward == "0") + #expect(s.egressInterface == "eth0") + } + + @Test("decode rejects malformed input") + func malformed() { + #expect(throws: (any Error).self) { + _ = try BridgeState.decode(Data("not json".utf8)) + } + } +} diff --git a/Tests/ContainerizationTests/CHInterfaceTests.swift b/Tests/ContainerizationTests/CHInterfaceTests.swift new file mode 100644 index 00000000..55a2fc3f --- /dev/null +++ b/Tests/ContainerizationTests/CHInterfaceTests.swift @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +#if os(Linux) +import CloudHypervisor +import ContainerizationExtras +import Testing + +@testable import Containerization + +@Suite("TAPInterface") +struct CHInterfaceTests { + @Test("chNetConfig populates tap, mac, and mtu and leaves IP fields nil") + func chNetConfigShape() throws { + let cidr = try CIDRv4("192.168.64.3/24") + let gateway = try IPv4Address("192.168.64.1") + let mac = try MACAddress("02:42:ac:11:00:02") + let iface = TAPInterface( + tapName: "tap0", + ipv4Address: cidr, + ipv4Gateway: gateway, + macAddress: mac, + mtu: 1500 + ) + + let cfg = try iface.chNetConfig() + #expect(cfg.tap == "tap0") + #expect(cfg.mac == mac.description) + #expect(cfg.mtu == 1500) + #expect(cfg.ip == nil) + #expect(cfg.mask == nil) + #expect(cfg.id == nil) + } + + @Test("chNetConfig omits mac when macAddress is nil") + func chNetConfigOmitsMac() throws { + let cidr = try CIDRv4("10.0.0.5/24") + let iface = TAPInterface(tapName: "ch-tap1", ipv4Address: cidr) + + let cfg = try iface.chNetConfig() + #expect(cfg.tap == "ch-tap1") + #expect(cfg.mac == nil) + #expect(cfg.mtu == 1500) + } + + @Test("TAPInterface satisfies Interface") + func interfaceConformance() throws { + let cidr = try CIDRv4("192.168.64.3/24") + let iface: any Interface = TAPInterface(tapName: "tap0", ipv4Address: cidr) + #expect(iface.ipv4Address == cidr) + #expect(iface.ipv4Gateway == nil) + #expect(iface.macAddress == nil) + #expect(iface.mtu == 1500) + } +} +#endif diff --git a/Tests/ContainerizationTests/HostDefaultRouteTests.swift b/Tests/ContainerizationTests/HostDefaultRouteTests.swift new file mode 100644 index 00000000..d21bb628 --- /dev/null +++ b/Tests/ContainerizationTests/HostDefaultRouteTests.swift @@ -0,0 +1,66 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import Testing + +@testable import Containerization + +@Suite("Host default route parsing") +struct HostDefaultRouteTests { + // Header + one default-route row (Destination=00000000, Flags=0003 has RTF_GATEWAY=0x2). + static let singleDefault = """ + Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT + eth0\t00000000\t0102A8C0\t0003\t0\t0\t0\t00000000\t0\t0\t0 + eth0\t0002A8C0\t00000000\t0001\t0\t0\t0\tFFFFFF00\t0\t0\t0 + """ + + static let noDefault = """ + Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT + eth0\t0002A8C0\t00000000\t0001\t0\t0\t0\tFFFFFF00\t0\t0\t0 + """ + + static let multiDefault = """ + Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT + wlan0\t00000000\t0102A8C0\t0003\t0\t0\t600\t00000000\t0\t0\t0 + eth0\t00000000\t0102A8C0\t0003\t0\t0\t100\t00000000\t0\t0\t0 + """ + + @Test("returns iface for a single default route") + func singleDefaultRoute() { + #expect(HostDefaultRoute.parseEgress(procNetRoute: Self.singleDefault) == "eth0") + } + + @Test("returns nil when no default route") + func noDefaultRoute() { + #expect(HostDefaultRoute.parseEgress(procNetRoute: Self.noDefault) == nil) + } + + @Test("returns lowest-metric default when multiple") + func multipleDefaultRoutes() { + // eth0 has metric 100; wlan0 has metric 600. + #expect(HostDefaultRoute.parseEgress(procNetRoute: Self.multiDefault) == "eth0") + } + + @Test("ignores rows missing RTF_GATEWAY flag") + func noGatewayFlag() { + // Same as singleDefault but flags=0001 (no RTF_GATEWAY=0x2). + let input = """ + Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT + eth0\t00000000\t0102A8C0\t0001\t0\t0\t0\t00000000\t0\t0\t0 + """ + #expect(HostDefaultRoute.parseEgress(procNetRoute: input) == nil) + } +} diff --git a/Tests/ContainerizationTests/MountCHTests.swift b/Tests/ContainerizationTests/MountCHTests.swift new file mode 100644 index 00000000..4b809ea2 --- /dev/null +++ b/Tests/ContainerizationTests/MountCHTests.swift @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import CloudHypervisor +import Testing + +@testable import Containerization + +@Suite("Mount+CH") +struct MountCHTests { + @Test("block mount without options produces DiskConfig with readonly=false") + func blockNoOptions() { + let mount = Mount.block(format: "ext4", source: "/foo.img", destination: "/data") + let cfg = mount.chDiskConfig(id: "blk-0") + #expect(cfg?.path == "/foo.img") + #expect(cfg?.readonly == false) + #expect(cfg?.id == "blk-0") + #expect(cfg?.direct == nil) + #expect(cfg?.iommu == nil) + #expect(cfg?.pciSegment == nil) + } + + @Test("block mount with 'ro' option produces DiskConfig with readonly=true") + func blockReadOnly() { + let mount = Mount.block(format: "ext4", source: "/foo.img", destination: "/data", options: ["ro"]) + let cfg = mount.chDiskConfig(id: "blk-1") + #expect(cfg?.readonly == true) + } + + @Test("non-block mount returns nil from chDiskConfig") + func chDiskConfigNilForNonBlock() { + let share = Mount.share(source: "/host", destination: "/guest") + #expect(share.chDiskConfig(id: "x") == nil) + + let any = Mount.any(type: "tmpfs", source: "tmpfs", destination: "/tmp") + #expect(any.chDiskConfig(id: "x") == nil) + } + + @Test("share mount produces FsConfig with tag and socket") + func shareMount() { + let mount = Mount.share(source: "/host/dir", destination: "/guest/dir") + let cfg = mount.chFsConfig(tag: "share0", socketPath: "/tmp/vfs.sock", id: "fs-0") + #expect(cfg?.tag == "share0") + #expect(cfg?.socket == "/tmp/vfs.sock") + #expect(cfg?.id == "fs-0") + #expect(cfg?.numQueues == nil) + #expect(cfg?.queueSize == nil) + #expect(cfg?.pciSegment == nil) + } + + @Test("non-share mount returns nil from chFsConfig") + func chFsConfigNilForNonShare() { + let block = Mount.block(format: "ext4", source: "/foo.img", destination: "/data") + #expect(block.chFsConfig(tag: "t", socketPath: "/s", id: "x") == nil) + + let any = Mount.any(type: "tmpfs", source: "tmpfs", destination: "/tmp") + #expect(any.chFsConfig(tag: "t", socketPath: "/s", id: "x") == nil) + } +} diff --git a/Tests/ContainerizationTests/TAPNameDerivationTests.swift b/Tests/ContainerizationTests/TAPNameDerivationTests.swift new file mode 100644 index 00000000..7f52500b --- /dev/null +++ b/Tests/ContainerizationTests/TAPNameDerivationTests.swift @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +#if os(Linux) +import Testing + +@testable import Containerization + +@Suite("TAP name derivation") +struct TAPNameDerivationTests { + @Test("name is deterministic for a given id") + func deterministic() { + let a = LinuxBridgedNetwork.derivedTAPName(forID: "container-abc") + let b = LinuxBridgedNetwork.derivedTAPName(forID: "container-abc") + #expect(a == b) + } + + @Test("different ids produce different names") + func differentIds() { + let a = LinuxBridgedNetwork.derivedTAPName(forID: "alpha") + let b = LinuxBridgedNetwork.derivedTAPName(forID: "beta") + #expect(a != b) + } + + @Test("name fits within IFNAMSIZ - 1 (15 chars)") + func ifnamsizFit() { + for id in ["a", "short", "a-much-longer-container-id-that-exceeds-typical-bounds"] { + let n = LinuxBridgedNetwork.derivedTAPName(forID: id) + #expect(n.count <= 15, "name '\(n)' exceeds IFNAMSIZ-1") + } + } + + @Test("name uses czt- prefix") + func prefix() { + let n = LinuxBridgedNetwork.derivedTAPName(forID: "anything") + #expect(n.hasPrefix("czt-")) + } + + @Test("hex suffix is 10 chars") + func suffixLength() { + let n = LinuxBridgedNetwork.derivedTAPName(forID: "anything") + // "czt-" is 4 chars; total 14. + #expect(n.count == 14) + } +} +#endif diff --git a/docs/x86_64-build.md b/docs/x86_64-build.md new file mode 100644 index 00000000..5d4e2c4f --- /dev/null +++ b/docs/x86_64-build.md @@ -0,0 +1,228 @@ +# x86_64 Deployment Build + +`make dist-x86_64` produces a self-contained x86_64 Linux deployment tarball +at `bin/containerization-x86_64-.tar.gz`. The build runs entirely inside +the aarch64 Linux dev container — there is no host tooling requirement beyond +`make`, `container`, and the prerequisites the dev image installs. + +The tarball ships everything needed to run a Containerization VM on an x86_64 +Linux host: the `cctl` host binary, the `cloud-hypervisor` VMM, the +`virtiofsd` filesystem daemon, an x86_64 Linux kernel, and an `initfs.ext4` +guest rootfs containing `vminitd` + `vmexec`. + +`cctl`, `cloud-hypervisor`, and `vminitd`/`vmexec` are statically linked +against musl, so they run on any x86_64 Linux. **`virtiofsd` is dynamically +linked against glibc 2.35+**; the deployment host must provide glibc +≥ 2.35 (Ubuntu 22.04 / Debian 12 / RHEL 9 era) plus `libseccomp.so.2` and +`libcap-ng.so.0`. Both are present by default on essentially every server +distro shipped in the last few years. + +## Prerequisites + +Before the first `make dist-x86_64`: + +1. **Source checkouts under `.local/`** — pinned by you, not fetched by the + build. There is no fetch target; clone the revision you want shipped: + + ```sh + git clone -b v52.0 https://github.com/cloud-hypervisor/cloud-hypervisor \ + .local/cloud-hypervisor + git clone https://gitlab.com/virtio-fs/virtiofsd .local/virtiofsd + ``` + +2. **An x86_64 kernel** at `kernel/vmlinuz-x86_64` (preferred) or + `kernel/vmlinux-x86_64`. Build via `make -C kernel TARGET_ARCH=x86_64`. + The build fails hard if neither exists — a tarball without a kernel is + not usable. + +3. **The Linux dev image.** `dist-x86_64` depends on the `linux-image` + make target, so the `container build` cache handles this automatically; + the first run takes a few minutes, subsequent runs are seconds. + +The dev image (`images/linux-dev/Dockerfile`) bundles Swiftly, the Static +Linux SDK, the Rust toolchain (with `cargo-zigbuild`), a prebuilt +`/opt/cross-x86_64-musl/` prefix containing zlib, xz, bzip2, libarchive, +libcap-ng, and libseccomp built static-musl for x86_64, and a sibling +`/opt/cross-x86_64-gnu/` prefix containing libcap-ng and libseccomp built +as glibc-dynamic shared libraries for virtiofsd's link step. +`scripts/build-musl-x86_64-deps.sh` and `scripts/build-glibc-x86_64-deps.sh` +produce these prefixes at image build time. + +## Running the build + +```sh +make dist-x86_64 +``` + +Drives `scripts/build-dist-x86_64.sh` inside the dev container via the +`linux_run` macro. The container bind-mounts the repo at `/workspace`, so +all build outputs land back on the host under `bin/dist-x86_64/`. + +## Pipeline + +The script runs five build stages plus a packaging stage. Each build stage +is gated by a freshness check (see [Rebuild gating](#rebuild-gating)) so +unchanged components are skipped on subsequent runs. + +1. **`cctl` cross-compile to x86_64-linux-musl.** + `swift build --swift-sdk x86_64-swift-linux-musl --product cctl`. Always + runs — this is the artifact under iteration, and Swift's incremental + build is a near-no-op when nothing changed. + +2. **`vminitd` + `vmexec` cross-compile to x86_64-linux-musl.** + `make -C vminitd LIBC=musl MUSL_ARCH=x86_64`. The guest agent and + process launcher; both run inside the VM as PID 1's children. + +3. **`cloud-hypervisor` cross-compile to x86_64-unknown-linux-musl.** + `cargo zigbuild --target x86_64-unknown-linux-musl --bin cloud-hypervisor` + from `.local/cloud-hypervisor`. + +4. **`virtiofsd` cross-compile to x86_64-unknown-linux-gnu.2.35.** + `cargo zigbuild --target x86_64-unknown-linux-gnu.2.35` from + `.local/virtiofsd`, with `scripts/patches/virtiofsd-skip-cap-drop-with-sandbox-none.patch` + applied first. The patch is idempotent — applied if missing, skipped if + already present, fails hard if it can't be applied cleanly. Unlike the + other three host binaries, virtiofsd is **glibc-dynamic**: it expects + the deployment host to provide glibc ≥ 2.35, `libseccomp.so.2`, and + `libcap-ng.so.0`. Link-time `.so` files come from + `/opt/cross-x86_64-gnu/`. + +5. **`initfs.ext4` packaging.** + A native aarch64 `cctl` is built (Swift release) and used as the packer: + `cctl rootfs create --vminitd … --vmexec …` writes a ready-to-mount + ext4 image with the x86_64 guest binaries inside. The native build + only runs when this stage runs. + +6. **Stage and tar.** Always runs. Lays out the staging tree at + `bin/dist-x86_64//`: + + ``` + / + ├── bin/ + │ ├── cctl + │ ├── cloud-hypervisor + │ └── virtiofsd + ├── kernel/ + │ └── vmlinuz-x86_64 # or vmlinux-x86_64, whichever was found + └── initfs.ext4 + ``` + + Then `tar -czf bin/.tar.gz`. + +## Rebuild gating + +By default, every stage skips when its output is up-to-date. Each freshness +check has a corresponding `REBUILD_*=1` environment variable that forces +the stage to rerun. + +| Stage | Skip condition | Force rebuild | +| --- | --- | --- | +| `cctl` x86 cross | (never skipped — always runs) | n/a | +| `vminitd` + `vmexec` | both binaries exist under `bin/dist-x86_64/` AND nothing under `vminitd/Sources/`, `vminitd/Package.swift`, or `Sources/Containerization/SandboxContext/` is newer than them | `REBUILD_VMINITD=1` | +| `cloud-hypervisor` | `bin/dist-x86_64/cloud-hypervisor` exists | `REBUILD_CH=1` | +| `virtiofsd` | `bin/dist-x86_64/virtiofsd` exists | `REBUILD_VIRTIOFSD=1` | +| `initfs.ext4` | exists AND is newer than both staged `vminitd` and `vmexec` (also implicitly skipped when `vminitd` was skipped) | `REBUILD_INITFS=1` | +| native aarch64 `cctl` | only built when `initfs.ext4` is being rebuilt | `REBUILD_INITFS=1` | +| stage tree + tar | (always runs) | n/a | + +The freshness checks intentionally use binary presence and source mtimes +rather than content hashing — fast to evaluate, easy to bypass with `touch` +or `rm`. There is no global "rebuild everything" switch by design; force +the specific component you want, or `rm -rf bin/dist-x86_64/` for a full +clean rebuild. + +`cloud-hypervisor` and `virtiofsd` only check binary presence (not source +mtime against `.local/`). The pinned-source convention assumes you opt +into rebuilds explicitly — the `REBUILD_CH=1` / `REBUILD_VIRTIOFSD=1` +escape hatches exist for exactly that case. Walking the full Rust source +tree on every run was the alternative; not worth the cost. + +### Common rebuild scenarios + +- **Iterating on host-side `cctl` or `Containerization` Swift code:** just + `make dist-x86_64`. Only the x86 cctl rebuild runs (and tar). +- **Touched `vminitd` source or the proto:** `REBUILD_VMINITD=1` is + picked up automatically by mtime; `make dist-x86_64`. `vminitd` and + `initfs.ext4` rebuild. +- **Pulled new `.local/cloud-hypervisor`:** `REBUILD_CH=1 make dist-x86_64`. +- **Pulled new `.local/virtiofsd`:** `REBUILD_VIRTIOFSD=1 make dist-x86_64`. +- **Suspect a stale artifact:** `rm -rf bin/dist-x86_64 && make dist-x86_64` + for a full clean rebuild. + +## Cross-compilation toolchain + +Two cross toolchains live side-by-side in the dev image. `cctl`, +`vminitd`/`vmexec`, and `cloud-hypervisor` target `x86_64-linux-musl` and +ship statically linked so the artifacts are host-libc independent. +`virtiofsd` targets `x86_64-linux-gnu.2.35` and ships dynamically linked; +the deployment host provides glibc, libseccomp, and libcap-ng. + +- **Swift** uses Apple's Static Linux SDK (`x86_64-swift-linux-musl`), + installed by `make cross-prep` at dev-image build time. The same SDK + is used for both `cctl` and `vminitd` cross-builds. +- **Rust C cross-compiler is Zig.** For musl stages, `zig cc -target + x86_64-linux-musl` is wrapped as `x86_64-linux-musl-{gcc,g++,ar,ranlib,strip}`. + For virtiofsd, parallel `x86_64-linux-gnu-*` wrappers dispatch to + `zig cc -target x86_64-linux-gnu.2.35`, plus an `x86_64-linux-gnu-ld` + wrapper backed by LLVM's `ld.lld` (apt-installed). The `ld` wrapper + is needed because libtool's shared-library detection probes the + linker with `-m elf_x86_64`; the host's aarch64 `/usr/bin/ld` + rejects that and would silently disable `.so` emission. The gnu + `gcc`/`g++` wrappers intercept `-print-prog-name=ld` so libtool + discovers the cross-ld wrapper instead of the host linker. The + pinned `.2.35` glibc baseline determines the minimum host glibc; + bumping it requires editing the wrapper scripts under + `images/linux-dev/wrappers/`. Zig was chosen over musl.cc / gcc + cross prebuilts because aarch64-hosted versions of those aren't + published. +- **Rust linker** is **not** set explicitly. `cargo-zigbuild` installs + its own linker wrapper that strips Rust's self-contained musl crt + files (which would otherwise collide with Zig's musl crt). Setting + `CARGO_TARGET_*_LINKER` ourselves overrides that and produces + duplicate-symbol link errors. +- **`pkg-config`** points at `/opt/cross-x86_64-musl/lib/pkgconfig` for + the musl stages; the virtiofsd block overrides it in a subshell to + point at `/opt/cross-x86_64-gnu/lib/pkgconfig` so `libseccomp-sys` + and `libcap-ng`'s `capng-sys` resolve against the glibc-dynamic + `.so` files, not the static-musl `.a` archives. The musl prefix uses + GNU ld linker scripts at `lib{seccomp,cap-ng}.so` to redirect + dynamic-link requests into the static archives; the gnu prefix ships + real shared libraries. + +The cross C dep prefixes are built by `scripts/build-musl-x86_64-deps.sh` +and `scripts/build-glibc-x86_64-deps.sh` during `make linux-image`. +Modifying either script invalidates that layer of the dev image and +triggers a rebuild on the next `make dist-x86_64`. + +## Troubleshooting + +- **`ERROR: missing .local/cloud-hypervisor source checkout`** — see + Prerequisites. There is no fetch target; clone the revision you want + pinned. +- **`ERROR: no x86_64 kernel found`** — run + `make -C kernel TARGET_ARCH=x86_64`. The build refuses to ship a + tarball without a kernel. +- **`ERROR: virtiofsd cap-drop patch does not apply cleanly`** — the + patch only applies to known-good upstream revisions of virtiofsd. If + you bumped `.local/virtiofsd` past that, refresh + `scripts/patches/virtiofsd-skip-cap-drop-with-sandbox-none.patch` + against the new revision. +- **Stale binary on the deployment host** — confirm the tarball SHA in + `bin/containerization-x86_64-.tar.gz` matches `git rev-parse + --short HEAD`. The script tags the tarball with `HEAD` at build time; + uncommitted changes ship under the same SHA as their parent commit. +- **Linker errors mentioning duplicate `crt*.o` symbols** — something is + setting `CARGO_TARGET_*_LINKER`. Unset it and let `cargo-zigbuild` + manage the linker. +- **`virtiofsd: error while loading shared libraries: libseccomp.so.2`** + (or `libcap-ng.so.0`) on the deployment host — install the system + packages (`apt install libseccomp2 libcap-ng0` on Debian/Ubuntu, + `dnf install libseccomp libcap-ng` on Fedora/RHEL). `virtiofsd` is + glibc-dynamic by design; the libs are not bundled in the tarball. +- **`virtiofsd: /lib/x86_64-linux-gnu/libc.so.6: version 'GLIBC_2.35' + not found`** — the deployment host's glibc is older than the build's + baseline. Either upgrade the host or rebuild with a lower baseline + by editing the `-target x86_64-linux-gnu.` arg in + `images/linux-dev/wrappers/x86_64-linux-gnu-{gcc,g++}` and the + `cargo zigbuild --target x86_64-unknown-linux-gnu.` line in + `scripts/build-dist-x86_64.sh`. diff --git a/images/linux-dev/Dockerfile b/images/linux-dev/Dockerfile index afde9e95..8d3a8f34 100644 --- a/images/linux-dev/Dockerfile +++ b/images/linux-dev/Dockerfile @@ -16,7 +16,12 @@ ARG SWIFT_VERSION=6.3 FROM swift:${SWIFT_VERSION}-noble RUN apt-get update \ - && apt-get install -y make libarchive-dev libbz2-dev liblzma-dev libssl-dev \ + && apt-get install -y --no-install-recommends \ + make \ + libarchive-dev \ + libbz2-dev \ + liblzma-dev \ + libssl-dev \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* @@ -25,3 +30,103 @@ ARG SWIFT_SDK_CHECKSUM RUN if [ -n "$SWIFT_SDK_URL" ]; then \ swift sdk install "$SWIFT_SDK_URL" --checksum "$SWIFT_SDK_CHECKSUM"; \ fi + +# x86_64 cross-build tooling +# --- +# Used by `make dist-x86_64` to produce a Linux x86_64 deployment +# tarball from this aarch64 dev container. Adds: Zig (used as a +# clang-based cross C/C++ compiler — its bundled musl + LLVM lets us +# target x86_64-linux-musl from any host arch), wrapper scripts that +# look like a standard `x86_64-linux-musl-*` toolchain so autotools' +# `--host=x86_64-linux-musl` works, parallel `x86_64-linux-gnu-*` +# wrappers pinned at glibc 2.35 for virtiofsd's dynamic build, +# Rust stable + both x86_64-unknown-linux-{musl,gnu} targets, +# autotools/gperf, static-musl builds of zlib, xz, bzip2, libarchive, +# libcap-ng, and libseccomp at /opt/cross-x86_64-musl, and +# glibc-dynamic builds of libcap-ng and libseccomp at +# /opt/cross-x86_64-gnu. + +# Build deps for static-musl C libraries. +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + autoconf \ + automake \ + libtool \ + gperf \ + curl \ + ca-certificates \ + xz-utils \ + build-essential \ + pkg-config \ + lld \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Zig (cross compiler). Pin to a specific release; bump as needed. +# Zig provides clang-based `zig cc -target ...` plus `zig ar`, `zig +# ranlib`, etc., bundled with musl libc — no need for a separate +# arch-specific gcc cross toolchain. +# +# The SHA256 is captured from the upstream release index +# (https://ziglang.org/download/) and verified before extraction so a +# tampered tarball can't poison the dev image. +ARG ZIG_VERSION=0.13.0 +ARG ZIG_AARCH64_SHA256=041ac42323837eb5624068acd8b00cd5777dac4cf91179e8dad7a7e90dd0c556 +RUN set -eux; \ + curl -fsSL -o /tmp/zig.tar.xz \ + "https://ziglang.org/download/${ZIG_VERSION}/zig-linux-aarch64-${ZIG_VERSION}.tar.xz"; \ + echo "${ZIG_AARCH64_SHA256} /tmp/zig.tar.xz" | sha256sum -c -; \ + tar -xJf /tmp/zig.tar.xz -C /opt; \ + rm /tmp/zig.tar.xz; \ + ln -s "/opt/zig-linux-aarch64-${ZIG_VERSION}" /opt/zig +ENV PATH="/opt/zig:${PATH}" + +# Wrapper scripts that look like conventional cross toolchains — +# `x86_64-linux-musl-{gcc,g++,cc,c++,ar,ranlib,strip}` for the musl +# side (used by cctl, vminitd, cloud-hypervisor) and parallel +# `x86_64-linux-gnu-*` wrappers for virtiofsd's glibc-dynamic build. +# All dispatch to zig under the hood. Lets autotools' +# `--host=x86_64-linux-{musl,gnu}` and cargo's +# CARGO_TARGET_X86_64_UNKNOWN_LINUX_{MUSL,GNU}_LINKER point at these +# names without knowing about Zig. +# +# The C/C++ wrappers filter out `--target=` args that cc-rs +# (used by Rust build scripts like zstd-sys, libseccomp-sys) adds — +# cc-rs emits the Rust-form triple (e.g. x86_64-unknown-linux-musl) +# which Zig refuses to parse, and we always set our own -target. +COPY images/linux-dev/wrappers/ /usr/local/bin/ +RUN chmod +x /usr/local/bin/x86_64-linux-musl-* \ + && ln -s x86_64-linux-musl-gcc /usr/local/bin/x86_64-linux-musl-cc \ + && ln -s x86_64-linux-musl-g++ /usr/local/bin/x86_64-linux-musl-c++ \ + && chmod +x /usr/local/bin/x86_64-linux-gnu-* \ + && ln -s x86_64-linux-gnu-gcc /usr/local/bin/x86_64-linux-gnu-cc \ + && ln -s x86_64-linux-gnu-g++ /usr/local/bin/x86_64-linux-gnu-c++ + +# Rust toolchain at the same path the existing build-cloud-hypervisor / +# build-virtiofsd targets expect, with the x86_64-musl cross target +# pre-installed so dist-x86_64 doesn't redo it on every run. Also +# installs cargo-zigbuild — a cargo subcommand that uses Zig as the +# linker and handles the Rust+musl+Zig integration (specifically: it +# strips Rust's self-contained musl crt files so they don't collide +# with Zig's, and wires up libunwind correctly). +RUN curl --proto '=https' --tlsv1.2 -fsSL https://sh.rustup.rs \ + | sh -s -- -y --default-toolchain stable --profile minimal \ + && /root/.cargo/bin/rustup target add x86_64-unknown-linux-musl \ + && /root/.cargo/bin/rustup target add x86_64-unknown-linux-gnu \ + && /root/.cargo/bin/cargo install --locked cargo-zigbuild +ENV PATH="/root/.cargo/bin:${PATH}" + +# Static-musl C libraries (libarchive + deps for cctl, libcap-ng + +# libseccomp for virtiofsd). Installs to /opt/cross-x86_64-musl; +# build-dist-x86_64.sh adds -L/-I flags pointing at that prefix when +# linking the host-side binaries. +COPY scripts/build-musl-x86_64-deps.sh /tmp/build-musl-x86_64-deps.sh +RUN /tmp/build-musl-x86_64-deps.sh && rm /tmp/build-musl-x86_64-deps.sh + +# Glibc-dynamic C libraries for virtiofsd. virtiofsd ships +# glibc-dynamic in the x86_64 tarball so deployment hosts use their +# system libseccomp.so.2 + libcap-ng.so.0; this prefix only provides +# the link-time .so + headers + pkg-config files. cloud-hypervisor +# and cctl stay musl-static and link against /opt/cross-x86_64-musl/. +COPY scripts/build-glibc-x86_64-deps.sh /tmp/build-glibc-x86_64-deps.sh +RUN /tmp/build-glibc-x86_64-deps.sh && rm /tmp/build-glibc-x86_64-deps.sh diff --git a/images/linux-dev/wrappers/x86_64-linux-gnu-ar b/images/linux-dev/wrappers/x86_64-linux-gnu-ar new file mode 100755 index 00000000..6c3646ab --- /dev/null +++ b/images/linux-dev/wrappers/x86_64-linux-gnu-ar @@ -0,0 +1,2 @@ +#!/bin/sh +exec /opt/zig/zig ar "$@" diff --git a/images/linux-dev/wrappers/x86_64-linux-gnu-g++ b/images/linux-dev/wrappers/x86_64-linux-gnu-g++ new file mode 100755 index 00000000..ea64733e --- /dev/null +++ b/images/linux-dev/wrappers/x86_64-linux-gnu-g++ @@ -0,0 +1,24 @@ +#!/bin/bash +# Wrapper that dispatches to `zig c++ -target x86_64-linux-gnu.2.35`. +# Mirrors x86_64-linux-musl-g++ but pins a glibc 2.35 baseline. +# +# See x86_64-linux-gnu-gcc for the rationale behind intercepting +# `-print-prog-name=ld`. +case " $* " in + *" -print-prog-name=ld "*) + echo /usr/local/bin/x86_64-linux-gnu-ld + exit 0 + ;; +esac + +# Filters out `--target=` args that cc-rs adds — cc-rs +# emits the Rust-form triple (x86_64-unknown-linux-gnu) which Zig +# can't parse. We always pass our own -target below. +args=() +for arg in "$@"; do + case "$arg" in + --target=*) ;; + *) args+=("$arg") ;; + esac +done +exec /opt/zig/zig c++ -target x86_64-linux-gnu.2.35 "${args[@]}" diff --git a/images/linux-dev/wrappers/x86_64-linux-gnu-gcc b/images/linux-dev/wrappers/x86_64-linux-gnu-gcc new file mode 100755 index 00000000..59f921e6 --- /dev/null +++ b/images/linux-dev/wrappers/x86_64-linux-gnu-gcc @@ -0,0 +1,31 @@ +#!/bin/bash +# Wrapper that dispatches to `zig cc -target x86_64-linux-gnu.2.35`. +# Mirrors x86_64-linux-musl-gcc but pins a glibc 2.35 baseline; the +# resulting binaries run on any host with glibc >= 2.35. +# +# Intercepts `-print-prog-name=ld` and returns our cross-ld wrapper — +# libtool uses that query to discover the linker before probing it +# with `-m elf_x86_64` to decide whether shared library builds are +# supported. zig cc passes the query through to the host `/usr/bin/ld`, +# which is aarch64-only and rejects the x86_64 emulation mode, causing +# libtool to silently disable shared-lib emission. Pointing libtool at +# our ld.lld-backed wrapper makes the probe succeed. +case " $* " in + *" -print-prog-name=ld "*) + echo /usr/local/bin/x86_64-linux-gnu-ld + exit 0 + ;; +esac + +# Filters out `--target=` args that cc-rs (Rust build +# scripts like libseccomp-sys, capng-sys) adds — cc-rs emits the +# Rust-form triple (x86_64-unknown-linux-gnu) which Zig can't parse. +# We always pass our own -target below, so cc-rs's is redundant. +args=() +for arg in "$@"; do + case "$arg" in + --target=*) ;; + *) args+=("$arg") ;; + esac +done +exec /opt/zig/zig cc -target x86_64-linux-gnu.2.35 "${args[@]}" diff --git a/images/linux-dev/wrappers/x86_64-linux-gnu-ld b/images/linux-dev/wrappers/x86_64-linux-gnu-ld new file mode 100755 index 00000000..15daeca7 --- /dev/null +++ b/images/linux-dev/wrappers/x86_64-linux-gnu-ld @@ -0,0 +1,9 @@ +#!/bin/sh +# Wrapper that points the autotools / libtool linker probe at +# LLVM's lld, which can produce x86_64 ELF output on an aarch64 +# host. The system `/usr/bin/ld` on the dev image is aarch64-only +# and rejects `-m elf_x86_64`, causing libtool to silently disable +# shared-library builds. libtool searches PATH for `-ld` +# before falling back to plain `ld`; this wrapper satisfies the +# search and unblocks shared-lib emission. +exec ld.lld "$@" diff --git a/images/linux-dev/wrappers/x86_64-linux-gnu-ranlib b/images/linux-dev/wrappers/x86_64-linux-gnu-ranlib new file mode 100755 index 00000000..5118caa7 --- /dev/null +++ b/images/linux-dev/wrappers/x86_64-linux-gnu-ranlib @@ -0,0 +1,2 @@ +#!/bin/sh +exec /opt/zig/zig ranlib "$@" diff --git a/images/linux-dev/wrappers/x86_64-linux-gnu-strip b/images/linux-dev/wrappers/x86_64-linux-gnu-strip new file mode 100755 index 00000000..052a9b9a --- /dev/null +++ b/images/linux-dev/wrappers/x86_64-linux-gnu-strip @@ -0,0 +1,2 @@ +#!/bin/sh +exec /opt/zig/zig strip "$@" diff --git a/images/linux-dev/wrappers/x86_64-linux-musl-ar b/images/linux-dev/wrappers/x86_64-linux-musl-ar new file mode 100755 index 00000000..6c3646ab --- /dev/null +++ b/images/linux-dev/wrappers/x86_64-linux-musl-ar @@ -0,0 +1,2 @@ +#!/bin/sh +exec /opt/zig/zig ar "$@" diff --git a/images/linux-dev/wrappers/x86_64-linux-musl-g++ b/images/linux-dev/wrappers/x86_64-linux-musl-g++ new file mode 100755 index 00000000..57b7ee3e --- /dev/null +++ b/images/linux-dev/wrappers/x86_64-linux-musl-g++ @@ -0,0 +1,14 @@ +#!/bin/bash +# Wrapper that dispatches to `zig c++ -target x86_64-linux-musl`. +# Filters out `--target=` args that cc-rs (Rust build +# scripts) adds — cc-rs emits the Rust-form triple +# (x86_64-unknown-linux-musl) which Zig can't parse. We always pass +# our own -target below, so cc-rs's is redundant. +args=() +for arg in "$@"; do + case "$arg" in + --target=*) ;; + *) args+=("$arg") ;; + esac +done +exec /opt/zig/zig c++ -target x86_64-linux-musl "${args[@]}" diff --git a/images/linux-dev/wrappers/x86_64-linux-musl-gcc b/images/linux-dev/wrappers/x86_64-linux-musl-gcc new file mode 100755 index 00000000..fd43c980 --- /dev/null +++ b/images/linux-dev/wrappers/x86_64-linux-musl-gcc @@ -0,0 +1,14 @@ +#!/bin/bash +# Wrapper that dispatches to `zig cc -target x86_64-linux-musl`. +# Filters out `--target=` args that cc-rs (Rust build +# scripts like zstd-sys) adds — cc-rs emits the Rust-form triple +# (x86_64-unknown-linux-musl) which Zig can't parse. We always pass +# our own -target below, so cc-rs's is redundant. +args=() +for arg in "$@"; do + case "$arg" in + --target=*) ;; + *) args+=("$arg") ;; + esac +done +exec /opt/zig/zig cc -target x86_64-linux-musl "${args[@]}" diff --git a/images/linux-dev/wrappers/x86_64-linux-musl-ranlib b/images/linux-dev/wrappers/x86_64-linux-musl-ranlib new file mode 100755 index 00000000..5118caa7 --- /dev/null +++ b/images/linux-dev/wrappers/x86_64-linux-musl-ranlib @@ -0,0 +1,2 @@ +#!/bin/sh +exec /opt/zig/zig ranlib "$@" diff --git a/images/linux-dev/wrappers/x86_64-linux-musl-strip b/images/linux-dev/wrappers/x86_64-linux-musl-strip new file mode 100755 index 00000000..052a9b9a --- /dev/null +++ b/images/linux-dev/wrappers/x86_64-linux-musl-strip @@ -0,0 +1,2 @@ +#!/bin/sh +exec /opt/zig/zig strip "$@" diff --git a/scripts/build-dist-x86_64.sh b/scripts/build-dist-x86_64.sh new file mode 100755 index 00000000..f0e0cf97 --- /dev/null +++ b/scripts/build-dist-x86_64.sh @@ -0,0 +1,243 @@ +#!/bin/bash +# Copyright © 2026 Apple Inc. and the Containerization project authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Builds the x86_64 deployment tarball. +# +# Runs INSIDE the aarch64 Linux dev container (invoked by +# `make dist-x86_64` via the linux_run macro). Cross-compiles all four +# host-side binaries — cctl, vminitd, cloud-hypervisor, virtiofsd — +# to x86_64-linux-musl, packs an initfs.ext4 with the x86_64 guest +# binaries inside, and emits bin/containerization-x86_64-.tar.gz. +# +# See docs/x86_64-build.md for full documentation: prerequisites, +# pipeline stages, toolchain rationale, and troubleshooting. +# +# Force-rebuild env vars (default = skip stages whose outputs are +# up-to-date): +# REBUILD_VMINITD=1 vminitd + vmexec +# REBUILD_INITFS=1 initfs.ext4 (and the native aarch64 cctl packer) +# REBUILD_CH=1 cloud-hypervisor +# REBUILD_VIRTIOFSD=1 virtiofsd +# cctl x86 cross always rebuilds (Swift incremental handles no-ops). + +set -euo pipefail + +cd /workspace + +GIT_SHA=$(git rev-parse --short HEAD 2>/dev/null || echo unknown) +DIST_NAME="containerization-x86_64-${GIT_SHA}" +DIST_DIR="bin/dist-x86_64" +STAGE="${DIST_DIR}/${DIST_NAME}" +TGZ="bin/${DIST_NAME}.tar.gz" + +CROSS_PREFIX=/opt/cross-x86_64-musl +GNU_PREFIX=/opt/cross-x86_64-gnu +PATCH=/workspace/scripts/patches/virtiofsd-skip-cap-drop-with-sandbox-none.patch + +# Cargo cross env for the musl stages (cctl, vminitd, cloud-hypervisor). +# CC/CXX/AR are used by cc-rs (Rust build scripts that compile C, e.g. +# zstd-sys, libseccomp-sys, capng) — points them at the Zig-backed +# wrappers. Linker is intentionally NOT set here: cargo-zigbuild +# installs its own linker wrapper that strips Rust's self-contained +# musl crt files (which would otherwise collide with Zig's musl crt). +# Setting CARGO_TARGET_*_LINKER ourselves would override that and +# cause duplicate-symbol link errors. +# pkg-config (used by libseccomp-sys and libcap-ng's capng-sys) points +# at the static-musl prefix, not the aarch64 host. +# PKG_CONFIG_ALL_STATIC=1 makes pkg-config-rs emit +# `rustc-link-lib=static=...` for resolved libs — required because +# the C libs at $CROSS_PREFIX are static-only (.a, no .so), so the +# default dynamic link would fail to find the .so. +# +# virtiofsd has its own glibc-dynamic env block below; it overrides +# PKG_CONFIG_LIBDIR / SYSROOT_DIR in a subshell so the musl values +# stay correct for cloud-hypervisor. +. /root/.cargo/env +export CC_x86_64_unknown_linux_musl=x86_64-linux-musl-gcc +export CXX_x86_64_unknown_linux_musl=x86_64-linux-musl-g++ +export AR_x86_64_unknown_linux_musl=x86_64-linux-musl-ar +export PKG_CONFIG_LIBDIR="${CROSS_PREFIX}/lib/pkgconfig" +export PKG_CONFIG_SYSROOT_DIR="${CROSS_PREFIX}" +export PKG_CONFIG_ALLOW_CROSS=1 +# Add the static-musl cross prefix to rustc's native library search +# path so the linker finds the libseccomp.so / libcap-ng.so linker +# scripts that build-musl-x86_64-deps.sh installs alongside the .a +# files. The scripts redirect resolution to the static archives. +export CARGO_TARGET_X86_64_UNKNOWN_LINUX_MUSL_RUSTFLAGS="-L native=${CROSS_PREFIX}/lib" + +# Pre-flight checks +[ -f .local/cloud-hypervisor/Cargo.toml ] || { + echo "ERROR: missing .local/cloud-hypervisor source checkout." >&2 + echo " git clone -b v52.0 https://github.com/cloud-hypervisor/cloud-hypervisor .local/cloud-hypervisor" >&2 + exit 1 +} +[ -f .local/virtiofsd/Cargo.toml ] || { + echo "ERROR: missing .local/virtiofsd source checkout." >&2 + echo " git clone https://gitlab.com/virtio-fs/virtiofsd .local/virtiofsd" >&2 + exit 1 +} + +# Kernel candidates (prefer the compressed bzImage produced by +# `make -C kernel TARGET_ARCH=x86_64`, fall back to an uncompressed +# vmlinux). The kernel is required — a tarball without one isn't +# usable, so fail hard rather than silently producing one. +KERNEL_SRC= +for candidate in kernel/vmlinuz-x86_64 kernel/vmlinux-x86_64; do + if [ -f "$candidate" ]; then + if file "$candidate" | grep -qE 'x86 boot|x86-64'; then + KERNEL_SRC=$candidate + break + else + echo "ERROR: $candidate exists but is not x86_64." >&2 + file "$candidate" >&2 + exit 1 + fi + fi +done +if [ -z "${KERNEL_SRC}" ]; then + echo "ERROR: no x86_64 kernel found at kernel/vmlinuz-x86_64 or kernel/vmlinux-x86_64." >&2 + echo " build one with 'make -C kernel TARGET_ARCH=x86_64'." >&2 + exit 1 +fi + +mkdir -p "${DIST_DIR}" + +# Decide which steps need to run before doing any work, so log messages +# match what's actually happening. + +NEED_VMINITD=1 +if [ "${REBUILD_VMINITD:-0}" != "1" ] \ + && [ -x "${DIST_DIR}/vminitd" ] && [ -x "${DIST_DIR}/vmexec" ] \ + && [ -z "$(find vminitd/Sources vminitd/Package.swift Sources/Containerization/SandboxContext \ + -newer "${DIST_DIR}/vminitd" -print -quit 2>/dev/null)" ] \ + && [ -z "$(find vminitd/Sources vminitd/Package.swift Sources/Containerization/SandboxContext \ + -newer "${DIST_DIR}/vmexec" -print -quit 2>/dev/null)" ]; then + NEED_VMINITD=0 +fi + +NEED_INITFS=1 +if [ "${REBUILD_INITFS:-0}" != "1" ] \ + && [ "${NEED_VMINITD}" = "0" ] \ + && [ -f "${DIST_DIR}/initfs.ext4" ] \ + && [ "${DIST_DIR}/initfs.ext4" -nt "${DIST_DIR}/vminitd" ] \ + && [ "${DIST_DIR}/initfs.ext4" -nt "${DIST_DIR}/vmexec" ]; then + NEED_INITFS=0 +fi + +NEED_CH=1 +if [ "${REBUILD_CH:-0}" != "1" ] && [ -x "${DIST_DIR}/cloud-hypervisor" ]; then + NEED_CH=0 +fi + +NEED_VIRTIOFSD=1 +if [ "${REBUILD_VIRTIOFSD:-0}" != "1" ] && [ -x "${DIST_DIR}/virtiofsd" ]; then + NEED_VIRTIOFSD=0 +fi + +echo "==> Cross-compiling cctl to x86_64-linux-musl" +swift build -c release \ + --swift-sdk x86_64-swift-linux-musl \ + --product cctl \ + -Xswiftc -warnings-as-errors \ + -Xlinker -L"${CROSS_PREFIX}/lib" \ + --disable-automatic-resolution +CCTL_X86_64_BIN="$(swift build -c release --swift-sdk x86_64-swift-linux-musl --show-bin-path)/cctl" +install -m 755 "${CCTL_X86_64_BIN}" "${DIST_DIR}/cctl" + +if [ "${NEED_VMINITD}" = "1" ]; then + echo "==> Cross-compiling vminitd + vmexec to x86_64-linux-musl" + make -C vminitd \ + LIBC=musl \ + MUSL_ARCH=x86_64 \ + BUILD_CONFIGURATION=release \ + INSTALL_DIR="$(pwd)/${DIST_DIR}" +else + echo "==> Reusing staged vminitd + vmexec (sources unchanged; set REBUILD_VMINITD=1 to force)" +fi + +if [ "${NEED_CH}" = "1" ]; then + echo "==> Cross-compiling cloud-hypervisor to x86_64-unknown-linux-musl" + ( + cd .local/cloud-hypervisor + cargo zigbuild --release --target x86_64-unknown-linux-musl --bin cloud-hypervisor + ) + install -m 755 \ + ".local/cloud-hypervisor/target/x86_64-unknown-linux-musl/release/cloud-hypervisor" \ + "${DIST_DIR}/cloud-hypervisor" +else + echo "==> Reusing staged cloud-hypervisor (set REBUILD_CH=1 to force)" +fi + +if [ "${NEED_VIRTIOFSD}" = "1" ]; then + echo "==> Cross-compiling virtiofsd to x86_64-unknown-linux-gnu.2.35 (glibc-dynamic, with cap-drop patch)" + # virtiofsd ships glibc-dynamic: the deployment host provides + # libseccomp.so.2 and libcap-ng.so.0 at runtime. Subshell scopes + # the gnu env so it doesn't bleed into other stages. + ( + export CC_x86_64_unknown_linux_gnu=x86_64-linux-gnu-gcc + export CXX_x86_64_unknown_linux_gnu=x86_64-linux-gnu-g++ + export AR_x86_64_unknown_linux_gnu=x86_64-linux-gnu-ar + export PKG_CONFIG_LIBDIR="${GNU_PREFIX}/lib/pkgconfig" + export PKG_CONFIG_SYSROOT_DIR="${GNU_PREFIX}" + export CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS="-L native=${GNU_PREFIX}/lib" + cd .local/virtiofsd + if git apply --check "${PATCH}" 2>/dev/null; then + git apply "${PATCH}" + echo "applied virtiofsd cap-drop patch" + elif git apply --reverse --check "${PATCH}" 2>/dev/null; then + echo "virtiofsd cap-drop patch already applied" + else + echo "ERROR: virtiofsd cap-drop patch does not apply cleanly" >&2 + exit 1 + fi + cargo zigbuild --release --target x86_64-unknown-linux-gnu.2.35 + ) + install -m 755 \ + ".local/virtiofsd/target/x86_64-unknown-linux-gnu/release/virtiofsd" \ + "${DIST_DIR}/virtiofsd" +else + echo "==> Reusing staged virtiofsd (set REBUILD_VIRTIOFSD=1 to force)" +fi + +if [ "${NEED_INITFS}" = "1" ]; then + echo "==> Building aarch64 cctl natively (used to pack initfs.ext4)" + swift build -c release --product cctl -Xswiftc -warnings-as-errors --disable-automatic-resolution + NATIVE_CCTL="$(swift build -c release --show-bin-path)/cctl" + + echo "==> Building initfs.ext4 with x86_64 guest binaries" + rm -f "${DIST_DIR}/init.rootfs.tar.gz" "${DIST_DIR}/initfs.ext4" + "${NATIVE_CCTL}" rootfs create \ + --vminitd "${DIST_DIR}/vminitd" \ + --vmexec "${DIST_DIR}/vmexec" \ + --ext4 "${DIST_DIR}/initfs.ext4" \ + --label org.opencontainers.image.source=https://github.com/apple/containerization \ + --image vminit-x86_64:latest \ + "${DIST_DIR}/init.rootfs.tar.gz" +else + echo "==> Reusing staged initfs.ext4 (vminitd/vmexec unchanged; set REBUILD_INITFS=1 to force)" +fi + +echo "==> Staging tree at ${STAGE} and packaging" +rm -rf "${STAGE}" +mkdir -p "${STAGE}/bin" +install -m 755 "${DIST_DIR}/cctl" "${STAGE}/bin/cctl" +install -m 755 "${DIST_DIR}/cloud-hypervisor" "${STAGE}/bin/cloud-hypervisor" +install -m 755 "${DIST_DIR}/virtiofsd" "${STAGE}/bin/virtiofsd" +mkdir -p "${STAGE}/kernel" +cp "${KERNEL_SRC}" "${STAGE}/kernel/$(basename "${KERNEL_SRC}")" +cp "${DIST_DIR}/initfs.ext4" "${STAGE}/initfs.ext4" +rm -f "${TGZ}" +tar -czf "${TGZ}" -C "${DIST_DIR}" "${DIST_NAME}" +echo "wrote ${TGZ}" diff --git a/scripts/build-glibc-x86_64-deps.sh b/scripts/build-glibc-x86_64-deps.sh new file mode 100755 index 00000000..751815c4 --- /dev/null +++ b/scripts/build-glibc-x86_64-deps.sh @@ -0,0 +1,138 @@ +#!/bin/bash +# Copyright © 2026 Apple Inc. and the Containerization project authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Builds x86_64-linux-gnu shared-library versions of libseccomp and +# libcap-ng (the two C libs virtiofsd links against) and installs them +# at /opt/cross-x86_64-gnu. Sibling of build-musl-x86_64-deps.sh; the +# Dockerfile invokes both at image build time. +# +# Why a separate prefix: virtiofsd ships glibc-dynamic in the x86_64 +# tarball (see docs/x86_64-build.md) so deployment hosts can use their +# system libseccomp.so.2 + libcap-ng.so.0; everything else in the tarball +# stays musl-static. Mixing static-musl .a archives and dynamic-gnu .so +# files at one prefix confuses pkg-config and the linker, so they live +# apart. +# +# The glibc baseline is pinned at 2.35 (Ubuntu 22.04 / Debian 12 / RHEL 9 +# era) via the Zig wrapper scripts at /usr/local/bin/x86_64-linux-gnu-*, +# which dispatch to `zig cc -target x86_64-linux-gnu.2.35`. Bump the +# wrapper triple if the baseline moves. +# +# Only two libraries here, both shared: virtiofsd's other build-script +# deps (zstd, etc.) come from cargo crates that vendor C sources, and +# cctl's libarchive lives on the musl side. + +set -euo pipefail + +HOST=x86_64-linux-gnu +PREFIX=/opt/cross-x86_64-gnu + +mkdir -p "${PREFIX}/lib" "${PREFIX}/include" + +export CC="${HOST}-gcc" +export CXX="${HOST}-g++" +export AR="${HOST}-ar" +export RANLIB="${HOST}-ranlib" +export STRIP="${HOST}-strip" +export PKG_CONFIG_PATH="${PREFIX}/lib/pkgconfig" + +WORK=$(mktemp -d) +trap 'rm -rf "${WORK}"' EXIT +cd "${WORK}" + +JOBS="$(nproc)" + +# fetch_extract URL ARCHIVE +# +# Downloads URL to ARCHIVE then extracts. Relies on HTTPS for transport +# integrity; no SHA pinning. The other build-time deps (apt packages, +# Zig, Rust toolchain) trust the same. +fetch_extract() { + local url=$1 archive=$2 + curl -fsSL -o "${archive}" "${url}" + tar -xf "${archive}" +} + +# Sanity check: cross compiler produces clean output for a trivial +# program. autotools / libtool turn unexpected compiler chatter into +# baffling configure errors; surfacing it here gives a real message. +echo "==> cross compiler sanity check" +"${CC}" --version +cat > "${WORK}/sanity.c" <<'EOF' +void foo(void) {} +EOF +out=$("${CC}" -c "${WORK}/sanity.c" -o "${WORK}/sanity.o" 2>&1) || { + echo "ERROR: cross compiler failed on trivial test.c:" >&2 + echo "${out}" >&2 + exit 1 +} +if [ -n "${out}" ]; then + echo "WARNING: cross compiler emitted output on a clean compile:" >&2 + echo "${out}" >&2 +fi + +# libcap-ng — github auto-archive (release artifacts for older tags +# aren't always uploaded). +LIBCAP_NG_VERSION=0.8.5 +fetch_extract "https://github.com/stevegrubb/libcap-ng/archive/refs/tags/v${LIBCAP_NG_VERSION}.tar.gz" libcap-ng.tar.gz +( + cd "libcap-ng-${LIBCAP_NG_VERSION}" + # GNU automake's strict mode requires these standard files to exist; + # the auto-archive tarball doesn't ship NEWS. Cheaper than + # configure.ac surgery. + touch NEWS README AUTHORS ChangeLog + autoreconf -i + ./configure --host="${HOST}" --prefix="${PREFIX}" \ + --disable-static --enable-shared \ + --without-python --without-python3 + make -j"${JOBS}" + make install +) + +# libseccomp — needs gperf at build time (installed via the Dockerfile +# alongside the musl deps). Built shared here. +LIBSECCOMP_VERSION=2.5.5 +fetch_extract "https://github.com/seccomp/libseccomp/releases/download/v${LIBSECCOMP_VERSION}/libseccomp-${LIBSECCOMP_VERSION}.tar.gz" libseccomp.tar.gz +( + cd "libseccomp-${LIBSECCOMP_VERSION}" + ./configure --host="${HOST}" --prefix="${PREFIX}" \ + --disable-static --enable-shared \ + --disable-python + make -j"${JOBS}" + make install +) + +# Drop libtool .la files — they encode build-host paths and confuse +# downstream consumers; the .so + pkg-config .pc files are sufficient. +rm -f "${PREFIX}/lib"/*.la + +# Force the unversioned dev symlinks (libseccomp.so, libcap-ng.so). +# libtool conservatively omits these when cross-compiling, but rustc's +# link step looks for them by unversioned name; without them the +# link fails with "unable to find dynamic system library 'seccomp'". +# Idempotent — `ln -sf` overwrites any existing symlink, and the loop +# picks up whatever versioned files libtool actually installed. +for stem in libseccomp libcap-ng; do + versioned=$(ls "${PREFIX}/lib/${stem}.so."* 2>/dev/null | sort -V | head -n1) + if [ -n "${versioned}" ]; then + ln -sf "$(basename "${versioned}")" "${PREFIX}/lib/${stem}.so" + else + echo "ERROR: no ${stem}.so.* found in ${PREFIX}/lib after install" >&2 + ls -la "${PREFIX}/lib" >&2 + exit 1 + fi +done + +echo "glibc-dynamic x86_64 C deps installed under ${PREFIX}" diff --git a/scripts/build-musl-x86_64-deps.sh b/scripts/build-musl-x86_64-deps.sh new file mode 100755 index 00000000..a0a44ab2 --- /dev/null +++ b/scripts/build-musl-x86_64-deps.sh @@ -0,0 +1,188 @@ +#!/bin/bash +# Copyright © 2026 Apple Inc. and the Containerization project authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Builds static-musl x86_64 versions of the C libraries that cctl and +# virtiofsd link against, and installs them at /opt/cross-x86_64-musl +# (a standalone prefix — the Zig-based cross compiler has no +# traditional sysroot, so the build-dist-x86_64.sh script and the +# cargo cross flow add explicit -L / -I flags pointing here). +# +# Invoked once at dev-image build time; the resulting layer is cached +# until this script changes. Adding/removing a library here is the only +# reason to invalidate it. + +set -euo pipefail + +HOST=x86_64-linux-musl +PREFIX=/opt/cross-x86_64-musl + +mkdir -p "${PREFIX}/lib" "${PREFIX}/include" + +export CC="${HOST}-gcc" +export CXX="${HOST}-g++" +export AR="${HOST}-ar" +export RANLIB="${HOST}-ranlib" +export STRIP="${HOST}-strip" +export PKG_CONFIG_PATH="${PREFIX}/lib/pkgconfig" + +WORK=$(mktemp -d) +trap 'rm -rf "${WORK}"' EXIT +cd "${WORK}" + +JOBS="$(nproc)" + +# fetch_extract URL ARCHIVE +# +# Downloads URL to ARCHIVE then extracts. Relies on HTTPS for transport +# integrity; no SHA pinning. The other build-time deps (apt packages, +# Zig, Rust toolchain) trust the same. +fetch_extract() { + local url=$1 archive=$2 + curl -fsSL -o "${archive}" "${url}" + tar -xf "${archive}" +} + +# Sanity check: cross compiler is on PATH and produces clean output +# for a trivial program. If it doesn't, zlib's configure script (which +# treats any stderr/stdout from a test compile as evidence of -Werror) +# will fail with a misleading "Compiler error reporting is too harsh" +# error. Surfacing this here gives us a real error message instead. +echo "==> cross compiler sanity check" +"${CC}" --version +cat > "${WORK}/sanity.c" <<'EOF' +void foo(void) {} +EOF +out=$("${CC}" -c "${WORK}/sanity.c" -o "${WORK}/sanity.o" 2>&1) || { + echo "ERROR: cross compiler failed on trivial test.c:" >&2 + echo "${out}" >&2 + exit 1 +} +if [ -n "${out}" ]; then + echo "WARNING: cross compiler emitted output on a clean compile:" >&2 + echo "${out}" >&2 + echo "(this will trip up zlib's configure script — see fix below)" >&2 +fi + +# zlib — provides libz.a. Its configure does not take --host, so the +# CC env var is what selects the cross compiler. +ZLIB_VERSION=1.3.1 +fetch_extract "https://zlib.net/fossils/zlib-${ZLIB_VERSION}.tar.gz" zlib.tar.gz +( + cd "zlib-${ZLIB_VERSION}" + if ! ./configure --static --prefix="${PREFIX}"; then + echo "==================== zlib configure.log ====================" >&2 + [ -f configure.log ] && cat configure.log >&2 + echo "============================================================" >&2 + exit 1 + fi + make -j"${JOBS}" + make install +) + +# xz — provides liblzma.a. +XZ_VERSION=5.6.4 +fetch_extract "https://github.com/tukaani-project/xz/releases/download/v${XZ_VERSION}/xz-${XZ_VERSION}.tar.gz" xz.tar.gz +( + cd "xz-${XZ_VERSION}" + ./configure --host="${HOST}" --prefix="${PREFIX}" \ + --enable-static --disable-shared \ + --disable-doc --disable-scripts \ + --disable-xz --disable-xzdec --disable-lzmadec --disable-lzmainfo \ + --disable-lzma-links + make -j"${JOBS}" + make install +) + +# bzip2 — no autotools, drives a plain Makefile. Build only the static +# library; the bzip2 CLI tools are not needed. +BZIP2_VERSION=1.0.8 +fetch_extract "https://sourceware.org/pub/bzip2/bzip2-${BZIP2_VERSION}.tar.gz" bzip2.tar.gz +( + cd "bzip2-${BZIP2_VERSION}" + make CC="${CC}" AR="${AR}" RANLIB="${RANLIB}" libbz2.a -j"${JOBS}" + install -m 644 libbz2.a "${PREFIX}/lib/" + install -m 644 bzlib.h "${PREFIX}/include/" +) + +# libarchive — needs zlib + lzma + bz2 (built above). Disable optional +# deps that pull in extra toolchain weight (xml2, iconv, zstd, lz4, +# openssl, libb2). cctl uses libarchive for tar/ext layouts; the +# disabled formats are not used at runtime. +LIBARCHIVE_VERSION=3.7.7 +fetch_extract "https://github.com/libarchive/libarchive/releases/download/v${LIBARCHIVE_VERSION}/libarchive-${LIBARCHIVE_VERSION}.tar.gz" libarchive.tar.gz +( + cd "libarchive-${LIBARCHIVE_VERSION}" + ./configure --host="${HOST}" --prefix="${PREFIX}" \ + --enable-static --disable-shared \ + --disable-bsdtar --disable-bsdcat --disable-bsdcpio --disable-bsdunzip \ + --without-xml2 --without-iconv --without-zstd --without-lz4 \ + --without-openssl --without-libb2 \ + CPPFLAGS="-I${PREFIX}/include" \ + LDFLAGS="-L${PREFIX}/lib" + make -j"${JOBS}" + make install +) + +# libcap-ng — github auto-archive (release artifacts for older tags +# aren't always uploaded; the auto-archive URL is always available +# for any tag, but doesn't ship a pre-generated configure script, so +# we autoreconf it ourselves). +LIBCAP_NG_VERSION=0.8.5 +fetch_extract "https://github.com/stevegrubb/libcap-ng/archive/refs/tags/v${LIBCAP_NG_VERSION}.tar.gz" libcap-ng.tar.gz +( + cd "libcap-ng-${LIBCAP_NG_VERSION}" + # GNU automake's default (strict) mode requires these standard + # files to exist; the auto-archive tarball doesn't ship NEWS. + # Cheaper than configure.ac surgery. + touch NEWS README AUTHORS ChangeLog + autoreconf -i + ./configure --host="${HOST}" --prefix="${PREFIX}" \ + --enable-static --disable-shared \ + --without-python --without-python3 + make -j"${JOBS}" + make install +) + +# libseccomp — needs gperf at build time (installed via apt above). +LIBSECCOMP_VERSION=2.5.5 +fetch_extract "https://github.com/seccomp/libseccomp/releases/download/v${LIBSECCOMP_VERSION}/libseccomp-${LIBSECCOMP_VERSION}.tar.gz" libseccomp.tar.gz +( + cd "libseccomp-${LIBSECCOMP_VERSION}" + ./configure --host="${HOST}" --prefix="${PREFIX}" \ + --enable-static --disable-shared \ + --disable-python + make -j"${JOBS}" + make install +) + +# Linker-script `.so` shims for libseccomp + libcap-ng. The Rust +# `-sys` crates emit plain `cargo:rustc-link-lib=seccomp` (no +# static= prefix), and they don't declare `links = "..."` in their +# Cargo.toml, so cargo's build-script override can't match them. +# Instead we hand the linker fake `.so` files that are actually GNU +# ld linker scripts pointing at the static archive — when ld resolves +# `-lseccomp` to libseccomp.so it reads the script and pulls in +# libseccomp.a as if statically linked. Works regardless of -Bstatic +# vs -Bdynamic state and avoids needing to patch the -sys crates. +cat > "${PREFIX}/lib/libseccomp.so" < "${PREFIX}/lib/libcap-ng.so" </dev/null || echo "") export BUILD_TIME := $(shell date -u +%Y-%m-%dT%H:%M:%SZ) SWIFT_WARNING_CONFIG := $(if $(filter-out false,$(WARNINGS_AS_ERRORS)),-Xswiftc -warnings-as-errors) +# MUSL_ARCH selects which Static Linux SDK triple to build against +# ($(MUSL_ARCH)-swift-linux-musl). Defaults to the host architecture +# so the in-tree aarch64 flow works unchanged, but callers can override +# (e.g. `make MUSL_ARCH=x86_64` for the dist-x86_64 cross-build path). +ifndef MUSL_ARCH UNAME_M := $(shell uname -m) ifeq ($(UNAME_M),arm64) MUSL_ARCH := aarch64 @@ -26,6 +31,7 @@ MUSL_ARCH := aarch64 else MUSL_ARCH := x86_64 endif +endif LIBC ?= musl ifeq ($(LIBC),musl) @@ -58,16 +64,21 @@ endif .DEFAULT_GOAL := all +# INSTALL_DIR is where built binaries land. Defaults to ./bin so the +# in-tree aarch64 flow is unchanged. The dist-x86_64 cross-build +# overrides this to keep its artifacts out of vminitd/bin/. +INSTALL_DIR ?= ./bin + .PHONY: all -all: +all: @echo Building vminitd and vmexec... - @mkdir -p ./bin/ - @rm -f ./bin/vminitd - @rm -f ./bin/vmexec + @mkdir -p $(INSTALL_DIR) + @rm -f $(INSTALL_DIR)/vminitd + @rm -f $(INSTALL_DIR)/vmexec @$(SWIFT) --version @$(SWIFT) build -c $(BUILD_CONFIGURATION) $(SWIFT_CONFIGURATION) - @install "$(BUILD_BIN_DIR)/vminitd" ./bin/ - @install "$(BUILD_BIN_DIR)/vmexec" ./bin/ + @install "$(BUILD_BIN_DIR)/vminitd" $(INSTALL_DIR)/ + @install "$(BUILD_BIN_DIR)/vmexec" $(INSTALL_DIR)/ .PHONY: cross-prep cross-prep: swift linux-sdk diff --git a/vminitd/Sources/VminitdCore/Server+GRPC.swift b/vminitd/Sources/VminitdCore/Server+GRPC.swift index 6659199c..ff6d1c5f 100644 --- a/vminitd/Sources/VminitdCore/Server+GRPC.swift +++ b/vminitd/Sources/VminitdCore/Server+GRPC.swift @@ -951,7 +951,7 @@ extension Initd: Com_Apple_Containerization_Sandbox_V3_SandboxContext.SimpleServ if error is RPCError { throw error } - throw RPCError(code: .internalError, message: "createProcess", cause: error) + throw RPCError(code: .internalError, message: "createProcess: \(error)", cause: error) } }