From 59437391a07fe3624270b2151f2f257f614e2540 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sun, 3 May 2026 21:36:20 -0700 Subject: [PATCH 1/3] add gpu support --- .../scripts/openshell-vm-sandbox-init.sh | 16 + crates/openshell-driver-vm/src/driver.rs | 37 +- crates/openshell-driver-vm/src/rootfs.rs | 561 +++++++++++++++++- crates/openshell-driver-vm/src/runtime.rs | 2 + .../runtime/kernel/openshell.kconfig | 22 + sandboxes/nvidia-gpu/Dockerfile | 84 +++ sandboxes/nvidia-gpu/README.md | 98 +++ sandboxes/nvidia-gpu/versions.env | 6 + tasks/scripts/gateway-vm.sh | 19 +- tasks/scripts/vm/build-nvidia-modules.sh | 176 ++++++ tasks/scripts/vm/build-supervisor-bundle.sh | 29 +- tasks/vm.toml | 4 + 12 files changed, 1037 insertions(+), 17 deletions(-) create mode 100644 sandboxes/nvidia-gpu/Dockerfile create mode 100644 sandboxes/nvidia-gpu/README.md create mode 100644 sandboxes/nvidia-gpu/versions.env create mode 100755 tasks/scripts/vm/build-nvidia-modules.sh diff --git a/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh index b61fd4900..e590be195 100644 --- a/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh +++ b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh @@ -234,6 +234,17 @@ create_gpu_device_nodes_mknod() { setup_gpu() { ts "GPU_ENABLED=true — initializing GPU passthrough" + # Kernel modules are built for a specific guest kernel version. + # If the running kernel doesn't match, depmod/modprobe will silently fail. + local expected_kver="6.12.76" + local actual_kver + actual_kver="$(uname -r)" + if [ "${actual_kver}" != "${expected_kver}" ]; then + ts "WARNING: kernel version mismatch: expected ${expected_kver}, got ${actual_kver}" + ts " GPU modules are installed under lib/modules/${expected_kver}/" + ts " modprobe may fail to find them" + fi + if ! 
command -v modprobe >/dev/null 2>&1; then ts "FATAL: modprobe not found; cannot load nvidia kernel modules" return 1 @@ -249,6 +260,11 @@ setup_gpu() { fi fi + ts "generating module dependency index" + if ! depmod -a "$(uname -r)" 2>/dev/null; then + ts "WARNING: depmod failed; modprobe may not find modules" + fi + ts "loading nvidia kernel modules" modprobe nvidia || { ts "FATAL: modprobe nvidia failed"; return 1; } modprobe nvidia_uvm 2>/dev/null || true diff --git a/crates/openshell-driver-vm/src/driver.rs b/crates/openshell-driver-vm/src/driver.rs index d79e5d922..84338c85c 100644 --- a/crates/openshell-driver-vm/src/driver.rs +++ b/crates/openshell-driver-vm/src/driver.rs @@ -5,8 +5,8 @@ use crate::gpu::{ GpuInventory, SubnetAllocator, allocate_vsock_cid, mac_from_sandbox_id, tap_device_name, }; use crate::rootfs::{ - create_rootfs_archive_from_dir, extract_rootfs_archive_to, - prepare_sandbox_rootfs_from_image_root, sandbox_guest_init_path, + create_rootfs_archive_from_dir, extract_rootfs_archive_to, inject_gpu_modules, + prepare_sandbox_rootfs_from_image_root, refresh_runtime_artifacts, sandbox_guest_init_path, }; use bollard::Docker; use bollard::errors::Error as BollardError; @@ -419,6 +419,28 @@ impl VmDriver { return Err(err); } }; + if is_gpu { + let rootfs_for_gpu = rootfs.clone(); + let driver_state_dir = self.config.state_dir.clone(); + if let Err(err) = tokio::task::spawn_blocking(move || { + inject_gpu_modules(&rootfs_for_gpu, &driver_state_dir) + }) + .await + .map_err(|e| Status::internal(format!("GPU module injection panicked: {e}")))? 
+ { + warn!( + sandbox_id = %sandbox.id, + error = %err, + "vm driver: GPU module injection failed" + ); + let _ = tokio::fs::remove_dir_all(&state_dir).await; + return Err(Status::failed_precondition(format!( + "GPU module injection failed: {err}" + ))); + } + info!(sandbox_id = %sandbox.id, "vm driver: GPU modules injected into rootfs"); + } + if let Some(tls_paths) = tls_paths.as_ref() && let Err(err) = prepare_guest_tls_materials(&rootfs, tls_paths).await { @@ -738,10 +760,13 @@ impl VmDriver { .await?; let archive_path = image_cache_rootfs_archive(&self.config.state_dir, &image_identity); let rootfs_dest = rootfs.to_path_buf(); - tokio::task::spawn_blocking(move || extract_rootfs_archive_to(&archive_path, &rootfs_dest)) - .await - .map_err(|err| Status::internal(format!("sandbox rootfs extraction panicked: {err}")))? - .map_err(|err| Status::internal(format!("extract sandbox rootfs failed: {err}")))?; + tokio::task::spawn_blocking(move || { + extract_rootfs_archive_to(&archive_path, &rootfs_dest)?; + refresh_runtime_artifacts(&rootfs_dest) + }) + .await + .map_err(|err| Status::internal(format!("sandbox rootfs extraction panicked: {err}")))? + .map_err(|err| Status::internal(format!("extract sandbox rootfs failed: {err}")))?; Ok(image_identity) } diff --git a/crates/openshell-driver-vm/src/rootfs.rs b/crates/openshell-driver-vm/src/rootfs.rs index 5ea687d15..cc4d21c22 100644 --- a/crates/openshell-driver-vm/src/rootfs.rs +++ b/crates/openshell-driver-vm/src/rootfs.rs @@ -4,7 +4,7 @@ use std::fs; use std::fs::File; use std::io::{BufWriter, Cursor}; -use std::path::Path; +use std::path::{Path, PathBuf}; const SUPERVISOR: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/openshell-sandbox.zst")); const ROOTFS_VARIANT_MARKER: &str = ".openshell-rootfs-variant"; @@ -29,6 +29,32 @@ pub fn prepare_sandbox_rootfs_from_image_root( Ok(()) } +/// Re-inject the init script and supervisor binary into an already-prepared +/// rootfs. 
The image rootfs archive cache is keyed by image digest, so a +/// driver rebuild does not invalidate it. Calling this after extraction +/// ensures the guest always runs the init script and supervisor that match +/// the running driver binary. +pub fn refresh_runtime_artifacts(rootfs: &Path) -> Result<(), String> { + let init_path = rootfs.join("srv/openshell-vm-sandbox-init.sh"); + if let Some(parent) = init_path.parent() { + fs::create_dir_all(parent).map_err(|e| format!("create {}: {e}", parent.display()))?; + } + fs::write( + &init_path, + include_str!("../scripts/openshell-vm-sandbox-init.sh"), + ) + .map_err(|e| format!("write {}: {e}", init_path.display()))?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt as _; + fs::set_permissions(&init_path, fs::Permissions::from_mode(0o755)) + .map_err(|e| format!("chmod {}: {e}", init_path.display()))?; + } + + ensure_supervisor_binary(rootfs)?; + Ok(()) +} + pub fn extract_rootfs_archive_to(archive_path: &Path, dest: &Path) -> Result<(), String> { if dest.exists() { fs::remove_dir_all(dest) @@ -191,6 +217,350 @@ pub fn validate_sandbox_rootfs(rootfs: &Path) -> Result<(), String> { Ok(()) } +/// Kernel version of the libkrunfw guest. Modules must be compiled against +/// this exact version; a mismatch causes `modprobe` failures at boot. +/// +/// Keep in sync with: +/// - `tasks/scripts/vm/build-nvidia-modules.sh` (KERNEL_TREE path) +/// - `openshell-vm-sandbox-init.sh` `setup_gpu()` expected version +const GUEST_KERNEL_VERSION: &str = "6.12.76"; + +/// Inject NVIDIA kernel modules, firmware, and `kmod` tooling into a prepared +/// sandbox rootfs. Called by the driver when a sandbox requests GPU support. +/// +/// Module source resolution order: +/// 1. `OPENSHELL_GPU_MODULES_DIR` environment variable +/// 2. `/gpu-modules/` (pre-provisioned by the operator) +/// +/// Firmware source resolution (first match wins): +/// 0. Rootfs already contains `.bin` files (e.g. 
from the image's `.run` +/// installer) — **skip injection entirely** to avoid version mismatch. +/// 1. `/../nvidia-firmware/` +/// 2. Host `/lib/firmware/nvidia/` +/// +/// Returns an error only if module injection is impossible (no source found +/// or a write fails). Missing firmware emits a warning and continues. +pub fn inject_gpu_modules(rootfs: &Path, state_dir: &Path) -> Result<(), String> { + let modules_dir = resolve_gpu_modules_dir(state_dir)?; + + let ko_files: Vec = fs::read_dir(&modules_dir) + .map_err(|e| format!("read GPU modules dir {}: {e}", modules_dir.display()))? + .filter_map(|entry| { + let entry = entry.ok()?; + let path = entry.path(); + if path.extension().is_some_and(|ext| ext == "ko") { + Some(path) + } else { + None + } + }) + .collect(); + + if ko_files.is_empty() { + return Err(format!( + "GPU modules dir {} contains no .ko files", + modules_dir.display() + )); + } + + let modules_dst = rootfs.join(format!( + "lib/modules/{GUEST_KERNEL_VERSION}/kernel/drivers/nvidia" + )); + fs::create_dir_all(&modules_dst) + .map_err(|e| format!("create {}: {e}", modules_dst.display()))?; + + for ko in &ko_files { + let dest = modules_dst.join(ko.file_name().unwrap()); + let bytes_copied = fs::copy(ko, &dest).map_err(|e| { + format!( + "copy {} -> {}: {e}", + ko.display(), + dest.display() + ) + })?; + tracing::info!( + module = %ko.file_name().unwrap().to_string_lossy(), + size_bytes = bytes_copied, + src = %ko.display(), + "injected GPU kernel module" + ); + } + + inject_gpu_firmware(rootfs, &modules_dir); + ensure_kmod_symlinks(rootfs); + warn_missing_gpu_userspace(rootfs); + + Ok(()) +} + +/// Check whether the rootfs contains essential GPU userspace binaries. +/// Emits actionable warnings when the sandbox image lacks nvidia-smi +/// or CUDA libraries — common when `--gpu` is used with a non-GPU base +/// image like `ubuntu:latest` instead of the GPU sandbox Dockerfile. 
+fn warn_missing_gpu_userspace(rootfs: &Path) { + let nvidia_smi_candidates = [ + "usr/bin/nvidia-smi", + "usr/local/bin/nvidia-smi", + "bin/nvidia-smi", + ]; + let has_nvidia_smi = nvidia_smi_candidates + .iter() + .any(|p| rootfs.join(p).exists()); + + if !has_nvidia_smi { + tracing::warn!( + "GPU sandbox image does not contain nvidia-smi. The sandbox will \ + have GPU kernel modules but no NVIDIA userspace tools. Use a \ + GPU-enabled image (e.g. --from ./sandboxes/nvidia-gpu/Dockerfile) \ + or install the NVIDIA driver userspace in your image." + ); + } +} + +/// Locate the directory containing pre-built NVIDIA `.ko` files. +/// +/// Resolution order: +/// 1. `OPENSHELL_GPU_MODULES_DIR` env var (explicit override) +/// 2. `/gpu-modules/` (operator pre-provisioned) +/// 3. `/target/libkrun-build/nvidia-modules/` (build tree, +/// discovered relative to the driver executable) +/// 4. Host `/lib/modules//kernel/drivers/nvidia/` +fn resolve_gpu_modules_dir(state_dir: &Path) -> Result { + if let Ok(dir) = std::env::var("OPENSHELL_GPU_MODULES_DIR") { + let p = PathBuf::from(&dir); + if p.is_dir() { + tracing::info!(path = %p.display(), "using GPU modules from OPENSHELL_GPU_MODULES_DIR"); + return Ok(p); + } + return Err(format!( + "OPENSHELL_GPU_MODULES_DIR={dir} is not a directory" + )); + } + + let provisioned = state_dir.join("gpu-modules"); + if provisioned.is_dir() { + tracing::info!(path = %provisioned.display(), "using pre-provisioned GPU modules"); + return Ok(provisioned); + } + + // Auto-discover from the build tree. The driver binary lives at + // `target/{debug,release}/openshell-driver-vm`, so the project root + // is two levels up. The old GPU rootfs script places modules at + // `target/libkrun-build/nvidia-modules/`. + if let Some(build_tree_dir) = discover_build_tree_modules() { + return Ok(build_tree_dir); + } + + // Check common host-installed module paths. 
+ for candidate in [ + format!("/lib/modules/{GUEST_KERNEL_VERSION}/kernel/drivers/nvidia"), + format!("/lib/modules/{GUEST_KERNEL_VERSION}/extra/nvidia"), + ] { + let p = PathBuf::from(&candidate); + if dir_has_ko_files(&p) { + tracing::info!(path = %p.display(), "using host-installed GPU modules"); + return Ok(p); + } + } + + Err(format!( + "No GPU kernel modules found. Searched: OPENSHELL_GPU_MODULES_DIR (unset), \ + {}, build tree, host /lib/modules/{}. \ + Build modules with `mise run vm:nvidia-modules` \ + or set OPENSHELL_GPU_MODULES_DIR.", + provisioned.display(), + GUEST_KERNEL_VERSION, + )) +} + +/// Walk up from the driver executable to find `target/libkrun-build/nvidia-modules/`. +/// +/// This is a development convenience — production deployments should use +/// `OPENSHELL_GPU_MODULES_DIR` or pre-provision `/gpu-modules/`. +fn discover_build_tree_modules() -> Option { + #[cfg(unix)] + if unsafe { libc::getuid() } == 0 { + tracing::debug!("build-tree GPU module discovery running as root; \ + prefer OPENSHELL_GPU_MODULES_DIR in production"); + } + let exe = std::env::current_exe().ok()?; + // exe is typically target/{debug,release}/openshell-driver-vm + let target_dir = exe.parent()?.parent()?; + let modules_dir = target_dir.join("libkrun-build/nvidia-modules"); + if dir_has_ko_files(&modules_dir) { + tracing::info!( + path = %modules_dir.display(), + "auto-discovered GPU modules in build tree" + ); + return Some(modules_dir); + } + + // Also try CWD-relative (for `cargo run` or `mise run` from project root). 
+ let cwd_candidate = PathBuf::from("target/libkrun-build/nvidia-modules"); + if dir_has_ko_files(&cwd_candidate) { + let abs = cwd_candidate.canonicalize().unwrap_or(cwd_candidate.clone()); + tracing::info!( + path = %abs.display(), + "auto-discovered GPU modules relative to CWD" + ); + return Some(abs); + } + + None +} + +fn dir_has_ko_files(dir: &Path) -> bool { + if !dir.is_dir() { + return false; + } + let Some(entries) = fs::read_dir(dir).ok() else { + return false; + }; + let mut has_uncompressed = false; + let mut has_compressed = false; + for entry in entries.flatten() { + let path = entry.path(); + match path.extension().and_then(|e| e.to_str()) { + Some("ko") => has_uncompressed = true, + Some("zst" | "xz") => { + if path.file_stem().and_then(|s| std::path::Path::new(s).extension()).is_some_and(|ext| ext == "ko") { + has_compressed = true; + } + } + _ => {} + } + } + if !has_uncompressed && has_compressed { + tracing::warn!( + path = %dir.display(), + "directory contains compressed .ko.zst/.ko.xz modules but only uncompressed .ko files are supported" + ); + } + has_uncompressed +} + +/// Copy NVIDIA GSP firmware into the rootfs. Non-fatal on failure. +/// +/// Skips injection if the rootfs already contains `.bin` firmware files +/// (e.g. the sandbox Docker image installed them via the NVIDIA `.run` +/// installer). Overwriting image-provided firmware with build-tree or +/// host firmware causes version mismatches when the host driver differs +/// from the image's driver version. +fn inject_gpu_firmware(rootfs: &Path, modules_dir: &Path) { + let fw_dst = rootfs.join("lib/firmware/nvidia"); + + if rootfs_has_firmware_bins(&fw_dst) { + tracing::info!( + path = %fw_dst.display(), + "rootfs already contains GPU firmware; skipping injection" + ); + return; + } + + // Try version-matched firmware next to the modules directory. 
+ let fw_parent = modules_dir + .parent() + .map(|p| p.join("nvidia-firmware")); + + if let Some(ref fw_dir) = fw_parent { + if fw_dir.is_dir() { + if let Err(e) = copy_dir_contents(fw_dir, &fw_dst) { + tracing::warn!(error = %e, "failed to copy version-matched firmware"); + } else { + tracing::info!(src = %fw_dir.display(), "injected GPU firmware (version-matched)"); + return; + } + } + } + + // Fallback: host firmware + for candidate in ["/lib/firmware/nvidia", "/usr/lib/firmware/nvidia"] { + let host_fw = Path::new(candidate); + if host_fw.is_dir() { + if let Err(e) = copy_dir_contents(host_fw, &fw_dst) { + tracing::warn!(error = %e, src = candidate, "failed to copy host firmware"); + } else { + tracing::info!(src = candidate, "injected GPU firmware from host"); + return; + } + } + } + + tracing::warn!( + "no NVIDIA GSP firmware found; GPU guests may fail to initialize. \ + Place firmware in {:?} or host /lib/firmware/nvidia/", + fw_parent.as_deref().unwrap_or(Path::new("(unknown)")) + ); +} + +/// Check whether a firmware directory (or any subdirectory) contains `.bin` files. +fn rootfs_has_firmware_bins(fw_dir: &Path) -> bool { + if !fw_dir.is_dir() { + return false; + } + let Ok(entries) = fs::read_dir(fw_dir) else { + return false; + }; + for entry in entries.flatten() { + let path = entry.path(); + if path.extension().is_some_and(|ext| ext == "bin") { + return true; + } + if path.is_dir() && rootfs_has_firmware_bins(&path) { + return true; + } + } + false +} + +/// Ensure `modprobe`, `insmod`, etc. symlinks exist. Many minimal container +/// images install `kmod` but lack the convenience symlinks in `/usr/sbin`. +fn ensure_kmod_symlinks(rootfs: &Path) { + let kmod_candidates = ["bin/kmod", "usr/bin/kmod", "sbin/kmod", "usr/sbin/kmod"]; + let kmod_exists = kmod_candidates + .iter() + .any(|p| rootfs.join(p).exists()); + + if !kmod_exists { + tracing::warn!("kmod not found in rootfs; modprobe will fail. 
\ + Ensure the sandbox image installs the 'kmod' package."); + return; + } + + let sbin = rootfs.join("usr/sbin"); + let _ = fs::create_dir_all(&sbin); + for tool in ["modprobe", "insmod", "rmmod", "lsmod", "depmod"] { + let link = sbin.join(tool); + if !link.exists() { + #[cfg(unix)] + { + let _ = std::os::unix::fs::symlink("../../bin/kmod", &link) + .or_else(|_| std::os::unix::fs::symlink("/usr/bin/kmod", &link)); + } + } + } +} + +/// Recursively copy all files from `src` to `dst`, preserving directory structure. +fn copy_dir_contents(src: &Path, dst: &Path) -> Result<(), String> { + fs::create_dir_all(dst).map_err(|e| format!("create {}: {e}", dst.display()))?; + + for entry in fs::read_dir(src).map_err(|e| format!("read {}: {e}", src.display()))? { + let entry = entry.map_err(|e| format!("read entry in {}: {e}", src.display()))?; + let src_path = entry.path(); + let dst_path = dst.join(entry.file_name()); + + if src_path.is_dir() { + copy_dir_contents(&src_path, &dst_path)?; + } else { + fs::copy(&src_path, &dst_path).map_err(|e| { + format!("copy {} -> {}: {e}", src_path.display(), dst_path.display()) + })?; + } + } + Ok(()) +} + fn ensure_sandbox_guest_user(rootfs: &Path) -> Result<(), String> { const SANDBOX_UID: u32 = 10001; const SANDBOX_GID: u32 = 10001; @@ -437,6 +807,195 @@ mod tests { let _ = fs::remove_dir_all(&dir); } + #[test] + fn refresh_runtime_artifacts_overwrites_stale_init_script() { + let dir = unique_temp_dir(); + let rootfs = dir.join("rootfs"); + + fs::create_dir_all(rootfs.join("srv")).expect("create srv"); + fs::create_dir_all(rootfs.join("opt/openshell/bin")).expect("create openshell bin"); + fs::write( + rootfs.join("srv/openshell-vm-sandbox-init.sh"), + b"#!/bin/bash\n# stale placeholder", + ) + .expect("write stale init"); + fs::write( + rootfs.join("opt/openshell/bin/openshell-sandbox"), + b"old-supervisor", + ) + .expect("write stale supervisor"); + + refresh_runtime_artifacts(&rootfs).expect("refresh runtime artifacts"); + + let 
init_content = + fs::read_to_string(rootfs.join("srv/openshell-vm-sandbox-init.sh")).expect("read init"); + assert!( + init_content.contains("setup_gpu"), + "refreshed init script should contain GPU setup logic" + ); + + let _ = fs::remove_dir_all(&dir); + } + + #[test] + fn inject_gpu_modules_copies_ko_files() { + let dir = unique_temp_dir(); + let modules_dir = dir.join("modules"); + let rootfs = dir.join("rootfs"); + + fs::create_dir_all(&modules_dir).expect("create modules dir"); + fs::create_dir_all(&rootfs).expect("create rootfs dir"); + fs::write(modules_dir.join("nvidia.ko"), b"\x7fELF-fake-module-1").expect("write nvidia.ko"); + fs::write(modules_dir.join("nvidia-uvm.ko"), b"\x7fELF-fake-module-2") + .expect("write nvidia-uvm.ko"); + + unsafe { std::env::set_var("OPENSHELL_GPU_MODULES_DIR", &modules_dir) }; + let result = inject_gpu_modules(&rootfs, Path::new("/dummy/state")); + unsafe { std::env::remove_var("OPENSHELL_GPU_MODULES_DIR") }; + + result.expect("inject_gpu_modules should succeed"); + + let dest = rootfs.join("lib/modules/6.12.76/kernel/drivers/nvidia"); + assert!(dest.join("nvidia.ko").is_file()); + assert!(dest.join("nvidia-uvm.ko").is_file()); + + let _ = fs::remove_dir_all(&dir); + } + + #[test] + fn inject_gpu_modules_fails_with_no_ko_files() { + let dir = unique_temp_dir(); + let modules_dir = dir.join("modules"); + + fs::create_dir_all(&modules_dir).expect("create modules dir"); + fs::write(modules_dir.join("readme.txt"), b"not a kernel module").expect("write txt"); + + unsafe { std::env::set_var("OPENSHELL_GPU_MODULES_DIR", &modules_dir) }; + let result = inject_gpu_modules(Path::new("/dummy/rootfs"), Path::new("/dummy/state")); + unsafe { std::env::remove_var("OPENSHELL_GPU_MODULES_DIR") }; + + let err = result.expect_err("should fail with no .ko files"); + assert!( + err.contains("no .ko files"), + "error should mention 'no .ko files', got: {err}" + ); + + let _ = fs::remove_dir_all(&dir); + } + + #[test] + fn 
inject_gpu_modules_fails_with_missing_dir() { + let dir = unique_temp_dir(); + let missing = dir.join("does-not-exist"); + + unsafe { std::env::set_var("OPENSHELL_GPU_MODULES_DIR", &missing) }; + let result = inject_gpu_modules(Path::new("/dummy/rootfs"), Path::new("/dummy/state")); + unsafe { std::env::remove_var("OPENSHELL_GPU_MODULES_DIR") }; + + let err = result.expect_err("should fail with missing directory"); + assert!( + err.contains("not a directory"), + "error should mention 'not a directory', got: {err}" + ); + + let _ = fs::remove_dir_all(&dir); + } + + #[test] + fn inject_gpu_firmware_skips_when_rootfs_has_bins() { + let dir = unique_temp_dir(); + let rootfs = dir.join("rootfs"); + let modules_dir = dir.join("modules"); + let fw_dir = rootfs.join("lib/firmware/nvidia"); + + fs::create_dir_all(&fw_dir).expect("create firmware dir"); + fs::create_dir_all(&modules_dir).expect("create modules dir"); + fs::write(fw_dir.join("gsp.bin"), b"original-firmware-content").expect("write gsp.bin"); + + inject_gpu_firmware(&rootfs, &modules_dir); + + let content = fs::read(fw_dir.join("gsp.bin")).expect("read gsp.bin after injection"); + assert_eq!( + content, + b"original-firmware-content", + "firmware should not be overwritten when rootfs already has .bin files" + ); + + let _ = fs::remove_dir_all(&dir); + } + + #[cfg(unix)] + #[test] + fn ensure_kmod_symlinks_creates_links() { + let dir = unique_temp_dir(); + let rootfs = dir.join("rootfs"); + + fs::create_dir_all(rootfs.join("bin")).expect("create bin"); + fs::write(rootfs.join("bin/kmod"), b"kmod-stub").expect("write kmod"); + + ensure_kmod_symlinks(&rootfs); + + assert!( + rootfs.join("usr/sbin/modprobe").exists(), + "modprobe symlink should exist" + ); + assert!( + rootfs.join("usr/sbin/insmod").exists(), + "insmod symlink should exist" + ); + assert!( + rootfs.join("usr/sbin/depmod").exists(), + "depmod symlink should exist" + ); + assert!( + fs::symlink_metadata(rootfs.join("usr/sbin/modprobe")) + .unwrap() + 
.file_type() + .is_symlink(), + "modprobe should be a symlink" + ); + + let _ = fs::remove_dir_all(&dir); + } + + #[test] + fn ensure_kmod_symlinks_warns_without_kmod() { + let dir = unique_temp_dir(); + let rootfs = dir.join("rootfs"); + + fs::create_dir_all(&rootfs).expect("create rootfs"); + + ensure_kmod_symlinks(&rootfs); + + assert!( + !rootfs.join("usr/sbin/modprobe").exists(), + "modprobe should not exist when kmod is missing" + ); + + let _ = fs::remove_dir_all(&dir); + } + + #[test] + fn rootfs_has_firmware_bins_detects_nested() { + let dir1 = unique_temp_dir(); + fs::create_dir_all(dir1.join("subdir")).expect("create subdir"); + fs::write(dir1.join("subdir/file.bin"), b"firmware").expect("write .bin"); + assert!( + rootfs_has_firmware_bins(&dir1), + "should detect .bin in nested subdir" + ); + let _ = fs::remove_dir_all(&dir1); + + let dir2 = unique_temp_dir(); + fs::create_dir_all(dir2.join("subdir")).expect("create subdir"); + fs::write(dir2.join("subdir/file.txt"), b"not firmware").expect("write .txt"); + assert!( + !rootfs_has_firmware_bins(&dir2), + "should not detect .txt as firmware" + ); + let _ = fs::remove_dir_all(&dir2); + } + fn unique_temp_dir() -> PathBuf { static COUNTER: AtomicU64 = AtomicU64::new(0); let nanos = SystemTime::now() diff --git a/crates/openshell-driver-vm/src/runtime.rs b/crates/openshell-driver-vm/src/runtime.rs index 758808c8e..6c4d49db5 100644 --- a/crates/openshell-driver-vm/src/runtime.rs +++ b/crates/openshell-driver-vm/src/runtime.rs @@ -346,6 +346,8 @@ fn build_kernel_cmdline(config: &VmLaunchConfig) -> String { if config.gpu_bdf.is_some() { parts.push("firmware_class.path=/lib/firmware".to_string()); + parts.push("modprobe.blacklist=nouveau".to_string()); + parts.push("nouveau.modeset=0".to_string()); } parts.join(" ") diff --git a/crates/openshell-vm/runtime/kernel/openshell.kconfig b/crates/openshell-vm/runtime/kernel/openshell.kconfig index b5f0330af..072bf6e16 100644 --- 
a/crates/openshell-vm/runtime/kernel/openshell.kconfig +++ b/crates/openshell-vm/runtime/kernel/openshell.kconfig @@ -123,6 +123,28 @@ CONFIG_MEMCG=y CONFIG_POSIX_MQUEUE=y CONFIG_POSIX_MQUEUE_SYSCTL=y +# ── PCI/PCIe (required for GPU passthrough via QEMU vfio-pci) ──────────── +# The libkrunfw base config disables CONFIG_PCI. GPU sandboxes using the +# QEMU backend pass the GPU through as a PCIe device on a q35 machine. +# Without PCI core support the guest kernel cannot see any PCI bus, so the +# nvidia driver loads but finds zero devices. +CONFIG_PCI=y +CONFIG_PCI_MSI=y +CONFIG_PCIEPORTBUS=y + +# ── Loadable kernel modules (required for GPU passthrough) ────────────── +# The libkrunfw base config disables CONFIG_MODULES. GPU sandboxes need it +# to load nvidia.ko, nvidia-uvm.ko, and nvidia-modeset.ko at boot via +# modprobe. Without this, the guest kernel rejects all module loads. +# +# SECURITY NOTE: This enables module loading for ALL VMs (including +# non-GPU), expanding the guest kernel attack surface. The sandbox +# supervisor's seccomp profile must block init_module/finit_module +# syscalls for the sandbox user to prevent arbitrary module loading. +# Tracked: consider per-purpose kernel builds (GPU vs non-GPU). +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y + # ── Security features required by the sandbox runtime ─────────────────── CONFIG_SECURITY_LANDLOCK=y CONFIG_SECCOMP_FILTER=y diff --git a/sandboxes/nvidia-gpu/Dockerfile b/sandboxes/nvidia-gpu/Dockerfile new file mode 100644 index 000000000..372d58b58 --- /dev/null +++ b/sandboxes/nvidia-gpu/Dockerfile @@ -0,0 +1,84 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# GPU-enabled sandbox image for OpenShell VM driver. 
+# +# Provides userspace GPU tooling (nvidia-smi, NVML, CUDA driver libs, kmod) +# on top of a minimal Ubuntu base with the full NVIDIA driver userspace +# installed via the official .run installer (no kernel modules -- those are +# injected at rootfs preparation time by the VM driver). +# +# Usage: +# openshell sandbox create --gpu --from ./sandboxes/nvidia-gpu/Dockerfile +# openshell sandbox create --gpu --from nvidia-gpu # once published +# +# Build-time args: +# CUDA_VERSION - CUDA toolkit version (default: 12.8.1) +# UBUNTU_VERSION - Ubuntu release (default: 22.04) +# NVIDIA_DRIVER_VERSION - Must match the kernel modules built by +# `mise run vm:nvidia-modules` (default: 580.159.03) + +ARG CUDA_VERSION=12.8.1 +ARG UBUNTU_VERSION=22.04 + +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${UBUNTU_VERSION} + +# Must match NVIDIA_DRIVER_VERSION in sandboxes/nvidia-gpu/versions.env +# and NVIDIA_OPEN_VERSION in tasks/scripts/vm/build-nvidia-modules.sh +ARG NVIDIA_DRIVER_VERSION=580.159.03 + +# ── System packages required by the sandbox init script ────────────── +RUN apt-get update && apt-get install -y --no-install-recommends \ + bash \ + busybox-static \ + ca-certificates \ + curl \ + iproute2 \ + iptables \ + kmod \ + pciutils \ + && rm -rf /var/lib/apt/lists/* + +RUN mkdir -p /usr/share/udhcpc && ln -sf /bin/busybox /sbin/udhcpc + +# ── NVIDIA driver userspace ────────────────────────────────────────── +# The nvidia/cuda base image does NOT include the driver (nvidia-smi, +# libcuda.so, libnvidia-ml.so). It relies on the NVIDIA Container +# Runtime to mount them from the host. In a VM there is no container +# runtime, so we install the driver userspace via the .run installer +# with --no-kernel-module (kernel modules are injected separately). +# TODO(gpu): Pin SHA-256 checksum for reproducible builds. 
Compute with: +# curl -fsSL | sha256sum +RUN curl -fsSL \ + "https://us.download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}.run" \ + -o /tmp/nvidia.run \ + && chmod +x /tmp/nvidia.run \ + && /tmp/nvidia.run \ + --silent \ + --no-kernel-module \ + --no-drm \ + --no-x-check \ + --no-systemd \ + --no-nvidia-modprobe \ + --no-distro-scripts \ + && rm -f /tmp/nvidia.run + +# Ensure library paths are indexed for dlopen. +RUN mkdir -p /etc/ld.so.conf.d \ + && echo "/usr/local/cuda/lib64" > /etc/ld.so.conf.d/cuda.conf \ + && echo "/usr/lib/x86_64-linux-gnu" >> /etc/ld.so.conf.d/cuda.conf \ + && ldconfig 2>/dev/null || true + +# ── Kernel modules ─────────────────────────────────────────────────── +# NVIDIA kernel modules (.ko) must match the guest VM kernel (libkrunfw). +# They are NOT in this image -- the VM driver injects them at rootfs +# preparation time via `inject_gpu_modules`. +# +# GSP firmware (.bin) IS provided by the .run installer above. The VM +# driver detects its presence and skips firmware injection, avoiding +# version mismatches when the host driver differs from this image's. +RUN mkdir -p /lib/modules + +LABEL org.opencontainers.image.title="OpenShell GPU Sandbox" \ + org.opencontainers.image.description="GPU-enabled sandbox for OpenShell VM driver with CUDA support" \ + io.openshell.sandbox.gpu="true" diff --git a/sandboxes/nvidia-gpu/README.md b/sandboxes/nvidia-gpu/README.md new file mode 100644 index 000000000..7826b59cb --- /dev/null +++ b/sandboxes/nvidia-gpu/README.md @@ -0,0 +1,98 @@ + + + +# GPU Sandbox Image + +GPU-enabled sandbox image for the OpenShell VM driver. Provides NVIDIA +userspace tooling (nvidia-smi, NVML, CUDA driver libraries) on top of a +minimal Ubuntu base. Kernel modules are injected separately by the VM +driver at sandbox creation time. 
+ +## Architecture + +The GPU sandbox splits responsibility between the container image and the +VM driver: + +| Layer | Source | Contents | +|-------|--------|----------| +| **Userspace** | This Dockerfile | nvidia-smi, libcuda.so, libnvidia-ml.so, kmod, iproute2 | +| **Kernel modules** | VM driver injection | nvidia.ko, nvidia_uvm.ko, nvidia_modeset.ko (built for guest kernel 6.12.76) | +| **GSP firmware** | `.run` installer in image OR host fallback | gsp_ga10x.bin, gsp_tu10x.bin | + +The kernel modules must be compiled against the exact guest kernel version +used by libkrunfw. The VM driver injects them into each sandbox's rootfs +at creation time via `inject_gpu_modules()`. + +## Prerequisites + +- Linux x86_64 host with an NVIDIA GPU +- IOMMU enabled (for VFIO GPU passthrough) +- Docker (for building the sandbox image) +- Guest kernel built with `CONFIG_MODULES=y` (`mise run vm:setup`) + +## Quick Start + +```shell +# 1. One-time: build the VM runtime (includes guest kernel with module support) +mise run vm:setup + +# 2. Build NVIDIA kernel modules for the guest kernel +mise run vm:nvidia-modules + +# 3. Build the GPU sandbox image +docker build -t nvidia-gpu:latest ./sandboxes/nvidia-gpu/ + +# 4. Start the gateway with GPU support +sudo mise run gateway:vm -- --gpu + +# 5. Create a GPU sandbox +openshell sandbox create --gpu --from nvidia-gpu:latest +``` + +## Version Coupling + +The NVIDIA driver version must match across three components: + +| Component | Variable | Default | +|-----------|----------|---------| +| Dockerfile (userspace) | `NVIDIA_DRIVER_VERSION` | `580.159.03` | +| Module build script | `NVIDIA_OPEN_VERSION` | `580.159.03` | +| Shared reference | `sandboxes/nvidia-gpu/versions.env` | `580.159.03` | + +A mismatch causes `modprobe` "version magic" errors or nvidia-smi ABI +failures at sandbox boot time. 
+ +## Customization + +### Changing the CUDA version + +```shell +docker build \ + --build-arg CUDA_VERSION=12.6.0 \ + --build-arg UBUNTU_VERSION=22.04 \ + -t my-gpu-sandbox:latest \ + ./sandboxes/nvidia-gpu/ +``` + +### Changing the NVIDIA driver version + +Update all three locations: +1. `sandboxes/nvidia-gpu/versions.env` +2. `sandboxes/nvidia-gpu/Dockerfile` ARG `NVIDIA_DRIVER_VERSION` +3. Rebuild kernel modules: `NVIDIA_OPEN_VERSION= mise run vm:nvidia-modules` + +### Adding packages + +Add packages to the `apt-get install` line in the Dockerfile. The image +must retain `bash`, `kmod`, `iproute2`, and `busybox-static` — the VM +driver validates these at rootfs preparation time. + +## Troubleshooting + +| Symptom | Cause | Fix | +|---------|-------|-----| +| "No GPU kernel modules found" | Modules not built | `mise run vm:nvidia-modules` | +| "kmod not found in rootfs" | Image missing kmod package | Add `kmod` to Dockerfile `apt-get install` | +| `modprobe nvidia` fails | Kernel version mismatch | Rebuild modules after `mise run vm:setup` | +| nvidia-smi "driver/library mismatch" | Userspace/module version mismatch | Ensure Dockerfile and module versions match | +| "kernel version mismatch: expected X, got Y" | Guest kernel was rebuilt | Rebuild modules: `mise run vm:nvidia-modules` | diff --git a/sandboxes/nvidia-gpu/versions.env b/sandboxes/nvidia-gpu/versions.env new file mode 100644 index 000000000..d2f086da8 --- /dev/null +++ b/sandboxes/nvidia-gpu/versions.env @@ -0,0 +1,6 @@ +# Shared NVIDIA driver/module version for GPU sandbox images. +# Referenced by: +# - sandboxes/nvidia-gpu/Dockerfile (ARG NVIDIA_DRIVER_VERSION) +# - tasks/scripts/vm/build-nvidia-modules.sh (NVIDIA_OPEN_VERSION) +# These MUST match for kernel modules and userspace to be compatible. 
+NVIDIA_DRIVER_VERSION=580.159.03 diff --git a/tasks/scripts/gateway-vm.sh b/tasks/scripts/gateway-vm.sh index ac047dba2..f77e1c75d 100755 --- a/tasks/scripts/gateway-vm.sh +++ b/tasks/scripts/gateway-vm.sh @@ -32,6 +32,8 @@ set -euo pipefail +MISE="${__MISE_EXE:-$(command -v mise 2>/dev/null || echo mise)}" + ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" PORT="${OPENSHELL_SERVER_PORT:-18081}" GATEWAY_NAME="${OPENSHELL_VM_GATEWAY_NAME:-vm-dev}" @@ -274,13 +276,13 @@ if [ ! -d "${COMPRESSED_DIR}" ] \ || ! find "${COMPRESSED_DIR}" -maxdepth 1 -name 'libkrun*.zst' | grep -q . \ || [ ! -f "${COMPRESSED_DIR}/gvproxy.zst" ]; then echo "==> Preparing embedded VM runtime (mise run vm:setup)" - mise run vm:setup + "$MISE" run vm:setup fi if [ ! -f "${COMPRESSED_DIR}/openshell-sandbox.zst" ]; then check_supervisor_cross_toolchain echo "==> Building bundled VM supervisor (mise run vm:supervisor)" - mise run vm:supervisor + "$MISE" run vm:supervisor fi export OPENSHELL_VM_RUNTIME_COMPRESSED_DIR="${COMPRESSED_DIR}" @@ -291,8 +293,17 @@ if [[ -n "${CARGO_BUILD_JOBS:-}" ]]; then fi echo "==> Building openshell-gateway and openshell-driver-vm" -cargo build ${CARGO_BUILD_JOBS_ARG[@]+"${CARGO_BUILD_JOBS_ARG[@]}"} \ - -p openshell-server -p openshell-driver-vm +CARGO_CMD=(cargo build ${CARGO_BUILD_JOBS_ARG[@]+"${CARGO_BUILD_JOBS_ARG[@]}"} \ + -p openshell-server -p openshell-driver-vm) +if [ "$(id -u)" = "0" ] && [ -n "${SUDO_USER:-}" ]; then + chown -R "${SUDO_USER}" "${ROOT}/target" 2>/dev/null || true + sudo -u "${SUDO_USER}" env \ + "PATH=${PATH}" \ + "OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=${OPENSHELL_VM_RUNTIME_COMPRESSED_DIR}" \ + "${CARGO_CMD[@]}" +else + "${CARGO_CMD[@]}" +fi if [ "$(uname -s)" = "Darwin" ]; then echo "==> Codesigning openshell-driver-vm (Hypervisor entitlement)" diff --git a/tasks/scripts/vm/build-nvidia-modules.sh b/tasks/scripts/vm/build-nvidia-modules.sh new file mode 100755 index 000000000..96287fc8c --- /dev/null +++ 
b/tasks/scripts/vm/build-nvidia-modules.sh @@ -0,0 +1,176 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Build NVIDIA kernel modules against the libkrunfw guest kernel. +# +# Uses the NVIDIA DKMS source already installed on the host (from the +# nvidia-dkms-* or nvidia-kernel-source-* package) and compiles it +# against the guest kernel tree produced by build-libkrun.sh. +# +# Prerequisites: +# - NVIDIA kernel source in /usr/src/nvidia-*/ +# - Guest kernel built with CONFIG_MODULES=y (mise run vm:setup) +# +# Output: +# target/libkrun-build/nvidia-modules/*.ko +# target/libkrun-build/nvidia-firmware//*.bin (if available) +# +# Usage: +# ./build-nvidia-modules.sh + +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)" +BUILD_DIR="${ROOT}/target/libkrun-build" +OUTPUT_DIR="${BUILD_DIR}/nvidia-modules" + +# Guest kernel version — keep in sync with GUEST_KERNEL_VERSION in +# crates/openshell-driver-vm/src/rootfs.rs and the init script. +KERNEL_TREE="${BUILD_DIR}/libkrunfw/linux-6.12.76" +if [ ! -d "${KERNEL_TREE}" ]; then + echo "ERROR: Guest kernel tree not found at ${KERNEL_TREE}" >&2 + echo " Run: mise run vm:setup" >&2 + exit 1 +fi + +if ! grep -q 'CONFIG_MODULES=y' "${KERNEL_TREE}/.config" 2>/dev/null; then + echo "ERROR: Guest kernel was built without CONFIG_MODULES=y" >&2 + echo " Ensure openshell.kconfig includes CONFIG_MODULES=y and rebuild:" >&2 + echo " mise run vm:setup" >&2 + exit 1 +fi + +if [ ! -f "${KERNEL_TREE}/Module.symvers" ]; then + echo "ERROR: Module.symvers not found — the kernel needs to be rebuilt" >&2 + echo " with CONFIG_MODULES=y so the build produces Module.symvers." >&2 + echo " Run: mise run vm:setup" >&2 + exit 1 +fi + +# Detect the host NVIDIA driver version to pick a compatible module source. 
+HOST_DRIVER_VERSION="${NVIDIA_DRIVER_VERSION:-}" +if [ -z "${HOST_DRIVER_VERSION}" ]; then + HOST_DRIVER_VERSION="$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null || true)" +fi +if [ -z "${HOST_DRIVER_VERSION}" ]; then + HOST_DRIVER_VERSION="$(modinfo -F version /lib/modules/$(uname -r)/updates/dkms/nvidia.ko 2>/dev/null || true)" +fi + +# Use the open-gpu-kernel-modules release matching the host driver major +# version. The open modules support newer kernels better than the +# proprietary DKMS source shipped in /usr/src/. +# Must match NVIDIA_DRIVER_VERSION in sandboxes/nvidia-gpu/versions.env +# and sandboxes/nvidia-gpu/Dockerfile ARG NVIDIA_DRIVER_VERSION +NVIDIA_OPEN_VERSION="${NVIDIA_OPEN_VERSION:-580.159.03}" +NVIDIA_SRC_DIR="${BUILD_DIR}/open-gpu-kernel-modules-${NVIDIA_OPEN_VERSION}" + +if [ ! -d "${NVIDIA_SRC_DIR}/kernel-open" ]; then + echo "==> Downloading NVIDIA open kernel modules ${NVIDIA_OPEN_VERSION}" + TARBALL="${BUILD_DIR}/nvidia-open-${NVIDIA_OPEN_VERSION}.tar.gz" + if [ ! -f "${TARBALL}" ]; then + curl -fSL \ + "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${NVIDIA_OPEN_VERSION}.tar.gz" \ + -o "${TARBALL}" + # TODO(gpu): Add SHA-256 verification for supply chain integrity. + # echo " ${TARBALL}" | sha256sum -c - + fi + echo " Extracting..." + tar -xzf "${TARBALL}" -C "${BUILD_DIR}" + echo " Source: ${NVIDIA_SRC_DIR}" +fi + +NVIDIA_SRC="${NVIDIA_SRC_DIR}" + +# Patch API incompatibilities with newer kernels. +# __flush_tlb() was removed in kernel 6.12; use __flush_tlb_all() instead. 
+NV_PAT="${NVIDIA_SRC}/kernel-open/nvidia/nv-pat.c" +if [ -f "${NV_PAT}" ] && grep -q '__flush_tlb()' "${NV_PAT}"; then + echo "==> Patching nv-pat.c (__flush_tlb -> __flush_tlb_all)" + sed -i 's/__flush_tlb()/__flush_tlb_all()/g' "${NV_PAT}" +fi + +echo "==> Building NVIDIA ${NVIDIA_OPEN_VERSION} open kernel modules for guest kernel 6.12.76" +echo " NVIDIA source: ${NVIDIA_SRC}" +echo " Kernel tree: ${KERNEL_TREE}" +echo " Output: ${OUTPUT_DIR}" +if [ -n "${HOST_DRIVER_VERSION}" ]; then + echo " Host driver: ${HOST_DRIVER_VERSION}" +fi +echo "" + +mkdir -p "${OUTPUT_DIR}" + +NPROC="$(nproc 2>/dev/null || echo 4)" +IGNORE_CC_MISMATCH=1 make -C "${NVIDIA_SRC}" \ + SYSSRC="${KERNEL_TREE}" \ + SYSOUT="${KERNEL_TREE}" \ + -j"${NPROC}" \ + modules 2>&1 | tail -30 + +echo "" +echo "==> Collecting .ko files" + +# Open modules build into kernel-open// +for subdir in nvidia nvidia-uvm nvidia-modeset nvidia-drm nvidia-peermem; do + for search in "${NVIDIA_SRC}/kernel-open/${subdir}" "${NVIDIA_SRC}/${subdir}"; do + ko_file="${search}/${subdir//-/_}.ko" + if [ -f "${ko_file}" ]; then + cp "${ko_file}" "${OUTPUT_DIR}/" + echo " $(basename "${ko_file}") ($(du -h "${ko_file}" | cut -f1))" + break + fi + done +done + +# Also check for flat layouts. +for ko in "${NVIDIA_SRC}"/*.ko "${NVIDIA_SRC}"/kernel-open/*.ko; do + [ -f "${ko}" ] || continue + base="$(basename "${ko}")" + [ -f "${OUTPUT_DIR}/${base}" ] && continue + cp "${ko}" "${OUTPUT_DIR}/" + echo " ${base} ($(du -h "${ko}" | cut -f1))" +done + +KO_COUNT=$(find "${OUTPUT_DIR}" -name '*.ko' | wc -l) +if [ "${KO_COUNT}" -eq 0 ]; then + echo "ERROR: No .ko files produced. Check build output above." >&2 + exit 1 +fi + +echo "" +echo "==> Collecting firmware" + +# GSP firmware is included in the open-gpu-kernel-modules source tree. 
+FW_SRC="${NVIDIA_SRC}/src/nvidia/firmware" +FW_OUTPUT="${BUILD_DIR}/nvidia-firmware/${NVIDIA_OPEN_VERSION}" +if [ -d "${FW_SRC}" ] && ls "${FW_SRC}"/*.bin >/dev/null 2>&1; then + mkdir -p "${FW_OUTPUT}" + cp "${FW_SRC}"/*.bin "${FW_OUTPUT}/" + FW_COUNT=$(find "${FW_OUTPUT}" -name '*.bin' 2>/dev/null | wc -l) + echo " Copied ${FW_COUNT} firmware files from source tree" +else + # Fall back to host firmware. + HOST_FW="" + for candidate in "/lib/firmware/nvidia/${HOST_DRIVER_VERSION}" /lib/firmware/nvidia; do + if [ -d "${candidate}" ] && ls "${candidate}"/*.bin >/dev/null 2>&1; then + HOST_FW="${candidate}" + break + fi + done + if [ -n "${HOST_FW}" ]; then + mkdir -p "${FW_OUTPUT}" + cp -r "${HOST_FW}"/* "${FW_OUTPUT}/" 2>/dev/null || true + FW_COUNT=$(find "${FW_OUTPUT}" -name '*.bin' 2>/dev/null | wc -l) + echo " Copied ${FW_COUNT} firmware files from host ${HOST_FW}" + else + echo " WARNING: No firmware found. GPU guests may fail without GSP firmware." + fi +fi + +echo "" +echo "==> Done! ${KO_COUNT} kernel modules built for guest kernel 6.12.76." +echo " The VM driver will auto-discover them at:" +echo " ${OUTPUT_DIR}" +echo "" +echo " Next: mise run gateway:vm -- --gpu" diff --git a/tasks/scripts/vm/build-supervisor-bundle.sh b/tasks/scripts/vm/build-supervisor-bundle.sh index 90f5b517d..a823201d7 100755 --- a/tasks/scripts/vm/build-supervisor-bundle.sh +++ b/tasks/scripts/vm/build-supervisor-bundle.sh @@ -133,14 +133,31 @@ run_supervisor_build() { cargo_prefix=(env -u RUSTC_WRAPPER) fi - if command -v cargo-zigbuild >/dev/null 2>&1; then - "${cargo_prefix[@]}" cargo zigbuild --release -p openshell-sandbox --target "${RUST_TARGET}" \ - --manifest-path "${ROOT}/Cargo.toml" - else + # When running under sudo, de-escalate the build to the original user. + # The target/ dir is owned by that user and root may lack write access + # (e.g. NFS root_squash). Only the final gateway execution needs root. + # Pass PATH explicitly so cargo/rustc/sccache remain reachable. 
+ # Also reclaim any root-owned artifacts left by prior sudo builds. + if [ "$(id -u)" = "0" ] && [ -n "${SUDO_USER:-}" ]; then + if [ -d "${ROOT}/target" ]; then + chown -R "${SUDO_USER}" "${ROOT}/target" 2>/dev/null || true + fi + cargo_prefix=(sudo -u "${SUDO_USER}" env "PATH=${PATH}" "${cargo_prefix[@]}") + fi + + local host_arch + host_arch="$(uname -m)" + local cargo_build_cmd="build" + local cargo_bin="cargo" + + if [ "${host_arch}" != "${GUEST_ARCH}" ] && command -v cargo-zigbuild >/dev/null 2>&1; then + cargo_build_cmd="zigbuild" + elif [ "${host_arch}" != "${GUEST_ARCH}" ]; then echo " cargo-zigbuild not found, falling back to cargo build..." - "${cargo_prefix[@]}" cargo build --release -p openshell-sandbox --target "${RUST_TARGET}" \ - --manifest-path "${ROOT}/Cargo.toml" fi + + "${cargo_prefix[@]}" ${cargo_bin} ${cargo_build_cmd} --release -p openshell-sandbox \ + --target "${RUST_TARGET}" --manifest-path "${ROOT}/Cargo.toml" } print_build_failure() { diff --git a/tasks/vm.toml b/tasks/vm.toml index e9eb22561..d288dd2e3 100644 --- a/tasks/vm.toml +++ b/tasks/vm.toml @@ -45,6 +45,10 @@ run = "tasks/scripts/vm/build-supervisor-bundle.sh" description = "Build the VM rootfs tarball (use -- --base for lightweight)" run = "tasks/scripts/vm/build-rootfs-tarball.sh" +["vm:nvidia-modules"] +description = "Build NVIDIA kernel modules for the guest VM kernel" +run = "tasks/scripts/vm/build-nvidia-modules.sh" + ["vm:clean"] description = "Remove all VM cached artifacts (runtime, rootfs, builds)" run = "tasks/scripts/vm/vm-clean.sh" From aba978c63b822efa16be924fe12a12587bd7ddaa Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Sun, 3 May 2026 23:43:25 -0700 Subject: [PATCH 2/3] fix(driver-vm): satisfy branch checks --- Cargo.lock | 1 + crates/openshell-driver-vm/Cargo.toml | 3 + crates/openshell-driver-vm/src/rootfs.rs | 102 ++++++++++++----------- sandboxes/nvidia-gpu/README.md | 1 + 4 files changed, 60 insertions(+), 47 deletions(-) diff --git a/Cargo.lock 
b/Cargo.lock index 4e74e89a6..f15d84c3d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3473,6 +3473,7 @@ dependencies = [ "serde_json", "sha2 0.10.9", "tar", + "temp-env", "tokio", "tokio-stream", "tonic", diff --git a/crates/openshell-driver-vm/Cargo.toml b/crates/openshell-driver-vm/Cargo.toml index c13d904a6..47ceeb587 100644 --- a/crates/openshell-driver-vm/Cargo.toml +++ b/crates/openshell-driver-vm/Cargo.toml @@ -44,6 +44,9 @@ flate2 = "1" sha2 = "0.10" zstd = "0.13" +[dev-dependencies] +temp-env = "0.3" + # smol-rs/polling drives the BSD/macOS parent-death detection in # procguard via kqueue's EVFILT_PROC / NOTE_EXIT filter. We could use # it on Linux too (via epoll + pidfd) but sticking with diff --git a/crates/openshell-driver-vm/src/rootfs.rs b/crates/openshell-driver-vm/src/rootfs.rs index cc4d21c22..b6deb2828 100644 --- a/crates/openshell-driver-vm/src/rootfs.rs +++ b/crates/openshell-driver-vm/src/rootfs.rs @@ -221,7 +221,7 @@ pub fn validate_sandbox_rootfs(rootfs: &Path) -> Result<(), String> { /// this exact version; a mismatch causes `modprobe` failures at boot. /// /// Keep in sync with: -/// - `tasks/scripts/vm/build-nvidia-modules.sh` (KERNEL_TREE path) +/// - `tasks/scripts/vm/build-nvidia-modules.sh` (`KERNEL_TREE` path) /// - `openshell-vm-sandbox-init.sh` `setup_gpu()` expected version const GUEST_KERNEL_VERSION: &str = "6.12.76"; @@ -271,13 +271,8 @@ pub fn inject_gpu_modules(rootfs: &Path, state_dir: &Path) -> Result<(), String> for ko in &ko_files { let dest = modules_dst.join(ko.file_name().unwrap()); - let bytes_copied = fs::copy(ko, &dest).map_err(|e| { - format!( - "copy {} -> {}: {e}", - ko.display(), - dest.display() - ) - })?; + let bytes_copied = fs::copy(ko, &dest) + .map_err(|e| format!("copy {} -> {}: {e}", ko.display(), dest.display()))?; tracing::info!( module = %ko.file_name().unwrap().to_string_lossy(), size_bytes = bytes_copied, @@ -323,7 +318,7 @@ fn warn_missing_gpu_userspace(rootfs: &Path) { /// 1. 
`OPENSHELL_GPU_MODULES_DIR` env var (explicit override)
/// 2. `<state_dir>/gpu-modules/` (operator pre-provisioned)
/// 3. `<repo>/target/libkrun-build/nvidia-modules/` (build tree,
-/// discovered relative to the driver executable)
+///    discovered relative to the driver executable)
/// 4. Host `/lib/modules/<kernel-version>/kernel/drivers/nvidia/`
 fn resolve_gpu_modules_dir(state_dir: &Path) -> Result<PathBuf, String> {
     if let Ok(dir) = std::env::var("OPENSHELL_GPU_MODULES_DIR") {
@@ -379,9 +374,11 @@ fn resolve_gpu_modules_dir(state_dir: &Path) -> Result<PathBuf, String> {
/// `OPENSHELL_GPU_MODULES_DIR` or pre-provision `<state_dir>/gpu-modules/`.
 fn discover_build_tree_modules() -> Option<PathBuf> {
     #[cfg(unix)]
-    if unsafe { libc::getuid() } == 0 {
-        tracing::debug!("build-tree GPU module discovery running as root; \
-            prefer OPENSHELL_GPU_MODULES_DIR in production");
+    if nix::unistd::Uid::effective().is_root() {
+        tracing::debug!(
+            "build-tree GPU module discovery running as root; \
+             prefer OPENSHELL_GPU_MODULES_DIR in production"
+        );
     }
     let exe = std::env::current_exe().ok()?;
     // exe is typically target/{debug,release}/openshell-driver-vm
@@ -398,7 +395,9 @@
     // Also try CWD-relative (for `cargo run` or `mise run` from project root).
let cwd_candidate = PathBuf::from("target/libkrun-build/nvidia-modules"); if dir_has_ko_files(&cwd_candidate) { - let abs = cwd_candidate.canonicalize().unwrap_or(cwd_candidate.clone()); + let abs = cwd_candidate + .canonicalize() + .unwrap_or_else(|_| cwd_candidate.clone()); tracing::info!( path = %abs.display(), "auto-discovered GPU modules relative to CWD" @@ -422,10 +421,13 @@ fn dir_has_ko_files(dir: &Path) -> bool { let path = entry.path(); match path.extension().and_then(|e| e.to_str()) { Some("ko") => has_uncompressed = true, - Some("zst" | "xz") => { - if path.file_stem().and_then(|s| std::path::Path::new(s).extension()).is_some_and(|ext| ext == "ko") { - has_compressed = true; - } + Some("zst" | "xz") + if path + .file_stem() + .and_then(|s| Path::new(s).extension()) + .is_some_and(|ext| ext == "ko") => + { + has_compressed = true; } _ => {} } @@ -458,18 +460,16 @@ fn inject_gpu_firmware(rootfs: &Path, modules_dir: &Path) { } // Try version-matched firmware next to the modules directory. - let fw_parent = modules_dir - .parent() - .map(|p| p.join("nvidia-firmware")); - - if let Some(ref fw_dir) = fw_parent { - if fw_dir.is_dir() { - if let Err(e) = copy_dir_contents(fw_dir, &fw_dst) { - tracing::warn!(error = %e, "failed to copy version-matched firmware"); - } else { - tracing::info!(src = %fw_dir.display(), "injected GPU firmware (version-matched)"); - return; - } + let fw_parent = modules_dir.parent().map(|p| p.join("nvidia-firmware")); + + if let Some(ref fw_dir) = fw_parent + && fw_dir.is_dir() + { + if let Err(e) = copy_dir_contents(fw_dir, &fw_dst) { + tracing::warn!(error = %e, "failed to copy version-matched firmware"); + } else { + tracing::info!(src = %fw_dir.display(), "injected GPU firmware (version-matched)"); + return; } } @@ -489,7 +489,9 @@ fn inject_gpu_firmware(rootfs: &Path, modules_dir: &Path) { tracing::warn!( "no NVIDIA GSP firmware found; GPU guests may fail to initialize. 
\ Place firmware in {:?} or host /lib/firmware/nvidia/", - fw_parent.as_deref().unwrap_or(Path::new("(unknown)")) + fw_parent + .as_deref() + .unwrap_or_else(|| Path::new("(unknown)")) ); } @@ -517,13 +519,13 @@ fn rootfs_has_firmware_bins(fw_dir: &Path) -> bool { /// images install `kmod` but lack the convenience symlinks in `/usr/sbin`. fn ensure_kmod_symlinks(rootfs: &Path) { let kmod_candidates = ["bin/kmod", "usr/bin/kmod", "sbin/kmod", "usr/sbin/kmod"]; - let kmod_exists = kmod_candidates - .iter() - .any(|p| rootfs.join(p).exists()); + let kmod_exists = kmod_candidates.iter().any(|p| rootfs.join(p).exists()); if !kmod_exists { - tracing::warn!("kmod not found in rootfs; modprobe will fail. \ - Ensure the sandbox image installs the 'kmod' package."); + tracing::warn!( + "kmod not found in rootfs; modprobe will fail. \ + Ensure the sandbox image installs the 'kmod' package." + ); return; } @@ -845,13 +847,16 @@ mod tests { fs::create_dir_all(&modules_dir).expect("create modules dir"); fs::create_dir_all(&rootfs).expect("create rootfs dir"); - fs::write(modules_dir.join("nvidia.ko"), b"\x7fELF-fake-module-1").expect("write nvidia.ko"); + fs::write(modules_dir.join("nvidia.ko"), b"\x7fELF-fake-module-1") + .expect("write nvidia.ko"); fs::write(modules_dir.join("nvidia-uvm.ko"), b"\x7fELF-fake-module-2") .expect("write nvidia-uvm.ko"); - unsafe { std::env::set_var("OPENSHELL_GPU_MODULES_DIR", &modules_dir) }; - let result = inject_gpu_modules(&rootfs, Path::new("/dummy/state")); - unsafe { std::env::remove_var("OPENSHELL_GPU_MODULES_DIR") }; + let result = temp_env::with_var( + "OPENSHELL_GPU_MODULES_DIR", + Some(modules_dir.as_os_str()), + || inject_gpu_modules(&rootfs, Path::new("/dummy/state")), + ); result.expect("inject_gpu_modules should succeed"); @@ -870,9 +875,11 @@ mod tests { fs::create_dir_all(&modules_dir).expect("create modules dir"); fs::write(modules_dir.join("readme.txt"), b"not a kernel module").expect("write txt"); - unsafe { 
std::env::set_var("OPENSHELL_GPU_MODULES_DIR", &modules_dir) }; - let result = inject_gpu_modules(Path::new("/dummy/rootfs"), Path::new("/dummy/state")); - unsafe { std::env::remove_var("OPENSHELL_GPU_MODULES_DIR") }; + let result = temp_env::with_var( + "OPENSHELL_GPU_MODULES_DIR", + Some(modules_dir.as_os_str()), + || inject_gpu_modules(Path::new("/dummy/rootfs"), Path::new("/dummy/state")), + ); let err = result.expect_err("should fail with no .ko files"); assert!( @@ -888,9 +895,11 @@ mod tests { let dir = unique_temp_dir(); let missing = dir.join("does-not-exist"); - unsafe { std::env::set_var("OPENSHELL_GPU_MODULES_DIR", &missing) }; - let result = inject_gpu_modules(Path::new("/dummy/rootfs"), Path::new("/dummy/state")); - unsafe { std::env::remove_var("OPENSHELL_GPU_MODULES_DIR") }; + let result = temp_env::with_var( + "OPENSHELL_GPU_MODULES_DIR", + Some(missing.as_os_str()), + || inject_gpu_modules(Path::new("/dummy/rootfs"), Path::new("/dummy/state")), + ); let err = result.expect_err("should fail with missing directory"); assert!( @@ -916,8 +925,7 @@ mod tests { let content = fs::read(fw_dir.join("gsp.bin")).expect("read gsp.bin after injection"); assert_eq!( - content, - b"original-firmware-content", + content, b"original-firmware-content", "firmware should not be overwritten when rootfs already has .bin files" ); diff --git a/sandboxes/nvidia-gpu/README.md b/sandboxes/nvidia-gpu/README.md index 7826b59cb..31eb818fc 100644 --- a/sandboxes/nvidia-gpu/README.md +++ b/sandboxes/nvidia-gpu/README.md @@ -77,6 +77,7 @@ docker build \ ### Changing the NVIDIA driver version Update all three locations: + 1. `sandboxes/nvidia-gpu/versions.env` 2. `sandboxes/nvidia-gpu/Dockerfile` ARG `NVIDIA_DRIVER_VERSION` 3. 
Rebuild kernel modules: `NVIDIA_OPEN_VERSION= mise run vm:nvidia-modules` From 1eb4cb37bf5575c4a561e3b1c94dedc5d8311044 Mon Sep 17 00:00:00 2001 From: Vincent Caux-Brisebois Date: Mon, 4 May 2026 20:34:04 +0000 Subject: [PATCH 3/3] address GPU container PR review feedback Signed-off-by: Vincent Caux-Brisebois --- crates/openshell-cli/src/run.rs | 34 ++++++++++- crates/openshell-driver-vm/README.md | 5 ++ crates/openshell-driver-vm/build.rs | 56 +++++++++++++++++ .../scripts/openshell-vm-sandbox-init.sh | 6 +- crates/openshell-driver-vm/src/driver.rs | 4 ++ crates/openshell-driver-vm/src/rootfs.rs | 61 ++++++++++++++++--- crates/openshell-driver-vm/src/runtime.rs | 5 ++ crates/openshell-vm/pins.env | 3 + sandboxes/nvidia-gpu/Dockerfile | 49 ++++++++------- sandboxes/nvidia-gpu/README.md | 42 ++++++------- sandboxes/nvidia-gpu/build.sh | 25 ++++++++ sandboxes/nvidia-gpu/versions.env | 11 ++-- tasks/scripts/vm/build-nvidia-modules.sh | 31 +++++----- tasks/scripts/vm/build-rootfs-tarball.sh | 4 +- 14 files changed, 257 insertions(+), 79 deletions(-) create mode 100755 sandboxes/nvidia-gpu/build.sh diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 2ad634cf2..130e7f1c4 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -2901,6 +2901,30 @@ fn dockerfile_sources_supported_for_gateway(metadata: Option<&GatewayMetadata>) !metadata.is_some_and(|metadata| metadata.is_remote) } +/// Load key=value pairs from a `versions.env` file in the given directory. +/// Returns an empty map if the file doesn't exist or can't be read. 
+fn load_versions_env(context: &Path) -> HashMap { + let env_file = context.join("versions.env"); + let Ok(contents) = std::fs::read_to_string(&env_file) else { + return HashMap::new(); + }; + contents + .lines() + .filter_map(|line| { + let line = line.trim(); + if line.is_empty() || line.starts_with('#') { + return None; + } + let (key, value) = line.split_once('=')?; + let key = key.trim(); + if key.is_empty() { + return None; + } + Some((key.to_string(), value.trim().to_string())) + }) + .collect() +} + /// Build a Dockerfile and make the resulting image available to the gateway. /// /// For local Kubernetes gateways running in Docker, this imports the built image @@ -2935,6 +2959,14 @@ async fn build_from_dockerfile( eprintln!(" {} {}", "Gateway:".dimmed(), gateway_name); eprintln!(); + let build_args = load_versions_env(context); + if !build_args.is_empty() { + for (k, v) in &build_args { + eprintln!(" Build arg (from versions.env): {k}={v}"); + } + eprintln!(); + } + let mut on_log = |msg: String| { eprintln!(" {msg}"); }; @@ -2943,7 +2975,7 @@ async fn build_from_dockerfile( dockerfile, &tag, context, - &HashMap::new(), + &build_args, &mut on_log, ) .await?; diff --git a/crates/openshell-driver-vm/README.md b/crates/openshell-driver-vm/README.md index dbb90bb67..73d8e6eba 100644 --- a/crates/openshell-driver-vm/README.md +++ b/crates/openshell-driver-vm/README.md @@ -47,6 +47,11 @@ By default `mise run gateway:vm`: For GPU passthrough (VFIO), pass `-- --gpu` and run with root privileges: +> **Note:** GPU passthrough requires an **x86_64 host and guest**. The QEMU +> backend uses `qemu-system-x86_64`, and the NVIDIA driver installer / +> kernel module build scripts target x86_64 exclusively. ARM/aarch64 GPU +> passthrough is not yet supported. 
+ ```shell sudo -E env "PATH=$PATH" mise run gateway:vm -- --gpu ``` diff --git a/crates/openshell-driver-vm/build.rs b/crates/openshell-driver-vm/build.rs index ea4c4d2e0..953ba5354 100644 --- a/crates/openshell-driver-vm/build.rs +++ b/crates/openshell-driver-vm/build.rs @@ -11,6 +11,8 @@ use std::path::{Path, PathBuf}; use std::{env, fs}; fn main() { + emit_guest_kernel_version(); + println!("cargo:rerun-if-env-changed=OPENSHELL_VM_RUNTIME_COMPRESSED_DIR"); if let Ok(dir) = env::var("OPENSHELL_VM_RUNTIME_COMPRESSED_DIR") { @@ -143,3 +145,57 @@ fn generate_stub_resources(out_dir: &Path, names: &[&str]) { } } } + +/// Parse `GUEST_KERNEL_VERSION` from `pins.env` and emit it as a compile-time +/// environment variable so `rootfs.rs` can use `env!("GUEST_KERNEL_VERSION")`. +fn emit_guest_kernel_version() { + let manifest_dir = + PathBuf::from(env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR not set")); + let pins_path = manifest_dir.join("../../crates/openshell-vm/pins.env"); + + println!("cargo:rerun-if-changed={}", pins_path.display()); + println!("cargo:rerun-if-env-changed=GUEST_KERNEL_VERSION"); + + let version = if let Ok(v) = env::var("GUEST_KERNEL_VERSION") { + v + } else if let Ok(contents) = fs::read_to_string(&pins_path) { + parse_guest_kernel_version(&contents).unwrap_or_else(|| { + panic!( + "GUEST_KERNEL_VERSION not found in {}", + pins_path.display() + ) + }) + } else { + panic!( + "Cannot read {} and GUEST_KERNEL_VERSION env var not set", + pins_path.display() + ); + }; + + println!("cargo:rustc-env=GUEST_KERNEL_VERSION={version}"); +} + +/// Extract the default value from a `GUEST_KERNEL_VERSION="${GUEST_KERNEL_VERSION:-}"` +/// line in pins.env. 
+fn parse_guest_kernel_version(contents: &str) -> Option<String> {
+    for line in contents.lines() {
+        let trimmed = line.trim();
+        if trimmed.starts_with('#') || !trimmed.starts_with("GUEST_KERNEL_VERSION=") {
+            continue;
+        }
+        // Pattern: GUEST_KERNEL_VERSION="${GUEST_KERNEL_VERSION:-6.12.76}"
+        if let Some(start) = trimmed.find(":-") {
+            let after = &trimmed[start + 2..];
+            if let Some(end) = after.find('}') {
+                let value = after[..end].trim_end_matches('"');
+                return Some(value.to_string());
+            }
+        }
+        // Fallback: simple assignment like GUEST_KERNEL_VERSION="6.12.76"
+        if let Some((_key, value)) = trimmed.split_once('=') {
+            let v = value.trim_matches('"').trim_matches('\'');
+            return Some(v.to_string());
+        }
+    }
+    None
+}
diff --git a/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh
index e590be195..0ef94438f 100644
--- a/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh
+++ b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh
@@ -236,10 +236,12 @@ setup_gpu() {
 
   # Kernel modules are built for a specific guest kernel version.
   # If the running kernel doesn't match, depmod/modprobe will silently fail.
- local expected_kver="6.12.76" + local expected_kver="${GUEST_KERNEL_VERSION:-}" local actual_kver actual_kver="$(uname -r)" - if [ "${actual_kver}" != "${expected_kver}" ]; then + if [ -z "${expected_kver}" ]; then + ts "GUEST_KERNEL_VERSION not set; skipping kernel version check" + elif [ "${actual_kver}" != "${expected_kver}" ]; then ts "WARNING: kernel version mismatch: expected ${expected_kver}, got ${actual_kver}" ts " GPU modules are installed under lib/modules/${expected_kver}/" ts " modprobe may fail to find them" diff --git a/crates/openshell-driver-vm/src/driver.rs b/crates/openshell-driver-vm/src/driver.rs index 84338c85c..bf8d36401 100644 --- a/crates/openshell-driver-vm/src/driver.rs +++ b/crates/openshell-driver-vm/src/driver.rs @@ -2220,6 +2220,10 @@ fn build_guest_environment( "OPENSHELL_SSH_HANDSHAKE_SECRET".to_string(), config.ssh_handshake_secret.clone(), ), + ( + "GUEST_KERNEL_VERSION".to_string(), + env!("GUEST_KERNEL_VERSION").to_string(), + ), ]); if config.requires_tls_materials() { environment.extend(HashMap::from([ diff --git a/crates/openshell-driver-vm/src/rootfs.rs b/crates/openshell-driver-vm/src/rootfs.rs index b6deb2828..1c7133075 100644 --- a/crates/openshell-driver-vm/src/rootfs.rs +++ b/crates/openshell-driver-vm/src/rootfs.rs @@ -35,7 +35,7 @@ pub fn prepare_sandbox_rootfs_from_image_root( /// ensures the guest always runs the init script and supervisor that match /// the running driver binary. 
pub fn refresh_runtime_artifacts(rootfs: &Path) -> Result<(), String> { - let init_path = rootfs.join("srv/openshell-vm-sandbox-init.sh"); + let init_path = rootfs.join(SANDBOX_GUEST_INIT_PATH.trim_start_matches('/')); if let Some(parent) = init_path.parent() { fs::create_dir_all(parent).map_err(|e| format!("create {}: {e}", parent.display()))?; } @@ -176,7 +176,7 @@ fn prepare_sandbox_rootfs(rootfs: &Path) -> Result<(), String> { remove_rootfs_path(rootfs, relative)?; } - let init_path = rootfs.join("srv/openshell-vm-sandbox-init.sh"); + let init_path = rootfs.join(SANDBOX_GUEST_INIT_PATH.trim_start_matches('/')); if let Some(parent) = init_path.parent() { fs::create_dir_all(parent).map_err(|e| format!("create {}: {e}", parent.display()))?; } @@ -206,7 +206,7 @@ fn prepare_sandbox_rootfs(rootfs: &Path) -> Result<(), String> { pub fn validate_sandbox_rootfs(rootfs: &Path) -> Result<(), String> { require_rootfs_path(rootfs, SANDBOX_GUEST_INIT_PATH)?; - require_rootfs_path(rootfs, "/opt/openshell/bin/openshell-sandbox")?; + require_rootfs_path(rootfs, SANDBOX_SUPERVISOR_PATH)?; require_any_rootfs_path(rootfs, &["/bin/bash"])?; require_any_rootfs_path(rootfs, &["/bin/mount", "/usr/bin/mount"])?; require_any_rootfs_path( @@ -220,10 +220,9 @@ pub fn validate_sandbox_rootfs(rootfs: &Path) -> Result<(), String> { /// Kernel version of the libkrunfw guest. Modules must be compiled against /// this exact version; a mismatch causes `modprobe` failures at boot. /// -/// Keep in sync with: -/// - `tasks/scripts/vm/build-nvidia-modules.sh` (`KERNEL_TREE` path) -/// - `openshell-vm-sandbox-init.sh` `setup_gpu()` expected version -const GUEST_KERNEL_VERSION: &str = "6.12.76"; +/// Single source of truth: `crates/openshell-vm/pins.env`. The build script +/// parses the pin and emits it as a compile-time env var. +const GUEST_KERNEL_VERSION: &str = env!("GUEST_KERNEL_VERSION"); /// Inject NVIDIA kernel modules, firmware, and `kmod` tooling into a prepared /// sandbox rootfs. 
Called by the driver when a sandbox requests GPU support. @@ -282,12 +281,57 @@ pub fn inject_gpu_modules(rootfs: &Path, state_dir: &Path) -> Result<(), String> } inject_gpu_firmware(rootfs, &modules_dir); + check_gpu_version_match(rootfs, &modules_dir); ensure_kmod_symlinks(rootfs); warn_missing_gpu_userspace(rootfs); Ok(()) } +/// Compare the driver version baked into the sandbox image against the +/// firmware version directory. A mismatch means the image userspace and the +/// injected kernel modules / firmware were built from different driver +/// releases, which usually causes GPU initialisation failures. +/// +/// Emits `tracing::warn!` (not an error) because operators may intentionally +/// run minor-revision mismatches during rolling upgrades. +fn check_gpu_version_match(rootfs: &Path, modules_dir: &Path) { + let stamp = rootfs.join("etc/openshell-gpu-driver-version"); + let Ok(image_ver) = fs::read_to_string(&stamp) else { + return; + }; + let image_ver = image_ver.trim(); + if image_ver.is_empty() { + return; + } + + let fw_dir = modules_dir.parent().map(|p| p.join("nvidia-firmware")); + let Some(ref fw) = fw_dir else { return }; + let Ok(entries) = fs::read_dir(fw) else { + return; + }; + + let version_dirs: Vec = entries + .flatten() + .filter(|e| e.path().is_dir()) + .map(|e| e.file_name().to_string_lossy().into_owned()) + .collect(); + + if version_dirs.is_empty() { + return; + } + + if !version_dirs.iter().any(|v| v == image_ver) { + tracing::warn!( + image_version = image_ver, + firmware_versions = ?version_dirs, + "GPU driver version mismatch: image userspace ({}) does not match \ + any firmware version ({:?}). Sandbox GPU may fail to initialise.", + image_ver, version_dirs, + ); + } +} + /// Check whether the rootfs contains essential GPU userspace binaries. 
/// Emits actionable warnings when the sandbox image lacks nvidia-smi /// or CUDA libraries — common when `--gpu` is used with a non-GPU base @@ -295,6 +339,7 @@ pub fn inject_gpu_modules(rootfs: &Path, state_dir: &Path) -> Result<(), String> fn warn_missing_gpu_userspace(rootfs: &Path) { let nvidia_smi_candidates = [ "usr/bin/nvidia-smi", + "usr/sbin/nvidia-smi", "usr/local/bin/nvidia-smi", "bin/nvidia-smi", ]; @@ -860,7 +905,7 @@ mod tests { result.expect("inject_gpu_modules should succeed"); - let dest = rootfs.join("lib/modules/6.12.76/kernel/drivers/nvidia"); + let dest = rootfs.join(format!("lib/modules/{GUEST_KERNEL_VERSION}/kernel/drivers/nvidia")); assert!(dest.join("nvidia.ko").is_file()); assert!(dest.join("nvidia-uvm.ko").is_file()); diff --git a/crates/openshell-driver-vm/src/runtime.rs b/crates/openshell-driver-vm/src/runtime.rs index 6c4d49db5..1acfc7c60 100644 --- a/crates/openshell-driver-vm/src/runtime.rs +++ b/crates/openshell-driver-vm/src/runtime.rs @@ -315,6 +315,11 @@ fn qemu_guest_env_vars(config: &VmLaunchConfig, dns_server: Option) -> V env_vars.push("GPU_ENABLED=true".to_string()); } + env_vars.push(format!( + "GUEST_KERNEL_VERSION={}", + env!("GUEST_KERNEL_VERSION") + )); + env_vars } diff --git a/crates/openshell-vm/pins.env b/crates/openshell-vm/pins.env index b3d802292..eafe5baef 100644 --- a/crates/openshell-vm/pins.env +++ b/crates/openshell-vm/pins.env @@ -42,3 +42,6 @@ GVPROXY_VERSION="${GVPROXY_VERSION:-v0.8.8}" # Repo: https://github.com/containers/libkrunfw # Pinned: 2026-03-27 (main branch HEAD at time of pinning) LIBKRUNFW_REF="${LIBKRUNFW_REF:-463f717bbdd916e1352a025b6fb2456e882b0b39}" + +# ── Guest kernel (determined by LIBKRUNFW_REF) ──────────────────────── +GUEST_KERNEL_VERSION="${GUEST_KERNEL_VERSION:-6.12.76}" diff --git a/sandboxes/nvidia-gpu/Dockerfile b/sandboxes/nvidia-gpu/Dockerfile index 372d58b58..23e7e13fe 100644 --- a/sandboxes/nvidia-gpu/Dockerfile +++ b/sandboxes/nvidia-gpu/Dockerfile @@ -1,6 +1,10 @@ # 
SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +# Architecture: GPU sandbox images are currently x86_64-only (both host and +# guest). The .run installer and lib paths target Linux-x86_64 exclusively. +# ARM/aarch64 GPU passthrough is not yet supported. +# # GPU-enabled sandbox image for OpenShell VM driver. # # Provides userspace GPU tooling (nvidia-smi, NVML, CUDA driver libs, kmod) @@ -9,23 +13,22 @@ # injected at rootfs preparation time by the VM driver). # # Usage: +# ./build.sh # recommended — sources versions.env +# ./build.sh -t my-registry/gpu:v1 # custom tag # openshell sandbox create --gpu --from ./sandboxes/nvidia-gpu/Dockerfile -# openshell sandbox create --gpu --from nvidia-gpu # once published # # Build-time args: -# CUDA_VERSION - CUDA toolkit version (default: 12.8.1) -# UBUNTU_VERSION - Ubuntu release (default: 22.04) -# NVIDIA_DRIVER_VERSION - Must match the kernel modules built by -# `mise run vm:nvidia-modules` (default: 580.159.03) - -ARG CUDA_VERSION=12.8.1 -ARG UBUNTU_VERSION=22.04 +# NVIDIA_DRIVER_VERSION - **Required** (no default). Must match the kernel +# modules built by `mise run vm:nvidia-modules`. +# Use ./build.sh which sources versions.env, or pass +# --build-arg NVIDIA_DRIVER_VERSION= manually. -FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${UBUNTU_VERSION} +FROM ubuntu:latest -# Must match NVIDIA_DRIVER_VERSION in sandboxes/nvidia-gpu/versions.env -# and NVIDIA_OPEN_VERSION in tasks/scripts/vm/build-nvidia-modules.sh -ARG NVIDIA_DRIVER_VERSION=580.159.03 +# Must match NVIDIA_DRIVER_VERSION in sandboxes/nvidia-gpu/versions.env. +# The module build script also uses this version for the open-gpu-kernel-modules tag. +# No default — use ./build.sh or pass --build-arg explicitly. 
+ARG NVIDIA_DRIVER_VERSION
 
 # ── System packages required by the sandbox init script ──────────────
 RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -42,14 +45,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 RUN mkdir -p /usr/share/udhcpc && ln -sf /bin/busybox /sbin/udhcpc
 
 # ── NVIDIA driver userspace ──────────────────────────────────────────
-# The nvidia/cuda base image does NOT include the driver (nvidia-smi,
-# libcuda.so, libnvidia-ml.so). It relies on the NVIDIA Container
-# Runtime to mount them from the host. In a VM there is no container
-# runtime, so we install the driver userspace via the .run installer
-# with --no-kernel-module (kernel modules are injected separately).
+# We install the full driver userspace via the official .run installer
+# with --no-kernel-module (kernel modules are injected separately by
+# the VM driver at rootfs preparation time).
 # TODO(gpu): Pin SHA-256 checksum for reproducible builds. Compute with:
-#   curl -fsSL <url> | sha256sum
+RUN test -n "${NVIDIA_DRIVER_VERSION}" || \
+    { echo "ERROR: NVIDIA_DRIVER_VERSION not set. Use ./build.sh or pass --build-arg NVIDIA_DRIVER_VERSION=<version>"; exit 1; } \
+    && curl -fsSL \
     "https://us.download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}.run" \
     -o /tmp/nvidia.run \
     && chmod +x /tmp/nvidia.run \
@@ -63,11 +66,11 @@ RUN curl -fsSL \
     --no-distro-scripts \
     && rm -f /tmp/nvidia.run
 
-# Ensure library paths are indexed for dlopen.
-RUN mkdir -p /etc/ld.so.conf.d \
-    && echo "/usr/local/cuda/lib64" > /etc/ld.so.conf.d/cuda.conf \
-    && echo "/usr/lib/x86_64-linux-gnu" >> /etc/ld.so.conf.d/cuda.conf \
-    && ldconfig 2>/dev/null || true
+RUN echo "${NVIDIA_DRIVER_VERSION}" > /etc/openshell-gpu-driver-version
+
+# The .run installer places libs in standard paths (/usr/lib/x86_64-linux-gnu).
+# Run ldconfig to update the linker cache for dlopen resolution.
+RUN ldconfig 2>/dev/null || true # ── Kernel modules ─────────────────────────────────────────────────── # NVIDIA kernel modules (.ko) must match the guest VM kernel (libkrunfw). diff --git a/sandboxes/nvidia-gpu/README.md b/sandboxes/nvidia-gpu/README.md index 31eb818fc..fcbfd9c96 100644 --- a/sandboxes/nvidia-gpu/README.md +++ b/sandboxes/nvidia-gpu/README.md @@ -3,6 +3,11 @@ # GPU Sandbox Image +> **Architecture:** GPU sandbox images are currently **x86_64-only** (both +> host and guest). The Dockerfile downloads the `NVIDIA-Linux-x86_64` +> installer, and the VM driver uses `qemu-system-x86_64` for GPU +> passthrough. ARM/aarch64 GPU passthrough is not yet supported. + GPU-enabled sandbox image for the OpenShell VM driver. Provides NVIDIA userspace tooling (nvidia-smi, NVML, CUDA driver libraries) on top of a minimal Ubuntu base. Kernel modules are injected separately by the VM @@ -16,7 +21,7 @@ VM driver: | Layer | Source | Contents | |-------|--------|----------| | **Userspace** | This Dockerfile | nvidia-smi, libcuda.so, libnvidia-ml.so, kmod, iproute2 | -| **Kernel modules** | VM driver injection | nvidia.ko, nvidia_uvm.ko, nvidia_modeset.ko (built for guest kernel 6.12.76) | +| **Kernel modules** | VM driver injection | nvidia.ko, nvidia_uvm.ko, nvidia_modeset.ko (built for the pinned guest kernel version in `crates/openshell-vm/pins.env`) | | **GSP firmware** | `.run` installer in image OR host fallback | gsp_ga10x.bin, gsp_tu10x.bin | The kernel modules must be compiled against the exact guest kernel version @@ -51,36 +56,25 @@ openshell sandbox create --gpu --from nvidia-gpu:latest ## Version Coupling -The NVIDIA driver version must match across three components: +All components use a single version variable — `NVIDIA_DRIVER_VERSION` — +pinned in `sandboxes/nvidia-gpu/versions.env`: -| Component | Variable | Default | -|-----------|----------|---------| -| Dockerfile (userspace) | `NVIDIA_DRIVER_VERSION` | `580.159.03` | -| Module build script | 
`NVIDIA_OPEN_VERSION` | `580.159.03` | -| Shared reference | `sandboxes/nvidia-gpu/versions.env` | `580.159.03` | +| Component | Where `NVIDIA_DRIVER_VERSION` is consumed | +|-----------|-------------------------------------------| +| Dockerfile (userspace) | `ARG NVIDIA_DRIVER_VERSION` — `.run` installer URL | +| Module build script | `build-nvidia-modules.sh` — open-gpu-kernel-modules tag | +| Shared pin | `versions.env` — single source of truth | -A mismatch causes `modprobe` "version magic" errors or nvidia-smi ABI -failures at sandbox boot time. +A mismatch between kernel modules and userspace causes `modprobe` +"version magic" errors or nvidia-smi ABI failures at sandbox boot time. ## Customization -### Changing the CUDA version - -```shell -docker build \ - --build-arg CUDA_VERSION=12.6.0 \ - --build-arg UBUNTU_VERSION=22.04 \ - -t my-gpu-sandbox:latest \ - ./sandboxes/nvidia-gpu/ -``` - ### Changing the NVIDIA driver version -Update all three locations: - -1. `sandboxes/nvidia-gpu/versions.env` -2. `sandboxes/nvidia-gpu/Dockerfile` ARG `NVIDIA_DRIVER_VERSION` -3. Rebuild kernel modules: `NVIDIA_OPEN_VERSION= mise run vm:nvidia-modules` +1. Update `NVIDIA_DRIVER_VERSION` in `sandboxes/nvidia-gpu/versions.env` +2. Rebuild kernel modules: `mise run vm:nvidia-modules` +3. Rebuild the sandbox image: `./sandboxes/nvidia-gpu/build.sh` ### Adding packages diff --git a/sandboxes/nvidia-gpu/build.sh b/sandboxes/nvidia-gpu/build.sh new file mode 100755 index 000000000..b34b188a8 --- /dev/null +++ b/sandboxes/nvidia-gpu/build.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Build the GPU sandbox image with the correct driver version. +# Sources versions.env so the version is never typed manually. 
+# +# Usage: +# ./build.sh # default: tags as openshell-gpu-sandbox +# ./build.sh -t my-registry/gpu:v1 # custom tag + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=versions.env +source "${SCRIPT_DIR}/versions.env" + +if [ $# -eq 0 ]; then + set -- -t openshell-gpu-sandbox +fi + +exec docker build \ + --build-arg NVIDIA_DRIVER_VERSION="${NVIDIA_DRIVER_VERSION}" \ + "$@" \ + "${SCRIPT_DIR}" diff --git a/sandboxes/nvidia-gpu/versions.env b/sandboxes/nvidia-gpu/versions.env index d2f086da8..187e9c18e 100644 --- a/sandboxes/nvidia-gpu/versions.env +++ b/sandboxes/nvidia-gpu/versions.env @@ -1,6 +1,9 @@ -# Shared NVIDIA driver/module version for GPU sandbox images. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Shared NVIDIA driver version for GPU sandbox images. # Referenced by: -# - sandboxes/nvidia-gpu/Dockerfile (ARG NVIDIA_DRIVER_VERSION) -# - tasks/scripts/vm/build-nvidia-modules.sh (NVIDIA_OPEN_VERSION) -# These MUST match for kernel modules and userspace to be compatible. +# - sandboxes/nvidia-gpu/build.sh (passes as --build-arg to Dockerfile) +# - tasks/scripts/vm/build-nvidia-modules.sh (open-gpu-kernel-modules tag) +# Kernel modules and userspace MUST use the same driver release. NVIDIA_DRIVER_VERSION=580.159.03 diff --git a/tasks/scripts/vm/build-nvidia-modules.sh b/tasks/scripts/vm/build-nvidia-modules.sh index 96287fc8c..f2e1702aa 100755 --- a/tasks/scripts/vm/build-nvidia-modules.sh +++ b/tasks/scripts/vm/build-nvidia-modules.sh @@ -25,9 +25,9 @@ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)" BUILD_DIR="${ROOT}/target/libkrun-build" OUTPUT_DIR="${BUILD_DIR}/nvidia-modules" -# Guest kernel version — keep in sync with GUEST_KERNEL_VERSION in -# crates/openshell-driver-vm/src/rootfs.rs and the init script. 
-KERNEL_TREE="${BUILD_DIR}/libkrunfw/linux-6.12.76" +# shellcheck source=../../crates/openshell-vm/pins.env +source "${ROOT}/crates/openshell-vm/pins.env" +KERNEL_TREE="${BUILD_DIR}/libkrunfw/linux-${GUEST_KERNEL_VERSION}" if [ ! -d "${KERNEL_TREE}" ]; then echo "ERROR: Guest kernel tree not found at ${KERNEL_TREE}" >&2 echo " Run: mise run vm:setup" >&2 @@ -57,20 +57,19 @@ if [ -z "${HOST_DRIVER_VERSION}" ]; then HOST_DRIVER_VERSION="$(modinfo -F version /lib/modules/$(uname -r)/updates/dkms/nvidia.ko 2>/dev/null || true)" fi -# Use the open-gpu-kernel-modules release matching the host driver major -# version. The open modules support newer kernels better than the -# proprietary DKMS source shipped in /usr/src/. -# Must match NVIDIA_DRIVER_VERSION in sandboxes/nvidia-gpu/versions.env -# and sandboxes/nvidia-gpu/Dockerfile ARG NVIDIA_DRIVER_VERSION -NVIDIA_OPEN_VERSION="${NVIDIA_OPEN_VERSION:-580.159.03}" -NVIDIA_SRC_DIR="${BUILD_DIR}/open-gpu-kernel-modules-${NVIDIA_OPEN_VERSION}" +# Use the open-gpu-kernel-modules release matching the driver version. +# The open modules support newer kernels better than the proprietary +# DKMS source shipped in /usr/src/. +# shellcheck source=sandboxes/nvidia-gpu/versions.env +source "${ROOT}/sandboxes/nvidia-gpu/versions.env" +NVIDIA_SRC_DIR="${BUILD_DIR}/open-gpu-kernel-modules-${NVIDIA_DRIVER_VERSION}" if [ ! -d "${NVIDIA_SRC_DIR}/kernel-open" ]; then - echo "==> Downloading NVIDIA open kernel modules ${NVIDIA_OPEN_VERSION}" - TARBALL="${BUILD_DIR}/nvidia-open-${NVIDIA_OPEN_VERSION}.tar.gz" + echo "==> Downloading NVIDIA open kernel modules ${NVIDIA_DRIVER_VERSION}" + TARBALL="${BUILD_DIR}/nvidia-open-${NVIDIA_DRIVER_VERSION}.tar.gz" if [ ! 
-f "${TARBALL}" ]; then
        curl -fSL \
-           "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${NVIDIA_OPEN_VERSION}.tar.gz" \
+           "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${NVIDIA_DRIVER_VERSION}.tar.gz" \
            -o "${TARBALL}"
        # TODO(gpu): Add SHA-256 verification for supply chain integrity.
        # echo "<sha256>  ${TARBALL}" | sha256sum -c -
@@ -90,7 +89,7 @@ if [ -f "${NV_PAT}" ] && grep -q '__flush_tlb()' "${NV_PAT}"; then
     sed -i 's/__flush_tlb()/__flush_tlb_all()/g' "${NV_PAT}"
 fi
 
-echo "==> Building NVIDIA ${NVIDIA_OPEN_VERSION} open kernel modules for guest kernel 6.12.76"
+echo "==> Building NVIDIA ${NVIDIA_DRIVER_VERSION} open kernel modules for guest kernel ${GUEST_KERNEL_VERSION}"
 echo "    NVIDIA source: ${NVIDIA_SRC}"
 echo "    Kernel tree:   ${KERNEL_TREE}"
 echo "    Output:        ${OUTPUT_DIR}"
@@ -143,7 +142,7 @@ echo "==> Collecting firmware"
 # GSP firmware is included in the open-gpu-kernel-modules source tree.
 FW_SRC="${NVIDIA_SRC}/src/nvidia/firmware"
-FW_OUTPUT="${BUILD_DIR}/nvidia-firmware/${NVIDIA_OPEN_VERSION}"
+FW_OUTPUT="${BUILD_DIR}/nvidia-firmware/${NVIDIA_DRIVER_VERSION}"
 if [ -d "${FW_SRC}" ] && ls "${FW_SRC}"/*.bin >/dev/null 2>&1; then
     mkdir -p "${FW_OUTPUT}"
     cp "${FW_SRC}"/*.bin "${FW_OUTPUT}/"
@@ -169,7 +168,7 @@ else
 fi
 
 echo ""
-echo "==> Done! ${KO_COUNT} kernel modules built for guest kernel 6.12.76."
+echo "==> Done! ${KO_COUNT} kernel modules built for guest kernel ${GUEST_KERNEL_VERSION}."
echo " The VM driver will auto-discover them at:" echo " ${OUTPUT_DIR}" echo "" diff --git a/tasks/scripts/vm/build-rootfs-tarball.sh b/tasks/scripts/vm/build-rootfs-tarball.sh index 87abca27e..1f34dae03 100755 --- a/tasks/scripts/vm/build-rootfs-tarball.sh +++ b/tasks/scripts/vm/build-rootfs-tarball.sh @@ -33,7 +33,9 @@ ROOTFS_BUILD_DIR="${ROOT}/target/rootfs-build" OUTPUT_DIR="${ROOT}/target/vm-runtime-compressed" OUTPUT="${OUTPUT_DIR}/rootfs.tar.zst" -KERNEL_VERSION="6.12.76" +# shellcheck source=../../crates/openshell-vm/pins.env +source "${ROOT}/crates/openshell-vm/pins.env" +KERNEL_VERSION="${GUEST_KERNEL_VERSION}" NVIDIA_MODULES_DIR="${ROOT}/target/libkrun-build/nvidia-modules" NVIDIA_USERSPACE_DIR="${ROOT}/target/libkrun-build/nvidia-userspace"