diff --git a/architecture/docker-driver.md b/architecture/docker-driver.md index 25d48f740..5d7b20ee3 100644 --- a/architecture/docker-driver.md +++ b/architecture/docker-driver.md @@ -60,7 +60,7 @@ enforced by the supervisor in that nested namespace. | `cap_add` | `SYS_ADMIN`, `NET_ADMIN`, `SYS_PTRACE`, `SYSLOG` | Required for supervisor isolation setup and process inspection | | `security_opt` | `apparmor=unconfined` | Docker's default AppArmor profile blocks mount operations required by network namespace setup | | `restart_policy` | `unless-stopped` | Resume managed sandboxes after Docker or gateway restarts | -| `device_requests` | CDI all-GPU request when `spec.gpu` is true | Enables Docker CDI GPU sandboxes when daemon support is detected | +| `device_requests` | CDI GPU request when `spec.gpu` is present | Uses the shared CDI GPU resolver: empty `device_id` defaults to `nvidia.com/gpu=all`, explicit IDs pass through | ## Gateway Callback diff --git a/architecture/gateway-single-node.md b/architecture/gateway-single-node.md index 01b69b2f5..d99497135 100644 --- a/architecture/gateway-single-node.md +++ b/architecture/gateway-single-node.md @@ -391,7 +391,7 @@ When `openshell sandbox create` cannot connect to a gateway (connection refused, 1. `should_attempt_bootstrap()` in `crates/openshell-cli/src/bootstrap.rs` checks the error type. It returns `true` for connectivity errors and missing default TLS materials, but `false` for TLS handshake/auth errors. 2. If running in a terminal, the user is prompted to confirm. 3. `run_bootstrap()` deploys a gateway named `"openshell"`, sets it as active, and returns fresh `TlsOptions` pointing to the newly-written mTLS certs. -4. When `sandbox create` requests GPU explicitly (`--gpu`) or infers it from an image whose final name component contains `gpu` (such as `nvidia-gpu`), the bootstrap path enables gateway GPU support before retrying sandbox creation, using the same CDI-or-fallback selection as `gateway start --gpu`. 
+4. When `sandbox create` sends a present GPU request, either explicitly (`--gpu`) or from image-name inference on a final component containing `gpu` (such as `nvidia-gpu`), the bootstrap path enables gateway GPU support before retrying sandbox creation, using the same CDI-or-fallback selection as `gateway start --gpu`. ## Container Environment Variables diff --git a/architecture/podman-driver.md b/architecture/podman-driver.md index 155937a77..d4c382307 100644 --- a/architecture/podman-driver.md +++ b/architecture/podman-driver.md @@ -43,7 +43,7 @@ graph TB | Supervisor delivery | hostPath volume (read-only) | Embedded in rootfs tarball | OCI image volume (read-only) | | Network model | Supervisor creates netns inside pod | gvproxy virtio-net (192.168.127.0/24) | Supervisor creates netns inside container | | Credential injection | Plaintext env var + K8s Secret volume (0400) | Rootfs file copy (0600) + env vars | Podman `secret_env` API + env vars | -| GPU support | Yes (nvidia.com/gpu resource) | No | Yes (CDI device) | +| GPU support | Yes (nvidia.com/gpu resource) | Yes (single passthrough GPU) | Yes (CDI device IDs via shared resolver) | | `stop_sandbox` | Unimplemented | Unimplemented | Implemented (graceful stop) | | State storage | Kubernetes API (CRD) | In-memory HashMap + filesystem | Podman daemon (container state) | | Endpoint resolution | Pod IP / cluster DNS | 127.0.0.1 + allocated port | 127.0.0.1 + ephemeral port | diff --git a/architecture/sandbox-custom-containers.md b/architecture/sandbox-custom-containers.md index 7718ac934..e29218a36 100644 --- a/architecture/sandbox-custom-containers.md +++ b/architecture/sandbox-custom-containers.md @@ -27,7 +27,7 @@ The community registry prefix defaults to `ghcr.io/nvidia/openshell-community/sa ### GPU image-name detection -`sandbox create` also infers GPU intent from the final image name. 
The current rule matches when the last image name component contains `gpu` (for example `ghcr.io/nvidia/openshell-community/sandboxes/nvidia-gpu:latest` or `registry.example.com/team/my-gpu-image:latest`). When that rule matches, the sandbox request is treated the same as passing `--gpu`. +`sandbox create` also infers GPU intent from the final image name. The current rule matches when the last image name component contains `gpu` (for example `ghcr.io/nvidia/openshell-community/sandboxes/nvidia-gpu:latest` or `registry.example.com/team/my-gpu-image:latest`). When that rule matches, the sandbox request includes a present GPU request with no device IDs, the same shape produced by passing `--gpu`. ### Dockerfile build flow diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 2ad634cf2..276c43f8a 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -27,11 +27,12 @@ use openshell_core::proto::{ CreateProviderRequest, CreateSandboxRequest, DeleteProviderRequest, DeleteSandboxRequest, ExecSandboxRequest, GetClusterInferenceRequest, GetDraftHistoryRequest, GetDraftPolicyRequest, GetGatewayConfigRequest, GetProviderRequest, GetSandboxConfigRequest, GetSandboxLogsRequest, - GetSandboxPolicyStatusRequest, GetSandboxRequest, HealthRequest, ListProvidersRequest, - ListSandboxPoliciesRequest, ListSandboxesRequest, PolicySource, PolicyStatus, Provider, - RejectDraftChunkRequest, Sandbox, SandboxPhase, SandboxPolicy, SandboxSpec, SandboxTemplate, - SetClusterInferenceRequest, SettingScope, SettingValue, UpdateConfigRequest, - UpdateProviderRequest, WatchSandboxRequest, exec_sandbox_event, setting_value, + GetSandboxPolicyStatusRequest, GetSandboxRequest, GpuRequestSpec, HealthRequest, + ListProvidersRequest, ListSandboxPoliciesRequest, ListSandboxesRequest, PolicySource, + PolicyStatus, Provider, RejectDraftChunkRequest, Sandbox, SandboxPhase, SandboxPolicy, + SandboxSpec, SandboxTemplate, SetClusterInferenceRequest, 
SettingScope, SettingValue, + UpdateConfigRequest, UpdateProviderRequest, WatchSandboxRequest, exec_sandbox_event, + setting_value, }; use openshell_core::settings::{self, SettingValueKind}; use openshell_core::{ObjectId, ObjectName}; @@ -2324,8 +2325,7 @@ pub async fn sandbox_create( let request = CreateSandboxRequest { spec: Some(SandboxSpec { - gpu: requested_gpu, - gpu_device: gpu_device.unwrap_or_default().to_string(), + gpu: gpu_request_from_cli(requested_gpu, gpu_device), policy, providers: configured_providers, template, @@ -2757,6 +2757,15 @@ pub async fn sandbox_create( } } +fn gpu_request_from_cli(requested_gpu: bool, gpu_device: Option<&str>) -> Option<GpuRequestSpec> { + requested_gpu.then(|| GpuRequestSpec { + device_id: gpu_device + .filter(|device_id| !device_id.is_empty()) + .map(|device_id| vec![device_id.to_string()]) + .unwrap_or_default(), + }) +} + /// Resolved source for the `--from` flag on `sandbox create`. #[derive(Debug)] enum ResolvedSource { @@ -5794,8 +5803,8 @@ mod tests { use super::{ GatewayControlTarget, TlsOptions, dockerfile_sources_supported_for_gateway, format_gateway_select_header, format_gateway_select_items, gateway_add, gateway_auth_label, - gateway_select_with, gateway_type_label, git_sync_files, http_health_check, - image_requests_gpu, inferred_provider_type, parse_cli_setting_value, + gateway_select_with, gateway_type_label, git_sync_files, gpu_request_from_cli, + http_health_check, image_requests_gpu, inferred_provider_type, parse_cli_setting_value, parse_credential_pairs, plaintext_gateway_is_remote, provisioning_timeout_message, ready_false_condition_message, resolve_from, resolve_gateway_control_target_from, sandbox_should_persist, shell_escape, source_requests_gpu, validate_gateway_name, @@ -6045,6 +6054,26 @@ mod tests { assert!(!source_requests_gpu("base")); } + #[test] + fn gpu_request_from_cli_uses_presence_with_empty_device_ids_for_default_gpu() { + let request = gpu_request_from_cli(true, None).expect("gpu request should be 
present"); + + assert!(request.device_id.is_empty()); + } + + #[test] + fn gpu_request_from_cli_maps_gpu_device_to_one_device_id() { + let request = gpu_request_from_cli(true, Some("0000:2d:00.0")) + .expect("gpu request should be present"); + + assert_eq!(request.device_id, vec!["0000:2d:00.0"]); + } + + #[test] + fn gpu_request_from_cli_omits_gpu_request_when_not_requested() { + assert!(gpu_request_from_cli(false, Some("0")).is_none()); + } + #[test] fn resolve_from_classifies_existing_dockerfile_path() { let temp = tempfile::tempdir().expect("failed to create tempdir"); diff --git a/crates/openshell-core/src/gpu.rs b/crates/openshell-core/src/gpu.rs new file mode 100644 index 000000000..10e5eaaeb --- /dev/null +++ b/crates/openshell-core/src/gpu.rs @@ -0,0 +1,70 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Shared GPU request helpers. + +use crate::config::CDI_GPU_DEVICE_ALL; +use crate::proto::compute::v1::GpuRequestSpec; + +/// Resolve a driver GPU request into CDI device identifiers. +/// +/// `None` means no GPU was requested. Presence with no explicit device IDs +/// uses the CDI all-GPU request; otherwise the driver-native IDs pass through. 
+#[must_use] +pub fn cdi_gpu_device_ids(gpu: Option<&GpuRequestSpec>) -> Option<Vec<String>> { + match gpu { + Some(gpu) if gpu.device_id.is_empty() => Some(vec![CDI_GPU_DEVICE_ALL.to_string()]), + Some(gpu) => Some(gpu.device_id.clone()), + None => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn cdi_gpu_device_ids_returns_none_when_absent() { + assert_eq!(cdi_gpu_device_ids(None), None); + } + + #[test] + fn cdi_gpu_device_ids_defaults_empty_request_to_all_gpus() { + let request = GpuRequestSpec { device_id: vec![] }; + + assert_eq!( + cdi_gpu_device_ids(Some(&request)), + Some(vec![CDI_GPU_DEVICE_ALL.to_string()]) + ); + } + + #[test] + fn cdi_gpu_device_ids_passes_single_device_id_through() { + let request = GpuRequestSpec { + device_id: vec!["nvidia.com/gpu=0".to_string()], + }; + + assert_eq!( + cdi_gpu_device_ids(Some(&request)), + Some(vec!["nvidia.com/gpu=0".to_string()]) + ); + } + + #[test] + fn cdi_gpu_device_ids_passes_multiple_device_ids_through() { + let request = GpuRequestSpec { + device_id: vec![ + "nvidia.com/gpu=0".to_string(), + "nvidia.com/gpu=1".to_string(), + ], + }; + + assert_eq!( + cdi_gpu_device_ids(Some(&request)), + Some(vec![ + "nvidia.com/gpu=0".to_string(), + "nvidia.com/gpu=1".to_string() + ]) + ); + } +} diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs index a4a1ea822..893b01f5f 100644 --- a/crates/openshell-core/src/lib.rs +++ b/crates/openshell-core/src/lib.rs @@ -12,6 +12,7 @@ pub mod config; pub mod error; pub mod forward; +pub mod gpu; pub mod image; pub mod inference; pub mod metadata; diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index 2241e3e90..9604d59cf 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -18,17 +18,16 @@ use bollard::query_parameters::{ }; use bytes::Bytes; use futures::{Stream, StreamExt}; -use openshell_core::config::{ - CDI_GPU_DEVICE_ALL, 
DEFAULT_DOCKER_NETWORK_NAME, DEFAULT_STOP_TIMEOUT_SECS, -}; +use openshell_core::config::{DEFAULT_DOCKER_NETWORK_NAME, DEFAULT_STOP_TIMEOUT_SECS}; +use openshell_core::gpu::cdi_gpu_device_ids; use openshell_core::proto::compute::v1::{ CreateSandboxRequest, CreateSandboxResponse, DeleteSandboxRequest, DeleteSandboxResponse, DriverCondition, DriverSandbox, DriverSandboxStatus, DriverSandboxTemplate, GetCapabilitiesRequest, GetCapabilitiesResponse, GetSandboxRequest, GetSandboxResponse, - ListSandboxesRequest, ListSandboxesResponse, StopSandboxRequest, StopSandboxResponse, - ValidateSandboxCreateRequest, ValidateSandboxCreateResponse, WatchSandboxesDeletedEvent, - WatchSandboxesEvent, WatchSandboxesRequest, WatchSandboxesSandboxEvent, - compute_driver_server::ComputeDriver, watch_sandboxes_event, + GpuRequestSpec, ListSandboxesRequest, ListSandboxesResponse, StopSandboxRequest, + StopSandboxResponse, ValidateSandboxCreateRequest, ValidateSandboxCreateResponse, + WatchSandboxesDeletedEvent, WatchSandboxesEvent, WatchSandboxesRequest, + WatchSandboxesSandboxEvent, compute_driver_server::ComputeDriver, watch_sandboxes_event, }; use openshell_core::{Config, Error, Result as CoreResult}; use std::collections::HashMap; @@ -306,11 +305,7 @@ impl DockerComputeDriver { "docker sandboxes require a template image", )); } - if spec.gpu && !config.supports_gpu { - return Err(Status::failed_precondition( - "docker GPU sandboxes require Docker CDI support. 
Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.", - )); - } + Self::validate_gpu_request(spec.gpu.as_ref(), config.supports_gpu)?; if !template.agent_socket_path.trim().is_empty() { return Err(Status::failed_precondition( "docker compute driver does not support template.agent_socket_path", @@ -330,6 +325,18 @@ impl DockerComputeDriver { Ok(()) } + fn validate_gpu_request( + gpu: Option<&GpuRequestSpec>, + supports_gpu: bool, + ) -> Result<(), Status> { + if gpu.is_some() && !supports_gpu { + return Err(Status::failed_precondition( + "docker GPU sandboxes require Docker CDI support. Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.", + )); + } + Ok(()) + } + async fn get_sandbox_snapshot( &self, sandbox_id: &str, @@ -932,11 +939,11 @@ fn build_environment(sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig .collect() } -fn docker_gpu_device_requests(gpu: bool) -> Option<Vec<DeviceRequest>> { - gpu.then(|| { +fn docker_gpu_device_requests(gpu: Option<&GpuRequestSpec>) -> Option<Vec<DeviceRequest>> { + cdi_gpu_device_ids(gpu).map(|device_ids| { vec![DeviceRequest { driver: Some("cdi".to_string()), - device_ids: Some(vec![CDI_GPU_DEVICE_ALL.to_string()]), + device_ids: Some(device_ids), ..Default::default() }] }) @@ -983,7 +990,7 @@ fn build_container_create_body( host_config: Some(HostConfig { nano_cpus: resource_limits.nano_cpus, memory: resource_limits.memory_bytes, - device_requests: docker_gpu_device_requests(spec.gpu), + device_requests: docker_gpu_device_requests(spec.gpu.as_ref()), mounts: Some(build_mounts(config)), restart_policy: Some(RestartPolicy { name: Some(RestartPolicyNameEnum::UNLESS_STOPPED), diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index ae93f5b66..86f0f280c 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -2,9 +2,9 @@ // SPDX-License-Identifier: 
Apache-2.0 use super::*; -use openshell_core::config::DEFAULT_SERVER_PORT; +use openshell_core::config::{CDI_GPU_DEVICE_ALL, DEFAULT_SERVER_PORT}; use openshell_core::proto::compute::v1::{ - DriverResourceRequirements, DriverSandboxSpec, DriverSandboxTemplate, + DriverResourceRequirements, DriverSandboxSpec, DriverSandboxTemplate, GpuRequestSpec, }; use std::fs; use std::net::{IpAddr, Ipv4Addr, SocketAddr}; @@ -31,8 +31,7 @@ fn test_sandbox() -> DriverSandbox { resources: None, platform_config: None, }), - gpu: false, - gpu_device: String::new(), + gpu: None, }), status: None, } @@ -348,7 +347,7 @@ fn build_container_create_body_clears_inherited_cmd() { fn validate_sandbox_rejects_gpu_when_cdi_unavailable() { let config = runtime_config(); let mut sandbox = test_sandbox(); - sandbox.spec.as_mut().unwrap().gpu = true; + sandbox.spec.as_mut().unwrap().gpu = Some(GpuRequestSpec { device_id: vec![] }); let err = DockerComputeDriver::validate_sandbox(&sandbox, &config).unwrap_err(); @@ -361,7 +360,7 @@ fn build_container_create_body_maps_gpu_to_all_cdi_device() { let mut config = runtime_config(); config.supports_gpu = true; let mut sandbox = test_sandbox(); - sandbox.spec.as_mut().unwrap().gpu = true; + sandbox.spec.as_mut().unwrap().gpu = Some(GpuRequestSpec { device_id: vec![] }); let create_body = build_container_create_body(&sandbox, &config).unwrap(); let request = create_body @@ -378,6 +377,36 @@ fn build_container_create_body_maps_gpu_to_all_cdi_device() { ); } +#[test] +fn build_container_create_body_passes_explicit_cdi_device_ids_through() { + let mut config = runtime_config(); + config.supports_gpu = true; + let mut sandbox = test_sandbox(); + sandbox.spec.as_mut().unwrap().gpu = Some(GpuRequestSpec { + device_id: vec![ + "nvidia.com/gpu=0".to_string(), + "nvidia.com/gpu=1".to_string(), + ], + }); + + let create_body = build_container_create_body(&sandbox, &config).unwrap(); + let request = create_body + .host_config + .as_ref() + .and_then(|host_config| 
host_config.device_requests.as_ref()) + .and_then(|requests| requests.first()) + .expect("GPU request should add a Docker device request"); + + assert_eq!(request.driver.as_deref(), Some("cdi")); + assert_eq!( + request.device_ids.as_ref().unwrap(), + &vec![ + "nvidia.com/gpu=0".to_string(), + "nvidia.com/gpu=1".to_string() + ] + ); +} + #[test] fn require_sandbox_identifier_rejects_when_id_and_name_are_empty() { // Regression test: `delete_sandbox` (and the other identifier-keyed diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index 360a94152..cc06cf235 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -15,7 +15,7 @@ use openshell_core::proto::compute::v1::{ DriverCondition as SandboxCondition, DriverPlatformEvent as PlatformEvent, DriverSandbox as Sandbox, DriverSandboxSpec as SandboxSpec, DriverSandboxStatus as SandboxStatus, DriverSandboxTemplate as SandboxTemplate, - GetCapabilitiesResponse, WatchSandboxesDeletedEvent, WatchSandboxesEvent, + GetCapabilitiesResponse, GpuRequestSpec, WatchSandboxesDeletedEvent, WatchSandboxesEvent, WatchSandboxesPlatformEvent, WatchSandboxesSandboxEvent, watch_sandboxes_event, }; use std::collections::BTreeMap; @@ -61,6 +61,10 @@ const SANDBOX_MANAGED_VALUE: &str = "openshell"; const GPU_RESOURCE_NAME: &str = "nvidia.com/gpu"; const GPU_RESOURCE_QUANTITY: &str = "1"; +fn gpu_has_explicit_device_ids(gpu: Option<&GpuRequestSpec>) -> bool { + gpu.is_some_and(|gpu| !gpu.device_id.is_empty()) +} + // --------------------------------------------------------------------------- // Default workspace persistence (temporary — will be replaced by snapshotting) // --------------------------------------------------------------------------- @@ -194,8 +198,20 @@ impl KubernetesComputeDriver { } pub async fn validate_sandbox_create(&self, sandbox: &Sandbox) -> Result<(), tonic::Status> { - let gpu_requested = 
sandbox.spec.as_ref().is_some_and(|spec| spec.gpu); - if gpu_requested + let gpu = sandbox.spec.as_ref().and_then(|spec| spec.gpu.as_ref()); + self.validate_gpu_request(gpu).await + } + + async fn validate_gpu_request( + &self, + gpu: Option<&GpuRequestSpec>, + ) -> Result<(), tonic::Status> { + if gpu_has_explicit_device_ids(gpu) { + return Err(tonic::Status::invalid_argument( + "kubernetes compute driver does not support explicit GPU device IDs", + )); + } + if gpu.is_some() && !self.has_gpu_capacity().await.map_err(|err| { tonic::Status::internal(format!("check GPU node capacity failed: {err}")) })? @@ -291,6 +307,12 @@ impl KubernetesComputeDriver { } pub async fn create_sandbox(&self, sandbox: &Sandbox) -> Result<(), KubernetesDriverError> { + if gpu_has_explicit_device_ids(sandbox.spec.as_ref().and_then(|spec| spec.gpu.as_ref())) { + return Err(KubernetesDriverError::Precondition( + "kubernetes compute driver does not support explicit GPU device IDs".to_string(), + )); + } + let name = sandbox.name.as_str(); info!( sandbox_id = %sandbox.id, @@ -918,7 +940,7 @@ fn sandbox_to_k8s_spec( "podTemplate".to_string(), sandbox_template_to_k8s( template, - spec.gpu, + spec.gpu.is_some(), default_image, image_pull_policy, sandbox_id, @@ -964,7 +986,7 @@ fn sandbox_to_k8s_spec( "podTemplate".to_string(), sandbox_template_to_k8s( &SandboxTemplate::default(), - spec.as_ref().is_some_and(|s| s.gpu), + spec.as_ref().is_some_and(|s| s.gpu.is_some()), default_image, image_pull_policy, sandbox_id, @@ -1594,6 +1616,19 @@ mod tests { ); } + #[test] + fn gpu_has_explicit_device_ids_only_when_ids_are_present() { + use openshell_core::proto::compute::v1::GpuRequestSpec; + + assert!(!gpu_has_explicit_device_ids(None)); + assert!(!gpu_has_explicit_device_ids(Some(&GpuRequestSpec { + device_id: vec![], + }))); + assert!(gpu_has_explicit_device_ids(Some(&GpuRequestSpec { + device_id: vec!["nvidia.com/gpu=0".to_string()], + }))); + } + #[test] fn 
gpu_sandbox_uses_template_runtime_class_name_when_set() { let template = SandboxTemplate { diff --git a/crates/openshell-driver-podman/src/container.rs b/crates/openshell-driver-podman/src/container.rs index cc7bbc519..a482ca140 100644 --- a/crates/openshell-driver-podman/src/container.rs +++ b/crates/openshell-driver-podman/src/container.rs @@ -4,7 +4,7 @@ //! Container spec construction for the Podman driver. use crate::config::PodmanComputeConfig; -use openshell_core::config::CDI_GPU_DEVICE_ALL; +use openshell_core::gpu::cdi_gpu_device_ids; use openshell_core::proto::compute::v1::DriverSandbox; use serde::Serialize; use serde_json::Value; @@ -313,13 +313,13 @@ fn build_resource_limits(sandbox: &DriverSandbox) -> ResourceLimits { /// Build CDI GPU device list if GPU is requested. fn build_devices(sandbox: &DriverSandbox) -> Option<Vec<LinuxDevice>> { - if sandbox.spec.as_ref().is_some_and(|s| s.gpu) { - Some(vec![LinuxDevice { - path: CDI_GPU_DEVICE_ALL.into(), - }]) - } else { - None - } + let gpu = sandbox.spec.as_ref().and_then(|spec| spec.gpu.as_ref()); + cdi_gpu_device_ids(gpu).map(|device_ids| { + device_ids + .into_iter() + .map(|path| LinuxDevice { path }) + .collect() + }) } /// Build the Podman container creation JSON spec. 
@@ -609,6 +609,61 @@ mod tests { assert!(caps.contains(&"DAC_READ_SEARCH"), "missing DAC_READ_SEARCH"); } + #[test] + fn container_spec_omits_devices_without_gpu_request() { + let sandbox = test_sandbox("test-id", "test-name"); + let config = test_config(); + let spec = build_container_spec(&sandbox, &config); + + assert!(spec.get("devices").is_none()); + } + + #[test] + fn container_spec_maps_empty_gpu_request_to_all_cdi_device() { + use openshell_core::config::CDI_GPU_DEVICE_ALL; + use openshell_core::proto::compute::v1::{DriverSandboxSpec, GpuRequestSpec}; + + let mut sandbox = test_sandbox("test-id", "test-name"); + sandbox.spec = Some(DriverSandboxSpec { + gpu: Some(GpuRequestSpec { device_id: vec![] }), + ..Default::default() + }); + let config = test_config(); + let spec = build_container_spec(&sandbox, &config); + + assert_eq!( + spec["devices"][0]["path"].as_str(), + Some(CDI_GPU_DEVICE_ALL) + ); + } + + #[test] + fn container_spec_passes_explicit_cdi_device_ids_through() { + use openshell_core::proto::compute::v1::{DriverSandboxSpec, GpuRequestSpec}; + + let mut sandbox = test_sandbox("test-id", "test-name"); + sandbox.spec = Some(DriverSandboxSpec { + gpu: Some(GpuRequestSpec { + device_id: vec![ + "nvidia.com/gpu=0".to_string(), + "nvidia.com/gpu=1".to_string(), + ], + }), + ..Default::default() + }); + let config = test_config(); + let spec = build_container_spec(&sandbox, &config); + + assert_eq!( + spec["devices"][0]["path"].as_str(), + Some("nvidia.com/gpu=0") + ); + assert_eq!( + spec["devices"][1]["path"].as_str(), + Some("nvidia.com/gpu=1") + ); + } + #[test] fn container_spec_uses_secret_env_not_plaintext() { let sandbox = test_sandbox("test-id", "test-name"); diff --git a/crates/openshell-driver-podman/src/driver.rs b/crates/openshell-driver-podman/src/driver.rs index ae9492d74..2e57c7d3d 100644 --- a/crates/openshell-driver-podman/src/driver.rs +++ b/crates/openshell-driver-podman/src/driver.rs @@ -10,7 +10,7 @@ use crate::watcher::{ self, 
WatchStream, driver_sandbox_from_inspect, driver_sandbox_from_list_entry, }; use openshell_core::ComputeDriverError; -use openshell_core::proto::compute::v1::{DriverSandbox, GetCapabilitiesResponse}; +use openshell_core::proto::compute::v1::{DriverSandbox, GetCapabilitiesResponse, GpuRequestSpec}; use tracing::{info, warn}; impl From for ComputeDriverError { @@ -184,8 +184,12 @@ impl PodmanComputeDriver { &self, sandbox: &DriverSandbox, ) -> Result<(), ComputeDriverError> { - let gpu_requested = sandbox.spec.as_ref().is_some_and(|s| s.gpu); - if gpu_requested && !Self::has_gpu_capacity() { + let gpu = sandbox.spec.as_ref().and_then(|s| s.gpu.as_ref()); + Self::validate_gpu_request(gpu) + } + + fn validate_gpu_request(gpu: Option<&GpuRequestSpec>) -> Result<(), ComputeDriverError> { + if gpu.is_some() && !Self::has_gpu_capacity() { return Err(ComputeDriverError::Precondition( "GPU sandbox requested, but no NVIDIA GPU devices are available.".to_string(), )); diff --git a/crates/openshell-driver-vm/src/driver.rs b/crates/openshell-driver-vm/src/driver.rs index d79e5d922..e00766c93 100644 --- a/crates/openshell-driver-vm/src/driver.rs +++ b/crates/openshell-driver-vm/src/driver.rs @@ -25,11 +25,11 @@ use openshell_core::proto::compute::v1::{ CreateSandboxRequest, CreateSandboxResponse, DeleteSandboxRequest, DeleteSandboxResponse, DriverCondition as SandboxCondition, DriverPlatformEvent as PlatformEvent, DriverSandbox as Sandbox, DriverSandboxStatus as SandboxStatus, GetCapabilitiesRequest, - GetCapabilitiesResponse, GetSandboxRequest, GetSandboxResponse, ListSandboxesRequest, - ListSandboxesResponse, StopSandboxRequest, StopSandboxResponse, ValidateSandboxCreateRequest, - ValidateSandboxCreateResponse, WatchSandboxesDeletedEvent, WatchSandboxesEvent, - WatchSandboxesPlatformEvent, WatchSandboxesRequest, WatchSandboxesSandboxEvent, - compute_driver_server::ComputeDriver, watch_sandboxes_event, + GetCapabilitiesResponse, GetSandboxRequest, GetSandboxResponse, 
GpuRequestSpec, + ListSandboxesRequest, ListSandboxesResponse, StopSandboxRequest, StopSandboxResponse, + ValidateSandboxCreateRequest, ValidateSandboxCreateResponse, WatchSandboxesDeletedEvent, + WatchSandboxesEvent, WatchSandboxesPlatformEvent, WatchSandboxesRequest, + WatchSandboxesSandboxEvent, compute_driver_server::ComputeDriver, watch_sandboxes_event, }; use openshell_vfio::SysfsRoot; use sha2::{Digest, Sha256}; @@ -358,9 +358,8 @@ impl VmDriver { return Err(Status::already_exists("sandbox already exists")); } - let spec = sandbox.spec.as_ref(); - let is_gpu = spec.is_some_and(|s| s.gpu); - let gpu_device = spec.map_or("", |s| s.gpu_device.as_str()); + let gpu_device = + requested_gpu_device(sandbox.spec.as_ref().and_then(|spec| spec.gpu.as_ref())); let state_dir = sandbox_state_dir(&self.config.state_dir, &sandbox.id); let rootfs = state_dir.join("rootfs"); @@ -437,7 +436,7 @@ impl VmDriver { ))); } - let gpu_bdf = if is_gpu { + let gpu_bdf = if let Some(gpu_device) = gpu_device { let inventory = self .gpu_inventory .as_ref() @@ -1457,15 +1456,7 @@ fn validate_vm_sandbox(sandbox: &Sandbox, gpu_enabled: bool) -> Result<(), Statu .as_ref() .ok_or_else(|| Status::invalid_argument("sandbox spec is required"))?; - if spec.gpu && !gpu_enabled { - return Err(Status::failed_precondition( - "GPU support is not enabled on this driver; start with --gpu", - )); - } - - if !spec.gpu && !spec.gpu_device.is_empty() { - return Err(Status::invalid_argument("gpu_device requires gpu=true")); - } + validate_gpu_request(spec.gpu.as_ref(), gpu_enabled)?; if let Some(template) = spec.template.as_ref() { if !template.agent_socket_path.is_empty() { @@ -1487,6 +1478,27 @@ fn validate_vm_sandbox(sandbox: &Sandbox, gpu_enabled: bool) -> Result<(), Statu Ok(()) } +fn requested_gpu_device(gpu: Option<&GpuRequestSpec>) -> Option<&str> { + let gpu = gpu?; + Some(gpu.device_id.first().map_or("", String::as_str)) +} + +#[allow(clippy::result_large_err)] +fn validate_gpu_request(gpu: 
Option<&GpuRequestSpec>, gpu_enabled: bool) -> Result<(), Status> { + if gpu.is_some() && !gpu_enabled { + return Err(Status::failed_precondition( + "GPU support is not enabled on this driver; start with --gpu", + )); + } + + if gpu.is_some_and(|gpu| gpu.device_id.len() > 1) { + return Err(Status::invalid_argument( + "vm compute driver supports at most one GPU device ID", + )); + } + Ok(()) +} + #[allow(clippy::result_large_err)] fn parse_registry_reference(image_ref: &str) -> Result { Reference::try_from(image_ref).map_err(|err| { @@ -2426,7 +2438,7 @@ mod tests { use super::*; use crate::gpu::{SubnetAllocator, allocate_vsock_cid, mac_from_sandbox_id, tap_device_name}; use openshell_core::proto::compute::v1::{ - DriverSandboxSpec as SandboxSpec, DriverSandboxTemplate as SandboxTemplate, + DriverSandboxSpec as SandboxSpec, DriverSandboxTemplate as SandboxTemplate, GpuRequestSpec, }; use prost_types::{Struct, Value, value::Kind}; use std::fs; @@ -2438,7 +2450,7 @@ mod tests { fn validate_vm_sandbox_rejects_gpu_when_not_enabled() { let sandbox = Sandbox { spec: Some(SandboxSpec { - gpu: true, + gpu: Some(GpuRequestSpec { device_id: vec![] }), ..Default::default() }), ..Default::default() @@ -2453,7 +2465,7 @@ mod tests { fn validate_vm_sandbox_accepts_gpu_when_enabled() { let sandbox = Sandbox { spec: Some(SandboxSpec { - gpu: true, + gpu: Some(GpuRequestSpec { device_id: vec![] }), ..Default::default() }), ..Default::default() @@ -2462,19 +2474,41 @@ mod tests { } #[test] - fn validate_vm_sandbox_rejects_gpu_device_without_gpu() { + fn validate_vm_sandbox_rejects_multiple_gpu_device_ids() { let sandbox = Sandbox { spec: Some(SandboxSpec { - gpu: false, - gpu_device: "0000:2d:00.0".to_string(), + gpu: Some(GpuRequestSpec { + device_id: vec!["0000:2d:00.0".to_string(), "0000:3d:00.0".to_string()], + }), ..Default::default() }), ..Default::default() }; let err = validate_vm_sandbox(&sandbox, true) - .expect_err("gpu_device without gpu should be rejected"); + 
.expect_err("multiple GPU device IDs should be rejected"); assert_eq!(err.code(), Code::InvalidArgument); - assert!(err.message().contains("gpu_device requires gpu=true")); + assert!(err.message().contains("at most one GPU device ID")); + } + + #[test] + fn requested_gpu_device_returns_none_without_gpu_request() { + assert_eq!(requested_gpu_device(None), None); + } + + #[test] + fn requested_gpu_device_defaults_empty_request_to_inventory_choice() { + let gpu = GpuRequestSpec { device_id: vec![] }; + + assert_eq!(requested_gpu_device(Some(&gpu)), Some("")); + } + + #[test] + fn requested_gpu_device_returns_first_explicit_device_id() { + let gpu = GpuRequestSpec { + device_id: vec!["0000:2d:00.0".to_string()], + }; + + assert_eq!(requested_gpu_device(Some(&gpu)), Some("0000:2d:00.0")); } #[test] diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index 2d6351637..fbd741091 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -18,10 +18,10 @@ use futures::{Stream, StreamExt}; use openshell_core::proto::compute::v1::{ CreateSandboxRequest, DeleteSandboxRequest, DriverCondition, DriverPlatformEvent, DriverResourceRequirements, DriverSandbox, DriverSandboxSpec, DriverSandboxStatus, - DriverSandboxTemplate, GetCapabilitiesRequest, GetSandboxRequest, ListSandboxesRequest, - ValidateSandboxCreateRequest, WatchSandboxesEvent, WatchSandboxesRequest, - compute_driver_client::ComputeDriverClient, compute_driver_server::ComputeDriver, - watch_sandboxes_event, + DriverSandboxTemplate, GetCapabilitiesRequest, GetSandboxRequest, + GpuRequestSpec as DriverGpuRequestSpec, ListSandboxesRequest, ValidateSandboxCreateRequest, + WatchSandboxesEvent, WatchSandboxesRequest, compute_driver_client::ComputeDriverClient, + compute_driver_server::ComputeDriver, watch_sandboxes_event, }; use openshell_core::proto::{ PlatformEvent, Sandbox, SandboxCondition, SandboxPhase, SandboxSpec, 
SandboxStatus, @@ -1120,8 +1120,9 @@ fn driver_sandbox_spec_from_public(spec: &SandboxSpec) -> DriverSandboxSpec { .template .as_ref() .map(driver_sandbox_template_from_public), - gpu: spec.gpu, - gpu_device: spec.gpu_device.clone(), + gpu: spec.gpu.as_ref().map(|gpu| DriverGpuRequestSpec { + device_id: gpu.device_id.clone(), + }), } } @@ -1468,7 +1469,7 @@ fn derive_phase(status: Option<&DriverSandboxStatus>) -> SandboxPhase { } fn rewrite_user_facing_conditions(status: &mut Option, spec: Option<&SandboxSpec>) { - let gpu_requested = spec.is_some_and(|sandbox_spec| sandbox_spec.gpu); + let gpu_requested = spec.is_some_and(|sandbox_spec| sandbox_spec.gpu.is_some()); if !gpu_requested { return; } @@ -1626,6 +1627,7 @@ pub async fn new_test_runtime(store: Arc) -> ComputeRuntime { mod tests { use super::*; use futures::stream; + use openshell_core::proto::GpuRequestSpec; use openshell_core::proto::compute::v1::{ CreateSandboxResponse, DeleteSandboxResponse, GetCapabilitiesResponse, GetSandboxRequest, GetSandboxResponse, StopSandboxRequest, StopSandboxResponse, ValidateSandboxCreateResponse, @@ -1646,6 +1648,26 @@ mod tests { } } + #[test] + fn driver_sandbox_spec_from_public_preserves_gpu_request_device_ids() { + let public = SandboxSpec { + gpu: Some(GpuRequestSpec { + device_id: vec!["nvidia.com/gpu=0".to_string()], + }), + ..Default::default() + }; + + let driver = driver_sandbox_spec_from_public(&public); + + assert_eq!( + driver + .gpu + .expect("driver GPU request should be present") + .device_id, + vec!["nvidia.com/gpu=0".to_string()] + ); + } + fn struct_value( fields: impl IntoIterator, prost_types::Value)>, ) -> prost_types::Value { @@ -2094,7 +2116,7 @@ mod tests { rewrite_user_facing_conditions( &mut status, Some(&SandboxSpec { - gpu: true, + gpu: Some(GpuRequestSpec { device_id: vec![] }), ..Default::default() }), ); @@ -2126,7 +2148,7 @@ mod tests { rewrite_user_facing_conditions( &mut status, Some(&SandboxSpec { - gpu: false, + gpu: None, 
..Default::default() }), ); @@ -2353,7 +2375,7 @@ mod tests { let sandbox = Sandbox { spec: Some(SandboxSpec { - gpu: true, + gpu: Some(GpuRequestSpec { device_id: vec![] }), ..Default::default() }), ..sandbox_record("sb-1", "sandbox-a", SandboxPhase::Provisioning) @@ -2376,7 +2398,7 @@ mod tests { SandboxPhase::try_from(stored.phase).unwrap(), SandboxPhase::Ready ); - assert!(stored.spec.as_ref().is_some_and(|spec| spec.gpu)); + assert!(stored.spec.as_ref().is_some_and(|spec| spec.gpu.is_some())); } #[tokio::test] diff --git a/crates/openshell-server/src/grpc/validation.rs b/crates/openshell-server/src/grpc/validation.rs index 160b7e031..1b9503ad4 100644 --- a/crates/openshell-server/src/grpc/validation.rs +++ b/crates/openshell-server/src/grpc/validation.rs @@ -642,7 +642,7 @@ pub(super) fn level_matches(log_level: &str, min_level: &str) -> bool { #[cfg(test)] mod tests { use super::*; - use openshell_core::proto::SandboxSpec; + use openshell_core::proto::{GpuRequestSpec, SandboxSpec}; use std::collections::HashMap; use tonic::Code; @@ -668,7 +668,7 @@ mod tests { #[test] fn validate_sandbox_spec_accepts_gpu_flag() { let spec = SandboxSpec { - gpu: true, + gpu: Some(GpuRequestSpec { device_id: vec![] }), ..Default::default() }; assert!(validate_sandbox_spec("gpu-sandbox", &spec).is_ok()); diff --git a/e2e/rust/e2e-podman.sh b/e2e/rust/e2e-podman.sh index 38c6e6b7c..0099b9705 100755 --- a/e2e/rust/e2e-podman.sh +++ b/e2e/rust/e2e-podman.sh @@ -2,10 +2,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Run the Rust e2e smoke test against a Podman-backed gateway. +# Run Rust e2e tests against a Podman-backed gateway. # # Usage: # mise run e2e:podman # start a gateway with Podman driver +# mise run e2e:podman:gpu # run the CDI GPU test binary # mise run e2e:podman -- --port=9090 # use a specific port # # Options: @@ -15,7 +16,7 @@ # 1. 
Verifies Podman is available and the socket is reachable # 2. Starts openshell-gateway with --drivers podman --disable-tls # 3. Waits for the gateway to become healthy -# 4. Runs the Rust smoke test +# 4. Runs the selected Rust e2e test # 5. Cleans up the gateway process and any leftover sandbox containers # # Prerequisites: @@ -28,6 +29,8 @@ set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" GATEWAY_BIN="${ROOT}/target/debug/openshell-gateway" TIMEOUT=120 +E2E_TEST="${OPENSHELL_E2E_PODMAN_TEST:-smoke}" +GPU_MODE="${OPENSHELL_E2E_PODMAN_GPU:-0}" # ── Parse arguments ────────────────────────────────────────────────── PORT="" @@ -54,6 +57,11 @@ if ! podman info &>/dev/null; then echo " systemctl --user start podman.socket" exit 1 fi +if [ "${GPU_MODE}" = "1" ] && [ ! -e /dev/nvidia0 ]; then + echo "ERROR: e2e:podman:gpu requires NVIDIA GPU devices and CDI support." >&2 + echo " Verify /dev/nvidia0 exists and Podman can resolve nvidia.com/gpu=0." >&2 + exit 2 +fi if [ ! -f "${GATEWAY_BIN}" ]; then echo "Building openshell-gateway..." @@ -63,7 +71,7 @@ fi # ── Resolve images ─────────────────────────────────────────────────── # Use the same image defaults as the driver, allowing env overrides. SUPERVISOR_IMAGE="${OPENSHELL_SUPERVISOR_IMAGE:-openshell/supervisor:dev}" -SANDBOX_IMAGE="${OPENSHELL_SANDBOX_IMAGE:-}" +SANDBOX_IMAGE="${OPENSHELL_E2E_PODMAN_SANDBOX_IMAGE:-${OPENSHELL_SANDBOX_IMAGE:-}}" # Verify the supervisor image exists locally. if ! podman image exists "${SUPERVISOR_IMAGE}" 2>/dev/null; then @@ -155,14 +163,14 @@ if [ "${healthy}" != "true" ]; then fi echo "Gateway is ready (${elapsed}s)." -# ── Run the smoke test ─────────────────────────────────────────────── +# ── Run the selected test ──────────────────────────────────────────── export OPENSHELL_GATEWAY_ENDPOINT="http://127.0.0.1:${PORT}" # Use a synthetic gateway name so the CLI does not require stored mTLS creds. 
export OPENSHELL_GATEWAY="e2e-podman" export OPENSHELL_PROVISION_TIMEOUT=300 -echo "Running e2e smoke test (gateway: ${OPENSHELL_GATEWAY}, endpoint: ${OPENSHELL_GATEWAY_ENDPOINT})..." +echo "Running e2e ${E2E_TEST} test (gateway: ${OPENSHELL_GATEWAY}, endpoint: ${OPENSHELL_GATEWAY_ENDPOINT})..." cargo build -p openshell-cli --features openshell-core/dev-settings -cargo test --manifest-path e2e/rust/Cargo.toml --features e2e --test smoke -- --nocapture +cargo test --manifest-path e2e/rust/Cargo.toml --features e2e --test "${E2E_TEST}" -- --nocapture -echo "Smoke test passed." +echo "${E2E_TEST} test passed." diff --git a/e2e/rust/tests/cdi_gpu.rs b/e2e/rust/tests/cdi_gpu.rs new file mode 100644 index 000000000..621af0fdf --- /dev/null +++ b/e2e/rust/tests/cdi_gpu.rs @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![cfg(feature = "e2e")] + +//! CDI GPU e2e tests. +//! +//! Requires a CDI-enabled gateway backed by Docker or Podman. The +//! `e2e:docker:gpu` and `e2e:podman:gpu` mise tasks start the corresponding +//! gateway with the default sandbox image unless the task-specific sandbox +//! image environment variable is set. 
+ +use openshell_e2e::harness::output::strip_ansi; +use openshell_e2e::harness::sandbox::SandboxGuard; + +async fn assert_nvidia_smi(args: &[&str]) { + let mut create_args = Vec::from(args); + create_args.extend([ + "--", + "sh", + "-lc", + "gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1); \ + test -n \"$gpu_name\" && \ + printf 'gpu-ok:%s\\n' \"$gpu_name\"", + ]); + + let mut guard = SandboxGuard::create(&create_args) + .await + .expect("GPU sandbox create should succeed"); + + let output = strip_ansi(&guard.create_output); + assert!( + output.contains("gpu-ok:"), + "expected GPU smoke marker in sandbox output:\n{output}" + ); + + guard.cleanup().await; +} + +#[tokio::test] +async fn cdi_gpu_sandbox_runs_nvidia_smi() { + assert_nvidia_smi(&["--gpu"]).await; +} + +#[tokio::test] +async fn cdi_gpu_sandbox_runs_nvidia_smi_with_specific_device() { + assert_nvidia_smi(&["--gpu", "--gpu-device", "nvidia.com/gpu=0"]).await; +} diff --git a/e2e/rust/tests/docker_gpu.rs b/e2e/rust/tests/docker_gpu.rs deleted file mode 100644 index f85dc48b8..000000000 --- a/e2e/rust/tests/docker_gpu.rs +++ /dev/null @@ -1,36 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -#![cfg(feature = "e2e")] - -//! Docker GPU e2e test. -//! -//! Requires a Docker-backed gateway started with Docker CDI support. The -//! `e2e:docker:gpu` mise task starts that gateway with the default sandbox image -//! unless OPENSHELL_E2E_DOCKER_SANDBOX_IMAGE is set. 
- -use openshell_e2e::harness::output::strip_ansi; -use openshell_e2e::harness::sandbox::SandboxGuard; - -#[tokio::test] -async fn docker_gpu_sandbox_runs_nvidia_smi() { - let mut guard = SandboxGuard::create(&[ - "--gpu", - "--", - "sh", - "-lc", - "gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1); \ - test -n \"$gpu_name\"; \ - printf 'gpu-ok:%s\n' \"$gpu_name\"", - ]) - .await - .expect("GPU sandbox create should succeed"); - - let output = strip_ansi(&guard.create_output); - assert!( - output.contains("gpu-ok:"), - "expected GPU smoke marker in sandbox output:\n{output}" - ); - - guard.cleanup().await; -} diff --git a/proto/compute_driver.proto b/proto/compute_driver.proto index 3c4308f3f..5a2f3ffca 100644 --- a/proto/compute_driver.proto +++ b/proto/compute_driver.proto @@ -78,18 +78,23 @@ message DriverSandbox { // Driver-owned provisioning inputs required to create a sandbox. message DriverSandboxSpec { + reserved 9, 10; + // Log level exposed to processes running inside the sandbox. string log_level = 1; // Environment variables injected into the sandbox runtime. map environment = 5; // Runtime template consumed by the driver during provisioning. DriverSandboxTemplate template = 6; - // Request NVIDIA GPU resources for this sandbox. - bool gpu = 9; - // Optional PCI BDF address (e.g. "0000:2d:00.0") or device index - // (e.g. "0", "1"). When empty with gpu=true, the driver assigns the - // first available GPU. - string gpu_device = 10; + // Request GPU resources for this sandbox. Presence indicates a GPU request. + GpuRequestSpec gpu = 11; +} + +// Driver-native GPU request details. +message GpuRequestSpec { + // Optional driver-native device identifiers. Empty means the driver chooses + // its default GPU assignment behavior. + repeated string device_id = 1; } // Driver-owned runtime template consumed by the compute platform. 
diff --git a/proto/openshell.proto b/proto/openshell.proto index 75490f338..0709a0984 100644 --- a/proto/openshell.proto +++ b/proto/openshell.proto @@ -192,6 +192,8 @@ message Sandbox { // Desired sandbox configuration provided through the public API. message SandboxSpec { + reserved 9, 10; + // Log level exposed to processes running inside the sandbox. string log_level = 1; // Environment variables injected into the sandbox runtime. @@ -202,12 +204,16 @@ message SandboxSpec { openshell.sandbox.v1.SandboxPolicy policy = 7; // Provider names to attach to this sandbox. repeated string providers = 8; - // Request NVIDIA GPU resources for this sandbox. - bool gpu = 9; - // Optional PCI BDF address (e.g. "0000:2d:00.0") or device index - // (e.g. "0", "1"). When empty with gpu=true, the driver assigns the - // first available GPU. - string gpu_device = 10; + // Request GPU resources for this sandbox. Presence indicates a GPU request. + GpuRequestSpec gpu = 11; +} + +// Public GPU request details. Device identifiers are interpreted by the +// selected compute driver. +message GpuRequestSpec { + // Optional driver-native device identifiers. Empty means the driver chooses + // its default GPU assignment behavior. + repeated string device_id = 1; } // Public sandbox template mapped onto compute-driver template inputs. diff --git a/tasks/test.toml b/tasks/test.toml index dd1e88941..06565baf8 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -35,7 +35,7 @@ run = [ # gateway_resume and Docker GPU tests run in dedicated jobs with their own clusters. # Dockerfile sources build into the host Docker daemon, so the custom image # test belongs with Docker-backed gateway coverage rather than the k3s suite. 
- "cargo test --manifest-path e2e/rust/Cargo.toml --features e2e -- --skip gateway_resume_scenarios --skip docker_gpu_sandbox_runs_nvidia_smi --skip sandbox_from_custom_dockerfile", + "cargo test --manifest-path e2e/rust/Cargo.toml --features e2e -- --skip gateway_resume_scenarios --skip cdi_gpu_sandbox_runs_nvidia_smi --skip sandbox_from_custom_dockerfile", ] ["e2e:python"] @@ -55,6 +55,12 @@ description = "Start a Podman-backed gateway and run smoke e2e (requires rootles depends = ["build:docker:supervisor-sideload"] run = "e2e/rust/e2e-podman.sh" +["e2e:podman:gpu"] +description = "Run GPU e2e against a standalone gateway with the Podman compute driver" +depends = ["build:docker:supervisor-sideload"] +env = { OPENSHELL_E2E_PODMAN_GPU = "1", OPENSHELL_E2E_PODMAN_TEST = "cdi_gpu" } +run = "e2e/rust/e2e-podman.sh" + ["e2e:vm"] description = "Start openshell-gateway with the VM compute driver and run the cluster-agnostic smoke e2e" run = "e2e/rust/e2e-vm.sh" @@ -65,5 +71,5 @@ run = "e2e/rust/e2e-docker.sh" ["e2e:docker:gpu"] description = "Run GPU e2e against a standalone gateway with the Docker compute driver" -env = { OPENSHELL_E2E_DOCKER_GPU = "1", OPENSHELL_E2E_DOCKER_TEST = "docker_gpu" } +env = { OPENSHELL_E2E_DOCKER_GPU = "1", OPENSHELL_E2E_DOCKER_TEST = "cdi_gpu" } run = "e2e/rust/e2e-docker.sh"