From 6bd8391e5c737368cc0e4abc8024cd00ee7ea314 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Tue, 14 Apr 2026 22:16:17 -0700 Subject: [PATCH 1/2] orchestratord: version-gate pod security context UID/GID for distroless Distroless images run as nonroot (UID 65534) instead of the `materialize` user (UID 999) used by the older images. Add version-gating so orchestratord sets the correct runAsUser/runAsGroup based on the Materialize version, avoiding UID mismatches during rolling upgrades from Debian-based to distroless images. Gate versions (verified against release history, 2026-06): - balancerd: V26_18_0. Its ci/Dockerfile switched to distroless-prod-base in v26.18.0 (prod-base in v26.17.x). The original V26_19_0 was off by one and would have forced UID 999 onto v26.18.x balancerd pods that actually run as 65534. - environmentd/clusterd: V26_28_0, matching the release that ships their distroless migration (#36099). The original V26_20_0 predated the actual landing by ~8 releases (main is now 26.28-dev) and would have applied UID 65534 to v26.20-v26.27 images that still run as UID 999. NOTE: the env/clusterd gate assumes #36099 lands in the 26.28 cycle. If it slips, bump V26_28_0 to the actual release. The three distroless PRs (#36099 image, #36100 SIGTERM, #36101 this) must ship in the same release. 
Co-Authored-By: Claude Opus 4.6 (1M context) Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cloud-resources/src/crd/materialize.rs | 2 +- src/orchestratord/src/controller/balancer.rs | 69 +++++++++++++++++-- .../src/controller/materialize/generation.rs | 36 ++++++++-- 3 files changed, 95 insertions(+), 12 deletions(-) diff --git a/src/cloud-resources/src/crd/materialize.rs b/src/cloud-resources/src/crd/materialize.rs index 62101f25215b1..56f2af76144b8 100644 --- a/src/cloud-resources/src/crd/materialize.rs +++ b/src/cloud-resources/src/crd/materialize.rs @@ -1616,7 +1616,7 @@ pub mod v1 { } } -fn parse_image_ref(image_ref: &str) -> Option { +pub fn parse_image_ref(image_ref: &str) -> Option { image_ref .rsplit_once(':') .and_then(|(_repo, tag)| tag.strip_prefix('v')) diff --git a/src/orchestratord/src/controller/balancer.rs b/src/orchestratord/src/controller/balancer.rs index 8687e99dd57d9..83799cb86fadd 100644 --- a/src/orchestratord/src/controller/balancer.rs +++ b/src/orchestratord/src/controller/balancer.rs @@ -42,6 +42,7 @@ use mz_cloud_resources::crd::{ ManagedResource, balancer::v1alpha1::{Balancer, Routing}, generated::cert_manager::certificates::{Certificate, CertificatePrivateKeyAlgorithm}, + materialize::parse_image_ref, }; use mz_orchestrator_kubernetes::KubernetesImagePullPolicy; use mz_ore::{cli::KeyValueArg, instrument}; @@ -160,6 +161,34 @@ impl Context { ) } + fn pod_uid_gid(image_ref: &str) -> i64 { + // Distroless images (v26.18+) run as the `nonroot` user (uid/gid 65534). + // Older Ubuntu-based images use the `materialize` user (uid/gid 999). + // Note: balancerd transitioned to distroless earlier than + // environmentd/clusterd (which use V26_28_0 in generation.rs). + // Verified against release history: balancerd's ci/Dockerfile switched + // to distroless-prod-base in v26.18.0 (prod-base in v26.17.x). 
+ static V26_18_0: std::sync::LazyLock = + std::sync::LazyLock::new(|| semver::Version { + major: 26, + minor: 18, + patch: 0, + pre: semver::Prerelease::new("dev.0").expect("dev.0 is valid prerelease"), + build: semver::BuildMetadata::new("").expect("empty string is valid buildmetadata"), + }); + let is_distroless = match parse_image_ref(image_ref) { + Some(v) => v.cmp_precedence(&V26_18_0).is_ge(), + None => { + tracing::warn!( + image_ref, + "failed to parse balancerd image ref; assuming distroless" + ); + true + } + }; + if is_distroless { 65534 } else { 999 } + } + fn create_deployment_object(&self, balancer: &Balancer) -> anyhow::Result { let security_context = if self.config.enable_security_context { // Since we want to adhere to the most restrictive security context, all @@ -392,12 +421,15 @@ impl Context { ), affinity: self.config.balancerd_affinity.clone(), tolerations: self.config.balancerd_tolerations.clone(), - security_context: Some(PodSecurityContext { - fs_group: Some(999), - run_as_user: Some(999), - run_as_group: Some(999), - ..Default::default() - }), + security_context: { + let uid_gid = Self::pod_uid_gid(&balancer.spec.balancerd_image_ref); + Some(PodSecurityContext { + fs_group: Some(uid_gid), + run_as_user: Some(uid_gid), + run_as_group: Some(uid_gid), + ..Default::default() + }) + }, scheduler_name: self.config.scheduler_name.clone(), volumes: Some(volumes), ..Default::default() @@ -601,3 +633,28 @@ impl k8s_controller::Context for Context { Ok(None) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[mz_ore::test] + fn test_pod_uid_gid() { + // Boundary: v26.18.0 is the first distroless balancerd version + assert_eq!(Context::pod_uid_gid("materialize/balancerd:v26.17.0"), 999); + assert_eq!( + Context::pod_uid_gid("materialize/balancerd:v26.18.0"), + 65534 + ); + // Pre-release below threshold gets Ubuntu + assert_eq!( + Context::pod_uid_gid("materialize/balancerd:v26.18.0-dev"), + 999 + ); + // Unparseable refs assume distroless + 
assert_eq!( + Context::pod_uid_gid("materialize/balancerd@sha256:abc"), + 65534 + ); + } +} diff --git a/src/orchestratord/src/controller/materialize/generation.rs b/src/orchestratord/src/controller/materialize/generation.rs index a9ff7ce30d980..963419c713e25 100644 --- a/src/orchestratord/src/controller/materialize/generation.rs +++ b/src/orchestratord/src/controller/materialize/generation.rs @@ -92,6 +92,17 @@ static PER_ROUTE_GROUP_ROLES_VERSION: LazyLock = LazyLock::new(|| Versi build: BuildMetadata::new("").expect("empty string is valid buildmetadata"), }); +/// Minimum version for distroless environmentd/clusterd images (nonroot +/// uid/gid 65534). Balancerd transitioned earlier at V26_18_0 (see +/// balancer.rs). +static V26_28_0: LazyLock = LazyLock::new(|| Version { + major: 26, + minor: 28, + patch: 0, + pre: Prerelease::new("dev.0").expect("dev.0 is valid prerelease"), + build: BuildMetadata::new("").expect("empty string is valid buildmetadata"), +}); + /// Describes the status of a deployment. /// /// This is a simplified representation of `DeploymentState`, suitable for @@ -887,8 +898,23 @@ fn create_environmentd_statefulset_object( ephemeral_volume_class )); } - // The `materialize` user used by clusterd always has gid 999. - args.push("--orchestrator-kubernetes-service-fs-group=999".to_string()); + // Distroless images (v26.28+) run as the `nonroot` user (uid/gid 65534). + // Older Ubuntu-based images use the `materialize` user (uid/gid 999). + // This value is used for both the environmentd pod security context and + // the --orchestrator-kubernetes-service-fs-group arg (which controls + // clusterd pod security contexts). Both transition at the same version. + // Note: Kubernetes fsGroup re-chowns volume contents on mount, so + // existing PVCs with UID 999 files will be migrated automatically + // (may add startup latency for large volumes). 
+ let service_fs_group: i64 = if mz.meets_minimum_version(&V26_28_0) { + 65534 + } else { + 999 + }; + args.push(format!( + "--orchestrator-kubernetes-service-fs-group={}", + service_fs_group + )); // Add system_param configmap // This feature was enabled in 0.163 but did not have testing until after 0.164. @@ -1241,9 +1267,9 @@ fn create_environmentd_statefulset_object( service_account_name: Some(mz.service_account_name()), volumes: Some(volumes), security_context: Some(PodSecurityContext { - fs_group: Some(999), - run_as_user: Some(999), - run_as_group: Some(999), + fs_group: Some(service_fs_group), + run_as_user: Some(service_fs_group), + run_as_group: Some(service_fs_group), ..Default::default() }), tolerations, From cbe3af5a77231353ac2ac125bb8b66a869976dab Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Tue, 30 Jun 2026 14:50:26 -0700 Subject: [PATCH 2/2] orchestratord: bump distroless env/clusterd version gate to 26.32 main has moved to 26.32.0-dev and 26.28.0 already shipped as a Debian image. Leaving the gate at 26.28 would apply the nonroot 65534 securityContext to the still-Debian 26.28-26.31 env/clusterd images, which expect uid/gid 999. Bump the gate so only the genuinely distroless images get the nonroot context. The gate must equal the release the distroless env/clusterd images (#36099) first ship in. Set to 26.32 on the assumption this lands in the 26.32 cycle. Re-confirm against the actual release cut before merge. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/orchestratord/src/controller/balancer.rs | 2 +- src/orchestratord/src/controller/materialize/generation.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/orchestratord/src/controller/balancer.rs b/src/orchestratord/src/controller/balancer.rs index 83799cb86fadd..2d46163dc415c 100644 --- a/src/orchestratord/src/controller/balancer.rs +++ b/src/orchestratord/src/controller/balancer.rs @@ -165,7 +165,7 @@ impl Context { // Distroless images (v26.18+) run as the `nonroot` user (uid/gid 65534). // Older Ubuntu-based images use the `materialize` user (uid/gid 999). // Note: balancerd transitioned to distroless earlier than - // environmentd/clusterd (which use V26_28_0 in generation.rs). + // environmentd/clusterd (which use V26_32_0 in generation.rs). // Verified against release history: balancerd's ci/Dockerfile switched // to distroless-prod-base in v26.18.0 (prod-base in v26.17.x). static V26_18_0: std::sync::LazyLock = diff --git a/src/orchestratord/src/controller/materialize/generation.rs b/src/orchestratord/src/controller/materialize/generation.rs index 963419c713e25..8f80ee74c2f99 100644 --- a/src/orchestratord/src/controller/materialize/generation.rs +++ b/src/orchestratord/src/controller/materialize/generation.rs @@ -95,9 +95,9 @@ static PER_ROUTE_GROUP_ROLES_VERSION: LazyLock = LazyLock::new(|| Versi /// Minimum version for distroless environmentd/clusterd images (nonroot /// uid/gid 65534). Balancerd transitioned earlier at V26_18_0 (see /// balancer.rs). 
-static V26_28_0: LazyLock = LazyLock::new(|| Version { +static V26_32_0: LazyLock = LazyLock::new(|| Version { major: 26, - minor: 28, + minor: 32, patch: 0, pre: Prerelease::new("dev.0").expect("dev.0 is valid prerelease"), build: BuildMetadata::new("").expect("empty string is valid buildmetadata"), @@ -906,7 +906,7 @@ fn create_environmentd_statefulset_object( // Note: Kubernetes fsGroup re-chowns volume contents on mount, so // existing PVCs with UID 999 files will be migrated automatically // (may add startup latency for large volumes). - let service_fs_group: i64 = if mz.meets_minimum_version(&V26_28_0) { + let service_fs_group: i64 = if mz.meets_minimum_version(&V26_32_0) { 65534 } else { 999