diff --git a/misc/images/distroless-prod-base/Dockerfile b/misc/images/distroless-prod-base/Dockerfile index a685f9b0e8136..b05618c0c14a3 100644 --- a/misc/images/distroless-prod-base/Dockerfile +++ b/misc/images/distroless-prod-base/Dockerfile @@ -10,7 +10,19 @@ # This is a separate mzimage so that we don't have to re-install the apt things # every time we get a CI builder with a cold cache. +# Extract libeatmydata from a Debian image for CI use. eatmydata disables +# fsync for faster test execution. Activated via LD_PRELOAD=libeatmydata.so +# set as a container environment variable (must be set before process start). +FROM debian:13-slim AS eatmydata +RUN apt-get update && apt-get install -y --no-install-recommends eatmydata \ + && rm -rf /var/lib/apt/lists/* + FROM gcr.io/distroless/cc-debian13:nonroot-28078d2e5e77671d2046dcc9e2c75334e31efa4d + +# Copy libeatmydata for CI performance optimization (no-op unless +# LD_PRELOAD=libeatmydata.so is set as a container env var). +COPY --from=eatmydata /usr/lib/*/libeatmydata.so /usr/lib/ + USER nonroot ENV HOME=/home/nonroot ENTRYPOINT [] diff --git a/misc/images/openssh-static/Dockerfile b/misc/images/openssh-static/Dockerfile new file mode 100644 index 0000000000000..d4642a1e4cd0d --- /dev/null +++ b/misc/images/openssh-static/Dockerfile @@ -0,0 +1,95 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# Build a statically-linked OpenSSH `ssh` client binary using AWS-LC +# as the crypto backend. AWS-LC is a faster, smaller alternative to +# OpenSSL that also supports FIPS 140-3 validation when needed. +# +# OpenSSH natively supports AWS-LC as a crypto backend (no patches needed). 
+# See: https://github.com/openssh/openssh-portable/blob/master/INSTALL +# +# To enable FIPS mode, build with: --build-arg AWS_LC_FIPS=1 +# (requires Go for the FIPS delocator) +# +# Usage: +# docker build -t openssh-static . +# docker create --name extract openssh-static +# docker cp extract:/output/ssh ./ssh +# docker rm extract + +FROM ubuntu:noble-20260210.1 AS builder + +ARG AWS_LC_VERSION=v1.54.0 +ARG AWS_LC_FIPS=0 +ARG OPENSSH_VERSION=V_9_9_P2 +ARG ZLIB_VERSION=1.3.1 + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + autoconf \ + automake \ + build-essential \ + ca-certificates \ + cmake \ + git \ + golang \ + ninja-build \ + perl \ + pkg-config \ + wget \ + && rm -rf /var/lib/apt/lists/* + +# Build AWS-LC as a static library. +# When AWS_LC_FIPS=1, enables FIPS mode (requires Go for the delocator). +WORKDIR /build/aws-lc +RUN git clone --depth 1 --branch ${AWS_LC_VERSION} https://github.com/aws/aws-lc.git . \ + && cmake -GNinja -B build \ + $([ "$AWS_LC_FIPS" = "1" ] && echo "-DFIPS=1") \ + -DBUILD_SHARED_LIBS=0 \ + -DBUILD_TESTING=OFF \ + -DBUILD_TOOL=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=/opt/aws-lc \ + -DCMAKE_C_FLAGS="-fPIC" \ + && ninja -C build \ + && ninja -C build install + +# Build zlib as a static library. +WORKDIR /build/zlib +RUN wget -qO- https://github.com/madler/zlib/releases/download/v${ZLIB_VERSION}/zlib-${ZLIB_VERSION}.tar.gz | tar xz --strip-components=1 \ + && ./configure --static --prefix=/opt/zlib \ + && make -j"$(nproc)" \ + && make install + +# Build OpenSSH ssh client against AWS-LC. +WORKDIR /build/openssh +RUN git clone --depth 1 --branch ${OPENSSH_VERSION} https://github.com/openssh/openssh-portable.git . \ + && autoreconf \ + && ./configure \ + --with-ssl-dir=/opt/aws-lc \ + --with-zlib=/opt/zlib \ + --with-ldflags=-static \ + --without-pam \ + --without-libedit \ + --disable-pkcs11 \ + # AWS-LC does not define the legacy OpenSSL BN_FLG_CONSTTIME flag. 
+ # Setting it to 0 satisfies #ifdef checks in OpenSSH source code. + # This is safe: AWS-LC handles constant-time bignum operations + # internally and does not rely on this flag. + # --disable-pkcs11 avoids link errors from ssh-pkcs11.c calling + # RSA_meth_dup/EC_KEY_METHOD_get_sign which AWS-LC does not provide. + && make -j"$(nproc)" ssh CFLAGS="-DBN_FLG_CONSTTIME=0" \ + && strip ssh + +# Verify the binary is not dynamically linked and is functional. +RUN ! ldd ssh 2>/dev/null \ + && ./ssh -V + +# Output stage: just the binary. +FROM scratch +COPY --from=builder /build/openssh/ssh /output/ssh diff --git a/src/environmentd/ci/entrypoint.sh b/misc/images/openssh-static/mzbuild.yml old mode 100755 new mode 100644 similarity index 57% rename from src/environmentd/ci/entrypoint.sh rename to misc/images/openssh-static/mzbuild.yml index ff2f2337c2404..d7c1b2b4e4cc5 --- a/src/environmentd/ci/entrypoint.sh +++ b/misc/images/openssh-static/mzbuild.yml @@ -1,5 +1,3 @@ -#!/usr/bin/env bash - # Copyright Materialize, Inc. and contributors. All rights reserved. # # Use of this software is governed by the Business Source License @@ -9,17 +7,5 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0. -set -euo pipefail - -if [ -z "${MZ_EAT_MY_DATA:-}" ]; then - unset LD_PRELOAD -else - export LD_PRELOAD=libeatmydata.so -fi - -if environmentd "$@"; then - echo "environmentd exited gracefully; sleeping forever" >&2 - sleep infinity -else - exit $? -fi +name: openssh-static +description: Statically-linked OpenSSH ssh client built against AWS-LC (FIPS optional). 
diff --git a/misc/python/materialize/mzcompose/services/clusterd.py b/misc/python/materialize/mzcompose/services/clusterd.py index e07ca490a5355..cb035dae09e7e 100644 --- a/misc/python/materialize/mzcompose/services/clusterd.py +++ b/misc/python/materialize/mzcompose/services/clusterd.py @@ -41,13 +41,13 @@ def __init__( "CLUSTERD_USE_CTP=true", "MZ_SOFT_ASSERTIONS=1", "MZ_EAT_MY_DATA=1", + "LD_PRELOAD=libeatmydata.so", # Defaults that were previously set by the clusterd entrypoint.sh. "CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100", "CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101", "CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878", "CLUSTERD_SECRETS_READER=local-file", "CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets", - "LD_PRELOAD=libeatmydata.so", f"CLUSTERD_PERSIST_PUBSUB_URL=http://{mz_service}:6879", *environment_extra, ] @@ -80,6 +80,9 @@ def __init__( # Override the materialized entrypoint so that `clusterd` is invoked # via the command rather than via the entrypoint. This keeps # `c.exec()` working (it prepends the entrypoint to exec commands). + # Note: mzcompose uses the Ubuntu-based `materialized` image (with + # tini/bash), while production uses the distroless `clusterd` image. + # Keep this in mind when debugging CI-vs-prod discrepancies. 
config["entrypoint"] = ["tini", "--"] # Depending on the Docker Compose version, this may either work or be diff --git a/src/cloud-resources/src/crd/materialize.rs b/src/cloud-resources/src/crd/materialize.rs index 51d80127fa557..f4f6539b56c03 100644 --- a/src/cloud-resources/src/crd/materialize.rs +++ b/src/cloud-resources/src/crd/materialize.rs @@ -655,7 +655,7 @@ pub mod v1alpha1 { } } -fn parse_image_ref(image_ref: &str) -> Option { +pub fn parse_image_ref(image_ref: &str) -> Option { image_ref .rsplit_once(':') .and_then(|(_repo, tag)| tag.strip_prefix('v')) diff --git a/src/clusterd/Cargo.toml b/src/clusterd/Cargo.toml index 309f01048dc87..06a086bda981e 100644 --- a/src/clusterd/Cargo.toml +++ b/src/clusterd/Cargo.toml @@ -38,7 +38,7 @@ mz-storage-client = { path = "../storage-client" } mz-storage-types = { path = "../storage-types" } mz-timely-util = { path = "../timely-util" } mz-txn-wal = { path = "../txn-wal" } -nix.workspace = true +nix = { workspace = true, features = ["hostname", "signal"] } num_cpus.workspace = true serde.workspace = true tokio.workspace = true diff --git a/src/clusterd/ci/Dockerfile b/src/clusterd/ci/Dockerfile index 6df26f293a729..457a0d9c78b30 100644 --- a/src/clusterd/ci/Dockerfile +++ b/src/clusterd/ci/Dockerfile @@ -7,10 +7,11 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0. -MZFROM prod-base +MZFROM openssh-static AS openssh -COPY clusterd entrypoint.sh /usr/local/bin/ +MZFROM distroless-prod-base -USER materialize +COPY clusterd /usr/local/bin/ +COPY --from=openssh /output/ssh /usr/bin/ssh -ENTRYPOINT ["tini", "--", "entrypoint.sh"] +ENTRYPOINT ["/usr/local/bin/clusterd"] diff --git a/src/clusterd/ci/entrypoint.sh b/src/clusterd/ci/entrypoint.sh deleted file mode 100755 index 6c3b749ac2fa5..0000000000000 --- a/src/clusterd/ci/entrypoint.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash - -# Copyright Materialize, Inc. and contributors. All rights reserved. 
-# -# Use of this software is governed by the Business Source License -# included in the LICENSE file at the root of this repository. -# -# As of the Change Date specified in that file, in accordance with -# the Business Source License, use of this software will be governed -# by the Apache License, Version 2.0. - -set -euo pipefail - -# We pass default arguments as environment variables, and only if those -# environment variables do not already exist, to allow users to override these -# arguments when running the container via either environment variables or -# command-line arguments. -export CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=${CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR:-0.0.0.0:2100} -export CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=${CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR:-0.0.0.0:2101} -export CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=${CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR:-0.0.0.0:6878} -export CLUSTERD_SECRETS_READER=${CLUSTERD_SECRETS_READER:-local-file} -export CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=${CLUSTERD_SECRETS_READER_LOCAL_DIR:-/mzdata/secrets} - -if [[ "${KUBERNETES_SERVICE_HOST:-}" ]]; then - # Pass the host's FQDN as the host to be used for GRPC request validation - # only when running in Kubernetes. In other contexts (like when running - # locally, or in Docker), this is likely not desirable. - export CLUSTERD_GRPC_HOST=${CLUSTERD_GRPC_HOST:-$(hostname --fqdn)} - - # When running in Kubernetes, pass the StatefulSet replica's ordinal index - # as the process index. 
/// Resolves a short hostname to its FQDN by parsing `/etc/hosts`.
///
/// In Kubernetes, the kubelet writes the pod's FQDN into `/etc/hosts`
/// for StatefulSet pods with hostname/subdomain configured, e.g.:
///
/// ```text
/// 10.0.1.5 clusterd-0.svc.namespace.svc.cluster.local clusterd-0
/// ```
///
/// This approach avoids FFI (`getaddrinfo`) and works even if CoreDNS
/// is temporarily unavailable at pod startup.
///
/// Returns the first dotted name found on an entry that lists
/// `short_hostname`. If `/etc/hosts` cannot be read, or no such entry
/// exists, a warning is printed to stderr and `short_hostname` itself is
/// returned.
fn resolve_fqdn(short_hostname: &str) -> String {
    let resolved = std::fs::read_to_string("/etc/hosts")
        .ok()
        .and_then(|hosts| fqdn_from_hosts(&hosts, short_hostname));
    match resolved {
        Some(fqdn) => fqdn,
        None => {
            eprintln!(
                "warning: could not resolve FQDN for {:?} from /etc/hosts; \
                 falling back to short hostname. GRPC host validation may not work correctly.",
                short_hostname,
            );
            short_hostname.to_string()
        }
    }
}

/// Searches the text of a hosts file for an entry naming `short_hostname`
/// and returns the first dotted name on that entry, if any.
///
/// Kept separate from [`resolve_fqdn`] so the parsing is pure and does not
/// depend on the contents of the real `/etc/hosts`.
fn fqdn_from_hosts(hosts: &str, short_hostname: &str) -> Option<String> {
    for line in hosts.lines() {
        // Everything from `#` to the end of the line is a comment. Strip it
        // before tokenizing so that words inside a trailing comment are never
        // mistaken for host names. This also covers full-line comments.
        let entry = line.split('#').next().unwrap_or("");

        // hosts format: IP_ADDRESS CANONICAL_NAME [ALIASES...]
        let mut fields = entry.split_whitespace();
        if fields.next().is_none() {
            // Blank or comment-only line: there is no address field.
            continue;
        }
        let names: Vec<&str> = fields.collect();
        if !names.contains(&short_hostname) {
            continue;
        }
        // An entry may list the short name without any qualified name (for
        // example a plain `localhost` line); keep scanning in that case.
        if let Some(fqdn) = names.iter().find(|name| name.contains('.')) {
            return Some(fqdn.to_string());
        }
    }
    None
}
+/// +/// On Linux, PID 1 has special signal semantics: the kernel will not +/// deliver signals whose disposition is SIG_DFL (the default). Since +/// distroless containers run the binary directly as PID 1 (no tini), +/// signals like SIGTERM from Kubernetes pod termination would be silently +/// ignored without explicit handlers. This function registers a handler +/// that restores the default disposition and re-raises, producing the +/// expected termination behavior. +fn install_termination_signal_handlers() { + use nix::sys::signal; + + extern "C" fn handle_signal(signum: i32) { + // Restore default handler and re-raise so the process terminates + // with the correct signal and exit code. + let action = signal::SigAction::new( + signal::SigHandler::SigDfl, + signal::SaFlags::SA_NODEFER | signal::SaFlags::SA_ONSTACK, + signal::SigSet::empty(), + ); + unsafe { signal::sigaction(signum.try_into().unwrap(), &action) } + .unwrap_or_else(|_| panic!("failed to uninstall handler for {}", signum)); + signal::raise(signum.try_into().unwrap()) + .unwrap_or_else(|_| panic!("failed to re-raise signal {}", signum)); + } + + let action = signal::SigAction::new( + signal::SigHandler::Handler(handle_signal), + signal::SaFlags::SA_NODEFER | signal::SaFlags::SA_ONSTACK, + signal::SigSet::empty(), + ); + for signum in &[ + signal::SIGHUP, + signal::SIGINT, + signal::SIGALRM, + signal::SIGTERM, + ] { + unsafe { signal::sigaction(*signum, &action) } + .unwrap_or_else(|e| panic!("failed to install handler for {}: {}", signum, e)); + } +} + pub fn main() { + // Install signal handlers before anything else. As PID 1 in a + // distroless container, the kernel ignores signals without explicit + // handlers — without this, SIGTERM from Kubernetes would be ignored + // and the pod would hang until SIGKILL. 
+ install_termination_signal_handlers(); + mz_ore::panic::install_enhanced_handler(); + // When running in Kubernetes, auto-detect the GRPC host from the pod's FQDN + // and the process index from the StatefulSet ordinal. These are set as env + // vars so that clap picks them up as defaults (they can still be overridden + // via explicit env vars or CLI args). + // + // SAFETY: Called before any threads are spawned. + // `install_enhanced_handler` above only registers a panic hook; it does + // not spawn threads. The hook spawns a thread only if a panic fires, + // which cannot happen between here and the first `unsafe` call below. + if std::env::var("KUBERNETES_SERVICE_HOST").is_ok() { + if std::env::var("CLUSTERD_GRPC_HOST").is_err() { + // Resolve the pod's FQDN from /etc/hosts, the equivalent of + // `hostname --fqdn`. In Kubernetes, /etc/hostname only has the short name (e.g., + // "clusterd-0"), but GRPC validation needs the FQDN (e.g., + // "clusterd-0.clusterd.ns.svc.cluster.local"). We look the + // short hostname up in /etc/hosts (no DNS query) to get the canonical name. + // + // This avoids shelling out to `hostname --fqdn` which isn't + // available in distroless images. + if let Ok(hostname) = nix::unistd::gethostname() { + if let Some(short) = hostname.to_str() { + let fqdn = resolve_fqdn(short); + unsafe { std::env::set_var("CLUSTERD_GRPC_HOST", &fqdn) }; + } + } + } + if std::env::var("CLUSTERD_PROCESS").is_err() { + // Extract the ordinal index from the last segment of the + // StatefulSet hostname (e.g., "mz5ncn-cluster-s1-replica-s1-gen-1-0" + // → "0"). This matches orchestrator-kubernetes which also uses + // split('-').next_back() to extract the process ID from pod names. 
+ if let Ok(hostname) = std::env::var("HOSTNAME") { + if let Some(ordinal) = hostname.rsplit('-').next() { + unsafe { std::env::set_var("CLUSTERD_PROCESS", ordinal) }; + } + } + } + } + let args = cli::parse_args(CliConfig { env_prefix: Some("CLUSTERD_"), enable_version_flag: true, diff --git a/src/environmentd/ci/Dockerfile b/src/environmentd/ci/Dockerfile index 4ddef7ead92aa..376bb13b2853c 100644 --- a/src/environmentd/ci/Dockerfile +++ b/src/environmentd/ci/Dockerfile @@ -7,10 +7,11 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0. -MZFROM prod-base +MZFROM openssh-static AS openssh -COPY environmentd entrypoint.sh /usr/local/bin/ +MZFROM distroless-prod-base -USER materialize +COPY environmentd /usr/local/bin/ +COPY --from=openssh /output/ssh /usr/bin/ssh -ENTRYPOINT ["tini", "--", "entrypoint.sh"] +ENTRYPOINT ["/usr/local/bin/environmentd"] diff --git a/src/environmentd/src/environmentd/main.rs b/src/environmentd/src/environmentd/main.rs index fe3eaba7e874d..aa3acf7435ae2 100644 --- a/src/environmentd/src/environmentd/main.rs +++ b/src/environmentd/src/environmentd/main.rs @@ -650,6 +650,9 @@ pub fn main() { if let Err(err) = run(args) { panic!("environmentd: fatal: {}", err.display_with_causes()); } + // run() blocks forever via thread::park() and never returns Ok(()). + // If it somehow does, let the process exit so the issue is visible + // (Kubernetes will restart it and the logs will show the exit). 
} fn run(mut args: Args) -> Result<(), anyhow::Error> { diff --git a/src/orchestratord/src/controller/balancer.rs b/src/orchestratord/src/controller/balancer.rs index 08f689a9360f6..62c86ec462338 100644 --- a/src/orchestratord/src/controller/balancer.rs +++ b/src/orchestratord/src/controller/balancer.rs @@ -41,6 +41,7 @@ use mz_cloud_resources::crd::{ ManagedResource, balancer::v1alpha1::{Balancer, Routing}, generated::cert_manager::certificates::{Certificate, CertificatePrivateKeyAlgorithm}, + materialize::parse_image_ref, }; use mz_orchestrator_kubernetes::KubernetesImagePullPolicy; use mz_ore::{cli::KeyValueArg, instrument}; @@ -159,6 +160,32 @@ impl Context { ) } + fn pod_uid_gid(image_ref: &str) -> i64 { + // Distroless images (v26.19+) run as the `nonroot` user (uid/gid 65534). + // Older Ubuntu-based images use the `materialize` user (uid/gid 999). + // Note: balancerd transitioned to distroless one release earlier than + // environmentd/clusterd (which use V26_20_0 in generation.rs). 
+ static V26_19_0: std::sync::LazyLock = + std::sync::LazyLock::new(|| semver::Version { + major: 26, + minor: 19, + patch: 0, + pre: semver::Prerelease::new("dev.0").expect("dev.0 is valid prerelease"), + build: semver::BuildMetadata::new("").expect("empty string is valid buildmetadata"), + }); + let is_distroless = match parse_image_ref(image_ref) { + Some(v) => v.cmp_precedence(&V26_19_0).is_ge(), + None => { + tracing::warn!( + image_ref, + "failed to parse balancerd image ref; assuming distroless" + ); + true + } + }; + if is_distroless { 65534 } else { 999 } + } + fn create_deployment_object(&self, balancer: &Balancer) -> anyhow::Result { let security_context = if self.config.enable_security_context { // Since we want to adhere to the most restrictive security context, all @@ -388,12 +415,15 @@ impl Context { ), affinity: self.config.balancerd_affinity.clone(), tolerations: self.config.balancerd_tolerations.clone(), - security_context: Some(PodSecurityContext { - fs_group: Some(999), - run_as_user: Some(999), - run_as_group: Some(999), - ..Default::default() - }), + security_context: { + let uid_gid = Self::pod_uid_gid(&balancer.spec.balancerd_image_ref); + Some(PodSecurityContext { + fs_group: Some(uid_gid), + run_as_user: Some(uid_gid), + run_as_group: Some(uid_gid), + ..Default::default() + }) + }, scheduler_name: self.config.scheduler_name.clone(), volumes: Some(volumes), ..Default::default() @@ -596,3 +626,28 @@ impl k8s_controller::Context for Context { Ok(None) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[mz_ore::test] + fn test_pod_uid_gid() { + // Boundary: v26.19.0 is the first distroless version + assert_eq!(Context::pod_uid_gid("materialize/balancerd:v26.18.0"), 999); + assert_eq!( + Context::pod_uid_gid("materialize/balancerd:v26.19.0"), + 65534 + ); + // Pre-release below threshold gets Ubuntu + assert_eq!( + Context::pod_uid_gid("materialize/balancerd:v26.19.0-dev"), + 999 + ); + // Unparseable refs assume distroless + assert_eq!( + 
Context::pod_uid_gid("materialize/balancerd@sha256:abc"), + 65534 + ); + } +} diff --git a/src/orchestratord/src/controller/materialize/generation.rs b/src/orchestratord/src/controller/materialize/generation.rs index 3717a9b2ebd6f..ff32b7fd8a3fa 100644 --- a/src/orchestratord/src/controller/materialize/generation.rs +++ b/src/orchestratord/src/controller/materialize/generation.rs @@ -81,6 +81,17 @@ static V26_1_0: LazyLock = LazyLock::new(|| Version { build: BuildMetadata::new("").expect("empty string is valid buildmetadata"), }); +/// Minimum version for distroless environmentd/clusterd images (nonroot +/// uid/gid 65534). Balancerd transitioned one release earlier at V26_19_0 +/// (see balancer.rs). +static V26_20_0: LazyLock = LazyLock::new(|| Version { + major: 26, + minor: 20, + patch: 0, + pre: Prerelease::new("dev.0").expect("dev.0 is valid prerelease"), + build: BuildMetadata::new("").expect("empty string is valid buildmetadata"), +}); + /// Describes the status of a deployment. /// /// This is a simplified representation of `DeploymentState`, suitable for @@ -864,8 +875,23 @@ fn create_environmentd_statefulset_object( ephemeral_volume_class )); } - // The `materialize` user used by clusterd always has gid 999. - args.push("--orchestrator-kubernetes-service-fs-group=999".to_string()); + // Distroless images (v26.20+) run as the `nonroot` user (uid/gid 65534). + // Older Ubuntu-based images use the `materialize` user (uid/gid 999). + // This value is used for both the environmentd pod security context and + // the --orchestrator-kubernetes-service-fs-group arg (which controls + // clusterd pod security contexts). Both transition at the same version. + // Note: Kubernetes fsGroup re-chowns volume contents on mount, so + // existing PVCs with UID 999 files will be migrated automatically + // (may add startup latency for large volumes). 
+ let service_fs_group: i64 = if mz.meets_minimum_version(&V26_20_0) { + 65534 + } else { + 999 + }; + args.push(format!( + "--orchestrator-kubernetes-service-fs-group={}", + service_fs_group + )); // Add system_param configmap // This feature was enabled in 0.163 but did not have testing until after 0.164. @@ -1218,9 +1244,9 @@ fn create_environmentd_statefulset_object( service_account_name: Some(mz.service_account_name()), volumes: Some(volumes), security_context: Some(PodSecurityContext { - fs_group: Some(999), - run_as_user: Some(999), - run_as_group: Some(999), + fs_group: Some(service_fs_group), + run_as_user: Some(service_fs_group), + run_as_group: Some(service_fs_group), ..Default::default() }), tolerations,