From b74c8c542e9fc08cd387858b5ba3fd4ae488257e Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Thu, 2 Apr 2026 22:43:41 -0700 Subject: [PATCH 01/19] containers: prepare environmentd and clusterd for distroless migration Move bash entrypoint logic into Rust binaries so environmentd and clusterd can run in distroless containers without a shell: clusterd: - Auto-detect Kubernetes FQDN via gethostname() + getaddrinfo(AI_CANONNAME) (replaces `hostname --fqdn`) - Auto-detect StatefulSet ordinal from HOSTNAME env var - Configure LD_PRELOAD for eatmydata (CI only, no-op in distroless) environmentd: - Configure LD_PRELOAD for eatmydata - Sleep forever after graceful exit (keeps container alive for debugging) Also add Dockerfile.distroless variants for both services that use the distroless-prod-base image and expect a static `ssh` binary to be copied in for SSH tunnel support. Part of SEC-236. Co-Authored-By: Claude Opus 4.6 (1M context) --- misc/images/openssh-static/mzbuild.yml | 11 +++ src/clusterd/Cargo.toml | 1 + src/clusterd/ci/Dockerfile.distroless | 21 ++++++ src/clusterd/src/lib.rs | 87 +++++++++++++++++++++++ src/environmentd/ci/Dockerfile.distroless | 21 ++++++ src/environmentd/src/environmentd/main.rs | 19 +++++ 6 files changed, 160 insertions(+) create mode 100644 misc/images/openssh-static/mzbuild.yml create mode 100644 src/clusterd/ci/Dockerfile.distroless create mode 100644 src/environmentd/ci/Dockerfile.distroless diff --git a/misc/images/openssh-static/mzbuild.yml b/misc/images/openssh-static/mzbuild.yml new file mode 100644 index 0000000000000..781522b8b9d3f --- /dev/null +++ b/misc/images/openssh-static/mzbuild.yml @@ -0,0 +1,11 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. 
+# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +name: openssh-static +description: Statically-linked OpenSSH ssh client built against AWS-LC-FIPS. diff --git a/src/clusterd/Cargo.toml b/src/clusterd/Cargo.toml index 309f01048dc87..f072e636a2a59 100644 --- a/src/clusterd/Cargo.toml +++ b/src/clusterd/Cargo.toml @@ -17,6 +17,7 @@ fail.workspace = true futures.workspace = true hyper.workspace = true hyper-util.workspace = true +libc.workspace = true mz-alloc = { path = "../alloc" } mz-alloc-default = { path = "../alloc-default", optional = true } mz-build-info = { path = "../build-info" } diff --git a/src/clusterd/ci/Dockerfile.distroless b/src/clusterd/ci/Dockerfile.distroless new file mode 100644 index 0000000000000..beb8f7800760c --- /dev/null +++ b/src/clusterd/ci/Dockerfile.distroless @@ -0,0 +1,21 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# Distroless variant of the clusterd image. Requires the entrypoint +# logic to be compiled into the binary (eatmydata, Kubernetes detection, +# env var defaults) rather than handled by a bash script. 
+ +MZFROM openssh-static AS openssh + +MZFROM distroless-prod-base + +COPY clusterd /usr/local/bin/ +COPY --from=openssh /output/ssh /usr/bin/ssh + +ENTRYPOINT ["/usr/local/bin/clusterd"] diff --git a/src/clusterd/src/lib.rs b/src/clusterd/src/lib.rs index 2e0a6eb08c41a..84a326b57ab8c 100644 --- a/src/clusterd/src/lib.rs +++ b/src/clusterd/src/lib.rs @@ -47,6 +47,45 @@ mod usage_metrics; const BUILD_INFO: BuildInfo = build_info!(); +/// Resolves a short hostname to its FQDN using `getaddrinfo` with +/// `AI_CANONNAME`, equivalent to `hostname --fqdn`. Falls back to the +/// short hostname if DNS resolution fails. +fn resolve_fqdn(short_hostname: &str) -> String { + use std::ffi::{CStr, CString}; + use std::ptr; + + let Ok(c_host) = CString::new(short_hostname) else { + return short_hostname.to_string(); + }; + + let mut hints: libc::addrinfo = unsafe { std::mem::zeroed() }; + hints.ai_flags = libc::AI_CANONNAME; + hints.ai_family = libc::AF_UNSPEC; + + let mut result: *mut libc::addrinfo = ptr::null_mut(); + + let rc = unsafe { libc::getaddrinfo(c_host.as_ptr(), ptr::null(), &hints, &mut result) }; + + if rc != 0 || result.is_null() { + return short_hostname.to_string(); + } + + let fqdn = unsafe { + let info = &*result; + if info.ai_canonname.is_null() { + short_hostname.to_string() + } else { + CStr::from_ptr(info.ai_canonname) + .to_string_lossy() + .into_owned() + } + }; + + unsafe { libc::freeaddrinfo(result) }; + + fqdn +} + pub static VERSION: LazyLock = LazyLock::new(|| BUILD_INFO.human_version(None)); /// Independent cluster server for Materialize. @@ -171,6 +210,54 @@ struct Args { pub fn main() { mz_ore::panic::install_enhanced_handler(); + // When running in Kubernetes, auto-detect the GRPC host from the pod's FQDN + // and the process index from the StatefulSet ordinal. These are set as env + // vars so that clap picks them up as defaults (they can still be overridden + // via explicit env vars or CLI args). 
+ // + // SAFETY: Called before any threads are spawned (main entry point, single + // threaded), so modifying env vars is safe. + if std::env::var("KUBERNETES_SERVICE_HOST").is_ok() { + if std::env::var("CLUSTERD_GRPC_HOST").is_err() { + // Resolve the pod's FQDN via DNS, equivalent to `hostname --fqdn`. + // In Kubernetes, /etc/hostname only has the short name (e.g., + // "clusterd-0"), but GRPC validation needs the FQDN (e.g., + // "clusterd-0.clusterd.ns.svc.cluster.local"). We resolve the + // short hostname through DNS to get the canonical name. + // + // This avoids shelling out to `hostname --fqdn` which isn't + // available in distroless images. + // Use the gethostname() syscall, matching what `hostname --fqdn` + // does internally, then resolve to FQDN via getaddrinfo. + let mut buf = [0u8; 256]; + if unsafe { libc::gethostname(buf.as_mut_ptr().cast(), buf.len()) } == 0 { + let hostname = unsafe { std::ffi::CStr::from_ptr(buf.as_ptr().cast()) }; + if let Ok(short) = hostname.to_str() { + let fqdn = resolve_fqdn(short); + unsafe { std::env::set_var("CLUSTERD_GRPC_HOST", &fqdn) }; + } + } + } + if std::env::var("CLUSTERD_PROCESS").is_err() { + // Extract the ordinal index from the StatefulSet hostname + // (e.g., "clusterd-0" → "0"). + if let Ok(hostname) = std::env::var("HOSTNAME") { + if let Some(ordinal) = hostname.rsplit('-').next() { + unsafe { std::env::set_var("CLUSTERD_PROCESS", ordinal) }; + } + } + } + } + + // Configure LD_PRELOAD for eatmydata if requested (CI performance + // optimization). In distroless images libeatmydata.so is not available, + // so this is a no-op. 
+ if std::env::var("MZ_EAT_MY_DATA").is_ok() { + unsafe { std::env::set_var("LD_PRELOAD", "libeatmydata.so") }; + } else { + unsafe { std::env::remove_var("LD_PRELOAD") }; + } + let args = cli::parse_args(CliConfig { env_prefix: Some("CLUSTERD_"), enable_version_flag: true, diff --git a/src/environmentd/ci/Dockerfile.distroless b/src/environmentd/ci/Dockerfile.distroless new file mode 100644 index 0000000000000..839f042853e78 --- /dev/null +++ b/src/environmentd/ci/Dockerfile.distroless @@ -0,0 +1,21 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# Distroless variant of the environmentd image. Requires the entrypoint +# logic to be compiled into the binary (eatmydata, sleep-on-exit) rather +# than handled by a bash script. + +MZFROM openssh-static AS openssh + +MZFROM distroless-prod-base + +COPY environmentd /usr/local/bin/ +COPY --from=openssh /output/ssh /usr/bin/ssh + +ENTRYPOINT ["/usr/local/bin/environmentd"] diff --git a/src/environmentd/src/environmentd/main.rs b/src/environmentd/src/environmentd/main.rs index fe3eaba7e874d..9f23668f87ecf 100644 --- a/src/environmentd/src/environmentd/main.rs +++ b/src/environmentd/src/environmentd/main.rs @@ -643,6 +643,18 @@ fn aws_secrets_controller_key_alias(env_id: &EnvironmentId) -> String { } pub fn main() { + // Configure LD_PRELOAD for eatmydata if requested (CI performance + // optimization). In distroless images libeatmydata.so is not available, + // so this is a no-op. + // + // SAFETY: Called before any threads are spawned (main entry point, single + // threaded), so modifying env vars is safe. 
+ if std::env::var("MZ_EAT_MY_DATA").is_ok() { + unsafe { std::env::set_var("LD_PRELOAD", "libeatmydata.so") }; + } else { + unsafe { std::env::remove_var("LD_PRELOAD") }; + } + let args = cli::parse_args(CliConfig { env_prefix: Some("MZ_"), enable_version_flag: true, @@ -650,6 +662,13 @@ pub fn main() { if let Err(err) = run(args) { panic!("environmentd: fatal: {}", err.display_with_causes()); } + // In the previous bash entrypoint, environmentd would sleep forever after + // a graceful exit. This keeps the container alive for debugging. Replicate + // that behavior here. + eprintln!("environmentd exited gracefully; sleeping forever"); + loop { + std::thread::sleep(std::time::Duration::from_secs(86400)); + } } fn run(mut args: Args) -> Result<(), anyhow::Error> { From ff55219b28628008fb28ff46d0434494560d95d0 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Thu, 2 Apr 2026 23:24:15 -0700 Subject: [PATCH 02/19] containers: switch environmentd and clusterd to distroless Replace the Ubuntu-based Dockerfiles with distroless variants directly, delete the now-unnecessary bash entrypoint scripts, and remove the explicit LD_PRELOAD=libeatmydata.so from the mzcompose clusterd service (the MZ_EAT_MY_DATA env var triggers the Rust-side LD_PRELOAD logic which is harmless when libeatmydata.so is absent in distroless). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- Cargo.lock | 1 + .../mzcompose/services/clusterd.py | 1 - src/clusterd/ci/Dockerfile | 9 ++-- src/clusterd/ci/Dockerfile.distroless | 21 ---------- src/clusterd/ci/entrypoint.sh | 41 ------------------- src/environmentd/ci/Dockerfile | 9 ++-- src/environmentd/ci/Dockerfile.distroless | 21 ---------- src/environmentd/ci/entrypoint.sh | 25 ----------- 8 files changed, 11 insertions(+), 117 deletions(-) delete mode 100644 src/clusterd/ci/Dockerfile.distroless delete mode 100755 src/clusterd/ci/entrypoint.sh delete mode 100644 src/environmentd/ci/Dockerfile.distroless delete mode 100755 src/environmentd/ci/entrypoint.sh diff --git a/Cargo.lock b/Cargo.lock index 845f9eaa7864f..ba3626d368980 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6050,6 +6050,7 @@ dependencies = [ "futures", "hyper 1.9.0", "hyper-util", + "libc", "mz-alloc", "mz-alloc-default", "mz-build-info", diff --git a/misc/python/materialize/mzcompose/services/clusterd.py b/misc/python/materialize/mzcompose/services/clusterd.py index e07ca490a5355..ce882b67c39d6 100644 --- a/misc/python/materialize/mzcompose/services/clusterd.py +++ b/misc/python/materialize/mzcompose/services/clusterd.py @@ -47,7 +47,6 @@ def __init__( "CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878", "CLUSTERD_SECRETS_READER=local-file", "CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets", - "LD_PRELOAD=libeatmydata.so", f"CLUSTERD_PERSIST_PUBSUB_URL=http://{mz_service}:6879", *environment_extra, ] diff --git a/src/clusterd/ci/Dockerfile b/src/clusterd/ci/Dockerfile index 6df26f293a729..457a0d9c78b30 100644 --- a/src/clusterd/ci/Dockerfile +++ b/src/clusterd/ci/Dockerfile @@ -7,10 +7,11 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0. 
-MZFROM prod-base +MZFROM openssh-static AS openssh -COPY clusterd entrypoint.sh /usr/local/bin/ +MZFROM distroless-prod-base -USER materialize +COPY clusterd /usr/local/bin/ +COPY --from=openssh /output/ssh /usr/bin/ssh -ENTRYPOINT ["tini", "--", "entrypoint.sh"] +ENTRYPOINT ["/usr/local/bin/clusterd"] diff --git a/src/clusterd/ci/Dockerfile.distroless b/src/clusterd/ci/Dockerfile.distroless deleted file mode 100644 index beb8f7800760c..0000000000000 --- a/src/clusterd/ci/Dockerfile.distroless +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright Materialize, Inc. and contributors. All rights reserved. -# -# Use of this software is governed by the Business Source License -# included in the LICENSE file at the root of this repository. -# -# As of the Change Date specified in that file, in accordance with -# the Business Source License, use of this software will be governed -# by the Apache License, Version 2.0. - -# Distroless variant of the clusterd image. Requires the entrypoint -# logic to be compiled into the binary (eatmydata, Kubernetes detection, -# env var defaults) rather than handled by a bash script. - -MZFROM openssh-static AS openssh - -MZFROM distroless-prod-base - -COPY clusterd /usr/local/bin/ -COPY --from=openssh /output/ssh /usr/bin/ssh - -ENTRYPOINT ["/usr/local/bin/clusterd"] diff --git a/src/clusterd/ci/entrypoint.sh b/src/clusterd/ci/entrypoint.sh deleted file mode 100755 index 6c3b749ac2fa5..0000000000000 --- a/src/clusterd/ci/entrypoint.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash - -# Copyright Materialize, Inc. and contributors. All rights reserved. -# -# Use of this software is governed by the Business Source License -# included in the LICENSE file at the root of this repository. -# -# As of the Change Date specified in that file, in accordance with -# the Business Source License, use of this software will be governed -# by the Apache License, Version 2.0. 
- -set -euo pipefail - -# We pass default arguments as environment variables, and only if those -# environment variables do not already exist, to allow users to override these -# arguments when running the container via either environment variables or -# command-line arguments. -export CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=${CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR:-0.0.0.0:2100} -export CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=${CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR:-0.0.0.0:2101} -export CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=${CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR:-0.0.0.0:6878} -export CLUSTERD_SECRETS_READER=${CLUSTERD_SECRETS_READER:-local-file} -export CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=${CLUSTERD_SECRETS_READER_LOCAL_DIR:-/mzdata/secrets} - -if [[ "${KUBERNETES_SERVICE_HOST:-}" ]]; then - # Pass the host's FQDN as the host to be used for GRPC request validation - # only when running in Kubernetes. In other contexts (like when running - # locally, or in Docker), this is likely not desirable. - export CLUSTERD_GRPC_HOST=${CLUSTERD_GRPC_HOST:-$(hostname --fqdn)} - - # When running in Kubernetes, pass the StatefulSet replica's ordinal index - # as the process index. - export CLUSTERD_PROCESS=${CLUSTERD_PROCESS:-${HOSTNAME##*-}} -fi - -if [ -z "${MZ_EAT_MY_DATA:-}" ]; then - unset LD_PRELOAD -else - export LD_PRELOAD=libeatmydata.so -fi - -exec clusterd "$@" diff --git a/src/environmentd/ci/Dockerfile b/src/environmentd/ci/Dockerfile index 4ddef7ead92aa..376bb13b2853c 100644 --- a/src/environmentd/ci/Dockerfile +++ b/src/environmentd/ci/Dockerfile @@ -7,10 +7,11 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0. 
-MZFROM prod-base +MZFROM openssh-static AS openssh -COPY environmentd entrypoint.sh /usr/local/bin/ +MZFROM distroless-prod-base -USER materialize +COPY environmentd /usr/local/bin/ +COPY --from=openssh /output/ssh /usr/bin/ssh -ENTRYPOINT ["tini", "--", "entrypoint.sh"] +ENTRYPOINT ["/usr/local/bin/environmentd"] diff --git a/src/environmentd/ci/Dockerfile.distroless b/src/environmentd/ci/Dockerfile.distroless deleted file mode 100644 index 839f042853e78..0000000000000 --- a/src/environmentd/ci/Dockerfile.distroless +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright Materialize, Inc. and contributors. All rights reserved. -# -# Use of this software is governed by the Business Source License -# included in the LICENSE file at the root of this repository. -# -# As of the Change Date specified in that file, in accordance with -# the Business Source License, use of this software will be governed -# by the Apache License, Version 2.0. - -# Distroless variant of the environmentd image. Requires the entrypoint -# logic to be compiled into the binary (eatmydata, sleep-on-exit) rather -# than handled by a bash script. - -MZFROM openssh-static AS openssh - -MZFROM distroless-prod-base - -COPY environmentd /usr/local/bin/ -COPY --from=openssh /output/ssh /usr/bin/ssh - -ENTRYPOINT ["/usr/local/bin/environmentd"] diff --git a/src/environmentd/ci/entrypoint.sh b/src/environmentd/ci/entrypoint.sh deleted file mode 100755 index ff2f2337c2404..0000000000000 --- a/src/environmentd/ci/entrypoint.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -# Copyright Materialize, Inc. and contributors. All rights reserved. -# -# Use of this software is governed by the Business Source License -# included in the LICENSE file at the root of this repository. -# -# As of the Change Date specified in that file, in accordance with -# the Business Source License, use of this software will be governed -# by the Apache License, Version 2.0. 
- -set -euo pipefail - -if [ -z "${MZ_EAT_MY_DATA:-}" ]; then - unset LD_PRELOAD -else - export LD_PRELOAD=libeatmydata.so -fi - -if environmentd "$@"; then - echo "environmentd exited gracefully; sleeping forever" >&2 - sleep infinity -else - exit $? -fi From 8b3aa6e369b37629e23f1222d8b75f0819181884 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Thu, 2 Apr 2026 23:25:43 -0700 Subject: [PATCH 03/19] containers: add libeatmydata to distroless-prod-base for CI Copy libeatmydata.so from a Debian image into the distroless base so that CI tests using MZ_EAT_MY_DATA=1 continue to benefit from fsync elision. The library is inert in production (MZ_EAT_MY_DATA is unset). Co-Authored-By: Claude Opus 4.6 (1M context) --- misc/images/distroless-prod-base/Dockerfile | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/misc/images/distroless-prod-base/Dockerfile b/misc/images/distroless-prod-base/Dockerfile index a685f9b0e8136..42e737133729b 100644 --- a/misc/images/distroless-prod-base/Dockerfile +++ b/misc/images/distroless-prod-base/Dockerfile @@ -10,7 +10,18 @@ # This is a separate mzimage so that we don't have to re-install the apt things # every time we get a CI builder with a cold cache. +# Extract libeatmydata from a Debian image for CI use. eatmydata disables +# fsync for faster test execution. Services opt in via MZ_EAT_MY_DATA=1. +FROM debian:trixie-slim AS eatmydata +RUN apt-get update && apt-get install -y --no-install-recommends eatmydata \ + && rm -rf /var/lib/apt/lists/* + FROM gcr.io/distroless/cc-debian13:nonroot-28078d2e5e77671d2046dcc9e2c75334e31efa4d + +# Copy libeatmydata for CI performance optimization (no-op if MZ_EAT_MY_DATA +# is unset; harmless in production). 
+COPY --from=eatmydata /usr/lib/*/libeatmydata.so /usr/lib/ + USER nonroot ENV HOME=/home/nonroot ENTRYPOINT [] From 694419260a1d8657c0c48c7ad886cc3025e3f759 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Thu, 2 Apr 2026 23:28:35 -0700 Subject: [PATCH 04/19] containers: switch jobs image to distroless The jobs image only contains Rust binaries (persistcli, mz-catalog-debug) with no shell or tool dependencies. Co-Authored-By: Claude Opus 4.6 (1M context) --- misc/images/jobs/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc/images/jobs/Dockerfile b/misc/images/jobs/Dockerfile index 2fbc48d39861e..c0919ea533e3e 100644 --- a/misc/images/jobs/Dockerfile +++ b/misc/images/jobs/Dockerfile @@ -7,7 +7,7 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0. -MZFROM prod-base +MZFROM distroless-prod-base COPY persistcli mz-catalog-debug /usr/local/bin/ From 48a43a964b08f6e82234b4bb94044f56de2c3e56 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Thu, 2 Apr 2026 23:31:08 -0700 Subject: [PATCH 05/19] containers: include openssh-static Dockerfile for mzbuild The mzbuild system expects a Dockerfile next to every mzbuild.yml. Include the static OpenSSH build Dockerfile so the pipeline can resolve the openssh-static image dependency. Co-Authored-By: Claude Opus 4.6 (1M context) --- misc/images/openssh-static/Dockerfile | 84 +++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 misc/images/openssh-static/Dockerfile diff --git a/misc/images/openssh-static/Dockerfile b/misc/images/openssh-static/Dockerfile new file mode 100644 index 0000000000000..8bf66de047acc --- /dev/null +++ b/misc/images/openssh-static/Dockerfile @@ -0,0 +1,84 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. 
+# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# Build a statically-linked OpenSSH `ssh` client binary using AWS-LC-FIPS +# as the crypto backend. This enables FIPS 140-3 compliant SSH tunnels +# from distroless container images. +# +# OpenSSH natively supports AWS-LC as a crypto backend (no patches needed). +# See: https://github.com/openssh/openssh-portable/blob/master/INSTALL +# +# Usage: +# docker build -t openssh-static . +# docker create --name extract openssh-static +# docker cp extract:/output/ssh ./ssh +# docker rm extract + +FROM ubuntu:noble-20260210.1 AS builder + +ARG AWS_LC_VERSION=v1.54.0 +ARG OPENSSH_VERSION=V_9_9_P2 +ARG ZLIB_VERSION=1.3.1 + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + autoconf \ + automake \ + build-essential \ + cmake \ + git \ + golang \ + ninja-build \ + perl \ + pkg-config \ + wget \ + && rm -rf /var/lib/apt/lists/* + +# Build AWS-LC in FIPS mode as a static library. +# The FIPS build requires Go for the delocator and Perl for configuration. +WORKDIR /build/aws-lc +RUN git clone --depth 1 --branch ${AWS_LC_VERSION} https://github.com/aws/aws-lc.git . \ + && cmake -GNinja -B build \ + -DFIPS=1 \ + -DBUILD_SHARED_LIBS=0 \ + -DBUILD_TESTING=OFF \ + -DBUILD_TOOL=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=/opt/aws-lc \ + -DCMAKE_C_FLAGS="-fPIC" \ + && ninja -C build \ + && ninja -C build install + +# Build zlib as a static library. +WORKDIR /build/zlib +RUN wget -qO- https://zlib.net/zlib-${ZLIB_VERSION}.tar.gz | tar xz --strip-components=1 \ + && ./configure --static --prefix=/opt/zlib \ + && make -j"$(nproc)" \ + && make install + +# Build OpenSSH ssh client against AWS-LC. 
+WORKDIR /build/openssh +RUN git clone --depth 1 --branch ${OPENSSH_VERSION} https://github.com/openssh/openssh-portable.git . \ + && autoreconf \ + && ./configure \ + --with-ssl-dir=/opt/aws-lc \ + --with-zlib=/opt/zlib \ + --with-ldflags=-static \ + --without-pam \ + --without-libedit \ + --with-privsep-user=nobody \ + && make -j"$(nproc)" ssh \ + && strip ssh + +# Verify the binary is statically linked and functional. +RUN file ssh | grep -q 'statically linked' \ + && ./ssh -V + +# Output stage: just the binary. +FROM scratch +COPY --from=builder /build/openssh/ssh /output/ssh From 6bd5aff810f4312c91e668d06b8ba3cab0d8a075 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Fri, 3 Apr 2026 00:00:14 -0700 Subject: [PATCH 06/19] containers: add ca-certificates to openssh-static builder The git clone of aws-lc from GitHub fails with "server certificate verification failed" because the ubuntu:noble base image doesn't include CA certificates by default. Co-Authored-By: Claude Opus 4.6 (1M context) --- misc/images/openssh-static/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/misc/images/openssh-static/Dockerfile b/misc/images/openssh-static/Dockerfile index 8bf66de047acc..45dc6260508f6 100644 --- a/misc/images/openssh-static/Dockerfile +++ b/misc/images/openssh-static/Dockerfile @@ -30,6 +30,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins autoconf \ automake \ build-essential \ + ca-certificates \ cmake \ git \ golang \ From 7064ea3b07f9a96055480ac225a242e112e6b2f9 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Fri, 3 Apr 2026 00:16:42 -0700 Subject: [PATCH 07/19] containers: revert jobs image to prod-base The jobs image is used in CI tests with mzcompose's idle feature which overrides the entrypoint to ["sleep", "infinity"]. Distroless images don't have the sleep binary, so keep this CI-only image on Ubuntu. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- misc/images/jobs/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc/images/jobs/Dockerfile b/misc/images/jobs/Dockerfile index c0919ea533e3e..2fbc48d39861e 100644 --- a/misc/images/jobs/Dockerfile +++ b/misc/images/jobs/Dockerfile @@ -7,7 +7,7 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0. -MZFROM distroless-prod-base +MZFROM prod-base COPY persistcli mz-catalog-debug /usr/local/bin/ From 05bbdc848026cb7f4296092bfeef4a36c51a20f8 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Fri, 3 Apr 2026 07:30:17 -0700 Subject: [PATCH 08/19] containers: make AWS-LC FIPS mode opt-in for openssh-static Change the static OpenSSH build to use plain AWS-LC by default (faster, no Go dependency) with FIPS mode available via --build-arg AWS_LC_FIPS=1. AWS-LC is a drop-in replacement for OpenSSL that's faster and smaller. FIPS 140-3 validation is an additional layer only needed for compliance builds, not for all builds. Co-Authored-By: Claude Opus 4.6 (1M context) --- misc/images/openssh-static/Dockerfile | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/misc/images/openssh-static/Dockerfile b/misc/images/openssh-static/Dockerfile index 45dc6260508f6..b9270874ec152 100644 --- a/misc/images/openssh-static/Dockerfile +++ b/misc/images/openssh-static/Dockerfile @@ -7,13 +7,16 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0. -# Build a statically-linked OpenSSH `ssh` client binary using AWS-LC-FIPS -# as the crypto backend. This enables FIPS 140-3 compliant SSH tunnels -# from distroless container images. +# Build a statically-linked OpenSSH `ssh` client binary using AWS-LC +# as the crypto backend. 
AWS-LC is a faster, smaller alternative to +# OpenSSL that also supports FIPS 140-3 validation when needed. # # OpenSSH natively supports AWS-LC as a crypto backend (no patches needed). # See: https://github.com/openssh/openssh-portable/blob/master/INSTALL # +# To enable FIPS mode, build with: --build-arg AWS_LC_FIPS=1 +# (requires Go for the FIPS delocator) +# # Usage: # docker build -t openssh-static . # docker create --name extract openssh-static @@ -23,6 +26,7 @@ FROM ubuntu:noble-20260210.1 AS builder ARG AWS_LC_VERSION=v1.54.0 +ARG AWS_LC_FIPS=0 ARG OPENSSH_VERSION=V_9_9_P2 ARG ZLIB_VERSION=1.3.1 @@ -40,12 +44,12 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins wget \ && rm -rf /var/lib/apt/lists/* -# Build AWS-LC in FIPS mode as a static library. -# The FIPS build requires Go for the delocator and Perl for configuration. +# Build AWS-LC as a static library. +# When AWS_LC_FIPS=1, enables FIPS mode (requires Go for the delocator). WORKDIR /build/aws-lc RUN git clone --depth 1 --branch ${AWS_LC_VERSION} https://github.com/aws/aws-lc.git . \ && cmake -GNinja -B build \ - -DFIPS=1 \ + $([ "$AWS_LC_FIPS" = "1" ] && echo "-DFIPS=1") \ -DBUILD_SHARED_LIBS=0 \ -DBUILD_TESTING=OFF \ -DBUILD_TOOL=OFF \ From 56783d1531f9f9b7611fb20c52310752936b85e5 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Fri, 3 Apr 2026 08:39:08 -0700 Subject: [PATCH 09/19] ci: retrigger build From 8cc785bd6ad7e52fcab92d4a67ce690e3b4364b8 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Fri, 3 Apr 2026 09:08:20 -0700 Subject: [PATCH 10/19] containers: use GitHub mirror for zlib download MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit zlib.net is unreliable in CI — the download has failed twice. Use the GitHub releases mirror which is more stable. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- misc/images/openssh-static/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc/images/openssh-static/Dockerfile b/misc/images/openssh-static/Dockerfile index b9270874ec152..2df26051727a4 100644 --- a/misc/images/openssh-static/Dockerfile +++ b/misc/images/openssh-static/Dockerfile @@ -61,7 +61,7 @@ RUN git clone --depth 1 --branch ${AWS_LC_VERSION} https://github.com/aws/aws-lc # Build zlib as a static library. WORKDIR /build/zlib -RUN wget -qO- https://zlib.net/zlib-${ZLIB_VERSION}.tar.gz | tar xz --strip-components=1 \ +RUN wget -qO- https://github.com/madler/zlib/releases/download/v${ZLIB_VERSION}/zlib-${ZLIB_VERSION}.tar.gz | tar xz --strip-components=1 \ && ./configure --static --prefix=/opt/zlib \ && make -j"$(nproc)" \ && make install From 55ddda1f0d580f898c732b18a3226d123d4dbc81 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Fri, 3 Apr 2026 09:37:45 -0700 Subject: [PATCH 11/19] containers: fix OpenSSH build with AWS-LC (BN_FLG_CONSTTIME) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AWS-LC (like BoringSSL) doesn't define BN_FLG_CONSTTIME. OpenSSH V_9_9_P2 uses it in ssh-rsa.c. Define it to 0 via CFLAGS — the constant is only used with BN_set_flags which AWS-LC already shims to a no-op. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- misc/images/openssh-static/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc/images/openssh-static/Dockerfile b/misc/images/openssh-static/Dockerfile index 2df26051727a4..6b4374b1c4a23 100644 --- a/misc/images/openssh-static/Dockerfile +++ b/misc/images/openssh-static/Dockerfile @@ -77,7 +77,7 @@ RUN git clone --depth 1 --branch ${OPENSSH_VERSION} https://github.com/openssh/o --without-pam \ --without-libedit \ --with-privsep-user=nobody \ - && make -j"$(nproc)" ssh \ + && make -j"$(nproc)" ssh CFLAGS="-DBN_FLG_CONSTTIME=0" \ && strip ssh # Verify the binary is statically linked and functional. From 3d27ed871169020ea837895860a0d5d2d27f4b12 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Thu, 9 Apr 2026 17:37:34 -0700 Subject: [PATCH 12/19] orchestratord: version-gate pod security context UID/GID for distroless During rolling upgrades, orchestratord may manage pods running old Ubuntu-based images (uid/gid 999 `materialize` user) alongside new distroless images (uid/gid 65534 `nonroot` user). 
Gate the PodSecurityContext uid/gid based on the image version to prevent permission mismatches: - environmentd/clusterd: use meets_minimum_version(V26_20_0) to select 65534 for distroless images, 999 for older Ubuntu-based images - balancerd: parse the image ref directly (balancerd switched to distroless in v26.19 via #35631, earlier than environmentd/clusterd) - Make parse_image_ref public for reuse by the balancer controller Co-Authored-By: Claude Opus 4.6 (1M context) --- src/cloud-resources/src/crd/materialize.rs | 2 +- src/orchestratord/src/controller/balancer.rs | 38 ++++++++++++++++--- .../src/controller/materialize/generation.rs | 27 ++++++++++--- 3 files changed, 55 insertions(+), 12 deletions(-) diff --git a/src/cloud-resources/src/crd/materialize.rs b/src/cloud-resources/src/crd/materialize.rs index 51d80127fa557..f4f6539b56c03 100644 --- a/src/cloud-resources/src/crd/materialize.rs +++ b/src/cloud-resources/src/crd/materialize.rs @@ -655,7 +655,7 @@ pub mod v1alpha1 { } } -fn parse_image_ref(image_ref: &str) -> Option<Version> { +pub fn parse_image_ref(image_ref: &str) -> Option<Version> { image_ref .rsplit_once(':') .and_then(|(_repo, tag)| tag.strip_prefix('v')) diff --git a/src/orchestratord/src/controller/balancer.rs b/src/orchestratord/src/controller/balancer.rs index 08f689a9360f6..512367efac00c 100644 --- a/src/orchestratord/src/controller/balancer.rs +++ b/src/orchestratord/src/controller/balancer.rs @@ -41,6 +41,7 @@ use mz_cloud_resources::crd::{ ManagedResource, balancer::v1alpha1::{Balancer, Routing}, generated::cert_manager::certificates::{Certificate, CertificatePrivateKeyAlgorithm}, + materialize::parse_image_ref, }; use mz_orchestrator_kubernetes::KubernetesImagePullPolicy; use mz_ore::{cli::KeyValueArg, instrument}; @@ -159,6 +160,28 @@ impl Context { ) } + fn pod_uid_gid(image_ref: &str) -> i64 { + // Distroless images (v26.19+) run as the `nonroot` user (uid/gid 65534). + // Older Ubuntu-based images use the `materialize` user (uid/gid 999). 
+ static V26_19_0: std::sync::LazyLock = + std::sync::LazyLock::new(|| semver::Version { + major: 26, + minor: 19, + patch: 0, + pre: semver::Prerelease::new("dev.0").expect("dev.0 is valid prerelease"), + build: semver::BuildMetadata::new("").expect("empty string is valid buildmetadata"), + }); + let is_distroless = parse_image_ref(image_ref) + .map(|v| v.cmp_precedence(&V26_19_0).is_ge()) + // Unparseable image refs are assumed to be recent dev builds. + .unwrap_or(true); + if is_distroless { + 65534 + } else { + 999 + } + } + fn create_deployment_object(&self, balancer: &Balancer) -> anyhow::Result { let security_context = if self.config.enable_security_context { // Since we want to adhere to the most restrictive security context, all @@ -388,12 +411,15 @@ impl Context { ), affinity: self.config.balancerd_affinity.clone(), tolerations: self.config.balancerd_tolerations.clone(), - security_context: Some(PodSecurityContext { - fs_group: Some(999), - run_as_user: Some(999), - run_as_group: Some(999), - ..Default::default() - }), + security_context: { + let uid_gid = Self::pod_uid_gid(&balancer.spec.balancerd_image_ref); + Some(PodSecurityContext { + fs_group: Some(uid_gid), + run_as_user: Some(uid_gid), + run_as_group: Some(uid_gid), + ..Default::default() + }) + }, scheduler_name: self.config.scheduler_name.clone(), volumes: Some(volumes), ..Default::default() diff --git a/src/orchestratord/src/controller/materialize/generation.rs b/src/orchestratord/src/controller/materialize/generation.rs index 3717a9b2ebd6f..7a9997e7914d4 100644 --- a/src/orchestratord/src/controller/materialize/generation.rs +++ b/src/orchestratord/src/controller/materialize/generation.rs @@ -81,6 +81,14 @@ static V26_1_0: LazyLock = LazyLock::new(|| Version { build: BuildMetadata::new("").expect("empty string is valid buildmetadata"), }); +static V26_20_0: LazyLock = LazyLock::new(|| Version { + major: 26, + minor: 20, + patch: 0, + pre: Prerelease::new("dev.0").expect("dev.0 is valid 
prerelease"), + build: BuildMetadata::new("").expect("empty string is valid buildmetadata"), +}); + /// Describes the status of a deployment. /// /// This is a simplified representation of `DeploymentState`, suitable for @@ -864,8 +872,17 @@ fn create_environmentd_statefulset_object( ephemeral_volume_class )); } - // The `materialize` user used by clusterd always has gid 999. - args.push("--orchestrator-kubernetes-service-fs-group=999".to_string()); + // Distroless images (v26.20+) run as the `nonroot` user (uid/gid 65534). + // Older Ubuntu-based images use the `materialize` user (uid/gid 999). + let service_fs_group: i64 = if mz.meets_minimum_version(&V26_20_0) { + 65534 + } else { + 999 + }; + args.push(format!( + "--orchestrator-kubernetes-service-fs-group={}", + service_fs_group + )); // Add system_param configmap // This feature was enabled in 0.163 but did not have testing until after 0.164. @@ -1218,9 +1235,9 @@ fn create_environmentd_statefulset_object( service_account_name: Some(mz.service_account_name()), volumes: Some(volumes), security_context: Some(PodSecurityContext { - fs_group: Some(999), - run_as_user: Some(999), - run_as_group: Some(999), + fs_group: Some(service_fs_group), + run_as_user: Some(service_fs_group), + run_as_group: Some(service_fs_group), ..Default::default() }), tolerations, From 3558bad2a5f38080bcda17fef6eff325a38c8314 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Thu, 9 Apr 2026 18:27:04 -0700 Subject: [PATCH 13/19] distroless: address review feedback for safety, logging, and docs Add warning logs for DNS resolution failures and unparseable image refs, clarify version-gate comments (V26_19 balancerd vs V26_20 environmentd), document PVC ownership migration, refine unsafe SAFETY comments, and remove irrelevant --with-privsep-user from OpenSSH client build. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- misc/images/openssh-static/Dockerfile | 5 ++++- misc/images/openssh-static/mzbuild.yml | 2 +- .../mzcompose/services/clusterd.py | 3 +++ src/clusterd/src/lib.rs | 11 ++++++++-- src/environmentd/src/environmentd/main.rs | 7 +++--- src/orchestratord/src/controller/balancer.rs | 22 +++++++++++-------- .../src/controller/materialize/generation.rs | 9 ++++++++ 7 files changed, 43 insertions(+), 16 deletions(-) diff --git a/misc/images/openssh-static/Dockerfile b/misc/images/openssh-static/Dockerfile index 6b4374b1c4a23..3410ecbc78e67 100644 --- a/misc/images/openssh-static/Dockerfile +++ b/misc/images/openssh-static/Dockerfile @@ -76,7 +76,10 @@ RUN git clone --depth 1 --branch ${OPENSSH_VERSION} https://github.com/openssh/o --with-ldflags=-static \ --without-pam \ --without-libedit \ - --with-privsep-user=nobody \ + # AWS-LC does not define the legacy OpenSSL BN_FLG_CONSTTIME flag. + # Setting it to 0 satisfies #ifdef checks in OpenSSH source code. + # This is safe: AWS-LC handles constant-time bignum operations + # internally and does not rely on this flag. && make -j"$(nproc)" ssh CFLAGS="-DBN_FLG_CONSTTIME=0" \ && strip ssh diff --git a/misc/images/openssh-static/mzbuild.yml b/misc/images/openssh-static/mzbuild.yml index 781522b8b9d3f..d7c1b2b4e4cc5 100644 --- a/misc/images/openssh-static/mzbuild.yml +++ b/misc/images/openssh-static/mzbuild.yml @@ -8,4 +8,4 @@ # by the Apache License, Version 2.0. name: openssh-static -description: Statically-linked OpenSSH ssh client built against AWS-LC-FIPS. +description: Statically-linked OpenSSH ssh client built against AWS-LC (FIPS optional). 
diff --git a/misc/python/materialize/mzcompose/services/clusterd.py b/misc/python/materialize/mzcompose/services/clusterd.py index ce882b67c39d6..83e5089db10b7 100644 --- a/misc/python/materialize/mzcompose/services/clusterd.py +++ b/misc/python/materialize/mzcompose/services/clusterd.py @@ -79,6 +79,9 @@ def __init__( # Override the materialized entrypoint so that `clusterd` is invoked # via the command rather than via the entrypoint. This keeps # `c.exec()` working (it prepends the entrypoint to exec commands). + # Note: mzcompose uses the Ubuntu-based `materialized` image (with + # tini/bash), while production uses the distroless `clusterd` image. + # Keep this in mind when debugging CI-vs-prod discrepancies. config["entrypoint"] = ["tini", "--"] # Depending on the Docker Compose version, this may either work or be diff --git a/src/clusterd/src/lib.rs b/src/clusterd/src/lib.rs index 84a326b57ab8c..32d5ce6181a49 100644 --- a/src/clusterd/src/lib.rs +++ b/src/clusterd/src/lib.rs @@ -67,6 +67,11 @@ fn resolve_fqdn(short_hostname: &str) -> String { let rc = unsafe { libc::getaddrinfo(c_host.as_ptr(), ptr::null(), &hints, &mut result) }; if rc != 0 || result.is_null() { + eprintln!( + "warning: getaddrinfo failed for {:?} (rc={}); falling back to short hostname. \ + GRPC host validation may not work correctly.", + short_hostname, rc + ); return short_hostname.to_string(); } @@ -215,8 +220,10 @@ pub fn main() { // vars so that clap picks them up as defaults (they can still be overridden // via explicit env vars or CLI args). // - // SAFETY: Called before any threads are spawned (main entry point, single - // threaded), so modifying env vars is safe. + // SAFETY: Called before any threads are spawned. + // `install_enhanced_handler` above only registers a panic hook; it does + // not spawn threads. The hook spawns a thread only if a panic fires, + // which cannot happen between here and the first `unsafe` call below. 
if std::env::var("KUBERNETES_SERVICE_HOST").is_ok() { if std::env::var("CLUSTERD_GRPC_HOST").is_err() { // Resolve the pod's FQDN via DNS, equivalent to `hostname --fqdn`. diff --git a/src/environmentd/src/environmentd/main.rs b/src/environmentd/src/environmentd/main.rs index 9f23668f87ecf..bccce3a07510d 100644 --- a/src/environmentd/src/environmentd/main.rs +++ b/src/environmentd/src/environmentd/main.rs @@ -647,8 +647,8 @@ pub fn main() { // optimization). In distroless images libeatmydata.so is not available, // so this is a no-op. // - // SAFETY: Called before any threads are spawned (main entry point, single - // threaded), so modifying env vars is safe. + // SAFETY: Called at the very start of main(), before any threads are + // spawned or other initialization runs. if std::env::var("MZ_EAT_MY_DATA").is_ok() { unsafe { std::env::set_var("LD_PRELOAD", "libeatmydata.so") }; } else { @@ -664,7 +664,8 @@ pub fn main() { } // In the previous bash entrypoint, environmentd would sleep forever after // a graceful exit. This keeps the container alive for debugging. Replicate - // that behavior here. + // that behavior here. In practice this is unreachable: run() blocks + // forever via thread::park(). Kept as a defensive safety net. eprintln!("environmentd exited gracefully; sleeping forever"); loop { std::thread::sleep(std::time::Duration::from_secs(86400)); diff --git a/src/orchestratord/src/controller/balancer.rs b/src/orchestratord/src/controller/balancer.rs index 512367efac00c..727523251d5d3 100644 --- a/src/orchestratord/src/controller/balancer.rs +++ b/src/orchestratord/src/controller/balancer.rs @@ -163,6 +163,8 @@ impl Context { fn pod_uid_gid(image_ref: &str) -> i64 { // Distroless images (v26.19+) run as the `nonroot` user (uid/gid 65534). // Older Ubuntu-based images use the `materialize` user (uid/gid 999). + // Note: balancerd transitioned to distroless one release earlier than + // environmentd/clusterd (which use V26_20_0 in generation.rs). 
static V26_19_0: std::sync::LazyLock = std::sync::LazyLock::new(|| semver::Version { major: 26, @@ -171,15 +173,17 @@ impl Context { pre: semver::Prerelease::new("dev.0").expect("dev.0 is valid prerelease"), build: semver::BuildMetadata::new("").expect("empty string is valid buildmetadata"), }); - let is_distroless = parse_image_ref(image_ref) - .map(|v| v.cmp_precedence(&V26_19_0).is_ge()) - // Unparseable image refs are assumed to be recent dev builds. - .unwrap_or(true); - if is_distroless { - 65534 - } else { - 999 - } + let is_distroless = match parse_image_ref(image_ref) { + Some(v) => v.cmp_precedence(&V26_19_0).is_ge(), + None => { + tracing::warn!( + image_ref, + "failed to parse balancerd image ref; assuming distroless" + ); + true + } + }; + if is_distroless { 65534 } else { 999 } } fn create_deployment_object(&self, balancer: &Balancer) -> anyhow::Result { diff --git a/src/orchestratord/src/controller/materialize/generation.rs b/src/orchestratord/src/controller/materialize/generation.rs index 7a9997e7914d4..ff32b7fd8a3fa 100644 --- a/src/orchestratord/src/controller/materialize/generation.rs +++ b/src/orchestratord/src/controller/materialize/generation.rs @@ -81,6 +81,9 @@ static V26_1_0: LazyLock = LazyLock::new(|| Version { build: BuildMetadata::new("").expect("empty string is valid buildmetadata"), }); +/// Minimum version for distroless environmentd/clusterd images (nonroot +/// uid/gid 65534). Balancerd transitioned one release earlier at V26_19_0 +/// (see balancer.rs). static V26_20_0: LazyLock = LazyLock::new(|| Version { major: 26, minor: 20, @@ -874,6 +877,12 @@ fn create_environmentd_statefulset_object( } // Distroless images (v26.20+) run as the `nonroot` user (uid/gid 65534). // Older Ubuntu-based images use the `materialize` user (uid/gid 999). + // This value is used for both the environmentd pod security context and + // the --orchestrator-kubernetes-service-fs-group arg (which controls + // clusterd pod security contexts). 
Both transition at the same version. + // Note: Kubernetes fsGroup re-chowns volume contents on mount, so + // existing PVCs with UID 999 files will be migrated automatically + // (may add startup latency for large volumes). let service_fs_group: i64 = if mz.meets_minimum_version(&V26_20_0) { 65534 } else { From 63809b79f0e630c356562c067fb1ade7816bb58e Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Thu, 9 Apr 2026 19:06:58 -0700 Subject: [PATCH 14/19] distroless: address Alphadelta14 review feedback - Remove dead LD_PRELOAD in-process logic from clusterd and environmentd: the dynamic linker processes LD_PRELOAD at _start, before main(), so setting it via std::env::set_var has no effect on the current process. - Add LD_PRELOAD=libeatmydata.so back to clusterd.py as a container env var, which is the mechanism that actually works. - Use nix::unistd::gethostname() instead of raw libc::gethostname for safer hostname resolution (libc retained for getaddrinfo which has no safe Rust wrapper). - Use debian:13-slim instead of debian:trixie-slim to make the version match with distroless cc-debian13 obvious. - Update Dockerfile comments to reflect that LD_PRELOAD must be set as a container env var, not in-process. Co-Authored-By: Claude Opus 4.6 (1M context) --- misc/images/distroless-prod-base/Dockerfile | 9 ++++---- .../mzcompose/services/clusterd.py | 1 + src/clusterd/Cargo.toml | 2 +- src/clusterd/src/lib.rs | 23 +++++-------------- src/environmentd/src/environmentd/main.rs | 12 ---------- 5 files changed, 13 insertions(+), 34 deletions(-) diff --git a/misc/images/distroless-prod-base/Dockerfile b/misc/images/distroless-prod-base/Dockerfile index 42e737133729b..b05618c0c14a3 100644 --- a/misc/images/distroless-prod-base/Dockerfile +++ b/misc/images/distroless-prod-base/Dockerfile @@ -11,15 +11,16 @@ # every time we get a CI builder with a cold cache. # Extract libeatmydata from a Debian image for CI use. 
eatmydata disables -# fsync for faster test execution. Services opt in via MZ_EAT_MY_DATA=1. -FROM debian:trixie-slim AS eatmydata +# fsync for faster test execution. Activated via LD_PRELOAD=libeatmydata.so +# set as a container environment variable (must be set before process start). +FROM debian:13-slim AS eatmydata RUN apt-get update && apt-get install -y --no-install-recommends eatmydata \ && rm -rf /var/lib/apt/lists/* FROM gcr.io/distroless/cc-debian13:nonroot-28078d2e5e77671d2046dcc9e2c75334e31efa4d -# Copy libeatmydata for CI performance optimization (no-op if MZ_EAT_MY_DATA -# is unset; harmless in production). +# Copy libeatmydata for CI performance optimization (no-op unless +# LD_PRELOAD=libeatmydata.so is set as a container env var). COPY --from=eatmydata /usr/lib/*/libeatmydata.so /usr/lib/ USER nonroot diff --git a/misc/python/materialize/mzcompose/services/clusterd.py b/misc/python/materialize/mzcompose/services/clusterd.py index 83e5089db10b7..cb035dae09e7e 100644 --- a/misc/python/materialize/mzcompose/services/clusterd.py +++ b/misc/python/materialize/mzcompose/services/clusterd.py @@ -41,6 +41,7 @@ def __init__( "CLUSTERD_USE_CTP=true", "MZ_SOFT_ASSERTIONS=1", "MZ_EAT_MY_DATA=1", + "LD_PRELOAD=libeatmydata.so", # Defaults that were previously set by the clusterd entrypoint.sh. 
"CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100", "CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101", diff --git a/src/clusterd/Cargo.toml b/src/clusterd/Cargo.toml index f072e636a2a59..b5d29d660689a 100644 --- a/src/clusterd/Cargo.toml +++ b/src/clusterd/Cargo.toml @@ -39,7 +39,7 @@ mz-storage-client = { path = "../storage-client" } mz-storage-types = { path = "../storage-types" } mz-timely-util = { path = "../timely-util" } mz-txn-wal = { path = "../txn-wal" } -nix.workspace = true +nix = { workspace = true, features = ["hostname"] } num_cpus.workspace = true serde.workspace = true tokio.workspace = true diff --git a/src/clusterd/src/lib.rs b/src/clusterd/src/lib.rs index 32d5ce6181a49..b26971ecd8c63 100644 --- a/src/clusterd/src/lib.rs +++ b/src/clusterd/src/lib.rs @@ -234,20 +234,18 @@ pub fn main() { // // This avoids shelling out to `hostname --fqdn` which isn't // available in distroless images. - // Use the gethostname() syscall, matching what `hostname --fqdn` - // does internally, then resolve to FQDN via getaddrinfo. - let mut buf = [0u8; 256]; - if unsafe { libc::gethostname(buf.as_mut_ptr().cast(), buf.len()) } == 0 { - let hostname = unsafe { std::ffi::CStr::from_ptr(buf.as_ptr().cast()) }; - if let Ok(short) = hostname.to_str() { + if let Ok(hostname) = nix::unistd::gethostname() { + if let Some(short) = hostname.to_str() { let fqdn = resolve_fqdn(short); unsafe { std::env::set_var("CLUSTERD_GRPC_HOST", &fqdn) }; } } } if std::env::var("CLUSTERD_PROCESS").is_err() { - // Extract the ordinal index from the StatefulSet hostname - // (e.g., "clusterd-0" → "0"). + // Extract the ordinal index from the last segment of the + // StatefulSet hostname (e.g., "mz5ncn-cluster-s1-replica-s1-gen-1-0" + // → "0"). This matches orchestrator-kubernetes which also uses + // split('-').next_back() to extract the process ID from pod names. 
if let Ok(hostname) = std::env::var("HOSTNAME") { if let Some(ordinal) = hostname.rsplit('-').next() { unsafe { std::env::set_var("CLUSTERD_PROCESS", ordinal) }; @@ -256,15 +254,6 @@ pub fn main() { } } - // Configure LD_PRELOAD for eatmydata if requested (CI performance - // optimization). In distroless images libeatmydata.so is not available, - // so this is a no-op. - if std::env::var("MZ_EAT_MY_DATA").is_ok() { - unsafe { std::env::set_var("LD_PRELOAD", "libeatmydata.so") }; - } else { - unsafe { std::env::remove_var("LD_PRELOAD") }; - } - let args = cli::parse_args(CliConfig { env_prefix: Some("CLUSTERD_"), enable_version_flag: true, diff --git a/src/environmentd/src/environmentd/main.rs b/src/environmentd/src/environmentd/main.rs index bccce3a07510d..9528033c44fe5 100644 --- a/src/environmentd/src/environmentd/main.rs +++ b/src/environmentd/src/environmentd/main.rs @@ -643,18 +643,6 @@ fn aws_secrets_controller_key_alias(env_id: &EnvironmentId) -> String { } pub fn main() { - // Configure LD_PRELOAD for eatmydata if requested (CI performance - // optimization). In distroless images libeatmydata.so is not available, - // so this is a no-op. - // - // SAFETY: Called at the very start of main(), before any threads are - // spawned or other initialization runs. - if std::env::var("MZ_EAT_MY_DATA").is_ok() { - unsafe { std::env::set_var("LD_PRELOAD", "libeatmydata.so") }; - } else { - unsafe { std::env::remove_var("LD_PRELOAD") }; - } - let args = cli::parse_args(CliConfig { env_prefix: Some("MZ_"), enable_version_flag: true, From 088152bee38e5bf5b0b9e474b3bad6f50312d777 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Thu, 9 Apr 2026 19:23:13 -0700 Subject: [PATCH 15/19] distroless: add SIGTERM handler, remove dead code, add tests - Add termination signal handlers to clusterd. As PID 1 in distroless containers (no tini), the Linux kernel ignores signals without explicit handlers. 
Without this, SIGTERM from Kubernetes pod termination would be silently ignored and the pod would hang until SIGKILL. - Remove unreachable sleep-forever loop from environmentd main(). The run() function blocks forever via thread::park() and never returns Ok(()). If it somehow does, let the process exit so the issue is visible rather than silently sleeping. - Add blocking DNS note to resolve_fqdn doc comment. - Add unit tests for pod_uid_gid version boundary in balancer. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/clusterd/Cargo.toml | 2 +- src/clusterd/src/lib.rs | 55 ++++++++++++++++++++ src/environmentd/src/environmentd/main.rs | 11 ++-- src/orchestratord/src/controller/balancer.rs | 25 +++++++++ 4 files changed, 84 insertions(+), 9 deletions(-) diff --git a/src/clusterd/Cargo.toml b/src/clusterd/Cargo.toml index b5d29d660689a..56a73de0ec266 100644 --- a/src/clusterd/Cargo.toml +++ b/src/clusterd/Cargo.toml @@ -39,7 +39,7 @@ mz-storage-client = { path = "../storage-client" } mz-storage-types = { path = "../storage-types" } mz-timely-util = { path = "../timely-util" } mz-txn-wal = { path = "../txn-wal" } -nix = { workspace = true, features = ["hostname"] } +nix = { workspace = true, features = ["hostname", "signal"] } num_cpus.workspace = true serde.workspace = true tokio.workspace = true diff --git a/src/clusterd/src/lib.rs b/src/clusterd/src/lib.rs index b26971ecd8c63..05e6af614f15c 100644 --- a/src/clusterd/src/lib.rs +++ b/src/clusterd/src/lib.rs @@ -50,6 +50,11 @@ const BUILD_INFO: BuildInfo = build_info!(); /// Resolves a short hostname to its FQDN using `getaddrinfo` with /// `AI_CANONNAME`, equivalent to `hostname --fqdn`. Falls back to the /// short hostname if DNS resolution fails. +/// +/// Note: `getaddrinfo` is a blocking call with no timeout. If DNS is +/// unavailable at pod startup (e.g., CoreDNS restart), this will block +/// the main thread indefinitely. 
In practice this is rare since CoreDNS +/// runs as a DaemonSet and is available before user pods start. fn resolve_fqdn(short_hostname: &str) -> String { use std::ffi::{CStr, CString}; use std::ptr; @@ -212,7 +217,57 @@ struct Args { worker_core_affinity: bool, } +/// Install signal handlers so that termination signals are not ignored. +/// +/// On Linux, PID 1 has special signal semantics: the kernel will not +/// deliver signals whose disposition is SIG_DFL (the default). Since +/// distroless containers run the binary directly as PID 1 (no tini), +/// signals like SIGTERM from Kubernetes pod termination would be silently +/// ignored without explicit handlers. This function registers a handler +/// that restores the default disposition and re-raises, producing the +/// expected termination behavior. +fn install_termination_signal_handlers() { + use nix::sys::signal; + + extern "C" fn handle_signal(signum: i32) { + // Restore default handler and re-raise so the process terminates + // with the correct signal and exit code. + let action = signal::SigAction::new( + signal::SigHandler::SigDfl, + signal::SaFlags::SA_NODEFER | signal::SaFlags::SA_ONSTACK, + signal::SigSet::empty(), + ); + unsafe { signal::sigaction(signum.try_into().unwrap(), &action) } + .unwrap_or_else(|_| panic!("failed to uninstall handler for {}", signum)); + let ret = unsafe { libc::raise(signum) }; + if ret == -1 { + panic!("failed to re-raise signal {}", signum); + } + } + + let action = signal::SigAction::new( + signal::SigHandler::Handler(handle_signal), + signal::SaFlags::SA_NODEFER | signal::SaFlags::SA_ONSTACK, + signal::SigSet::empty(), + ); + for signum in &[ + signal::SIGHUP, + signal::SIGINT, + signal::SIGALRM, + signal::SIGTERM, + ] { + unsafe { signal::sigaction(*signum, &action) } + .unwrap_or_else(|e| panic!("failed to install handler for {}: {}", signum, e)); + } +} + pub fn main() { + // Install signal handlers before anything else. 
As PID 1 in a + // distroless container, the kernel ignores signals without explicit + // handlers — without this, SIGTERM from Kubernetes would be ignored + // and the pod would hang until SIGKILL. + install_termination_signal_handlers(); + mz_ore::panic::install_enhanced_handler(); // When running in Kubernetes, auto-detect the GRPC host from the pod's FQDN diff --git a/src/environmentd/src/environmentd/main.rs b/src/environmentd/src/environmentd/main.rs index 9528033c44fe5..aa3acf7435ae2 100644 --- a/src/environmentd/src/environmentd/main.rs +++ b/src/environmentd/src/environmentd/main.rs @@ -650,14 +650,9 @@ pub fn main() { if let Err(err) = run(args) { panic!("environmentd: fatal: {}", err.display_with_causes()); } - // In the previous bash entrypoint, environmentd would sleep forever after - // a graceful exit. This keeps the container alive for debugging. Replicate - // that behavior here. In practice this is unreachable: run() blocks - // forever via thread::park(). Kept as a defensive safety net. - eprintln!("environmentd exited gracefully; sleeping forever"); - loop { - std::thread::sleep(std::time::Duration::from_secs(86400)); - } + // run() blocks forever via thread::park() and never returns Ok(()). + // If it somehow does, let the process exit so the issue is visible + // (Kubernetes will restart it and the logs will show the exit). 
} fn run(mut args: Args) -> Result<(), anyhow::Error> { diff --git a/src/orchestratord/src/controller/balancer.rs b/src/orchestratord/src/controller/balancer.rs index 727523251d5d3..62c86ec462338 100644 --- a/src/orchestratord/src/controller/balancer.rs +++ b/src/orchestratord/src/controller/balancer.rs @@ -626,3 +626,28 @@ impl k8s_controller::Context for Context { Ok(None) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[mz_ore::test] + fn test_pod_uid_gid() { + // Boundary: v26.19.0 is the first distroless version + assert_eq!(Context::pod_uid_gid("materialize/balancerd:v26.18.0"), 999); + assert_eq!( + Context::pod_uid_gid("materialize/balancerd:v26.19.0"), + 65534 + ); + // Pre-release below threshold gets Ubuntu + assert_eq!( + Context::pod_uid_gid("materialize/balancerd:v26.19.0-dev"), + 999 + ); + // Unparseable refs assume distroless + assert_eq!( + Context::pod_uid_gid("materialize/balancerd@sha256:abc"), + 65534 + ); + } +} From 6d042f2ede44f85fcccbe2d7e333cd98a62158e9 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Thu, 9 Apr 2026 21:19:33 -0700 Subject: [PATCH 16/19] clusterd: remove direct libc dependency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace getaddrinfo-based FQDN resolution with /etc/hosts parsing. In Kubernetes, the kubelet writes the pod's FQDN into /etc/hosts for StatefulSet pods, so we can read it directly — no FFI needed and no risk of blocking indefinitely on DNS. Replace libc::raise with nix::sys::signal::raise in the termination signal handler. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- Cargo.lock | 1 - src/clusterd/Cargo.toml | 1 - src/clusterd/src/lib.rs | 83 +++++++++++++++++------------------------ 3 files changed, 35 insertions(+), 50 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ba3626d368980..845f9eaa7864f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6050,7 +6050,6 @@ dependencies = [ "futures", "hyper 1.9.0", "hyper-util", - "libc", "mz-alloc", "mz-alloc-default", "mz-build-info", diff --git a/src/clusterd/Cargo.toml b/src/clusterd/Cargo.toml index 56a73de0ec266..06a086bda981e 100644 --- a/src/clusterd/Cargo.toml +++ b/src/clusterd/Cargo.toml @@ -17,7 +17,6 @@ fail.workspace = true futures.workspace = true hyper.workspace = true hyper-util.workspace = true -libc.workspace = true mz-alloc = { path = "../alloc" } mz-alloc-default = { path = "../alloc-default", optional = true } mz-build-info = { path = "../build-info" } diff --git a/src/clusterd/src/lib.rs b/src/clusterd/src/lib.rs index 05e6af614f15c..680d5d490af31 100644 --- a/src/clusterd/src/lib.rs +++ b/src/clusterd/src/lib.rs @@ -47,53 +47,42 @@ mod usage_metrics; const BUILD_INFO: BuildInfo = build_info!(); -/// Resolves a short hostname to its FQDN using `getaddrinfo` with -/// `AI_CANONNAME`, equivalent to `hostname --fqdn`. Falls back to the -/// short hostname if DNS resolution fails. +/// Resolves a short hostname to its FQDN by parsing `/etc/hosts`. /// -/// Note: `getaddrinfo` is a blocking call with no timeout. If DNS is -/// unavailable at pod startup (e.g., CoreDNS restart), this will block -/// the main thread indefinitely. In practice this is rare since CoreDNS -/// runs as a DaemonSet and is available before user pods start. 
+/// In Kubernetes, the kubelet writes the pod's FQDN into `/etc/hosts` +/// for StatefulSet pods with hostname/subdomain configured, e.g.: +/// +/// ```text +/// 10.0.1.5 clusterd-0.svc.namespace.svc.cluster.local clusterd-0 +/// ``` +/// +/// This approach avoids FFI (`getaddrinfo`) and works even if CoreDNS +/// is temporarily unavailable at pod startup. Falls back to the short +/// hostname if no FQDN is found. fn resolve_fqdn(short_hostname: &str) -> String { - use std::ffi::{CStr, CString}; - use std::ptr; - - let Ok(c_host) = CString::new(short_hostname) else { - return short_hostname.to_string(); - }; - - let mut hints: libc::addrinfo = unsafe { std::mem::zeroed() }; - hints.ai_flags = libc::AI_CANONNAME; - hints.ai_family = libc::AF_UNSPEC; - - let mut result: *mut libc::addrinfo = ptr::null_mut(); - - let rc = unsafe { libc::getaddrinfo(c_host.as_ptr(), ptr::null(), &hints, &mut result) }; - - if rc != 0 || result.is_null() { - eprintln!( - "warning: getaddrinfo failed for {:?} (rc={}); falling back to short hostname. \ - GRPC host validation may not work correctly.", - short_hostname, rc - ); - return short_hostname.to_string(); - } - - let fqdn = unsafe { - let info = &*result; - if info.ai_canonname.is_null() { - short_hostname.to_string() - } else { - CStr::from_ptr(info.ai_canonname) - .to_string_lossy() - .into_owned() + if let Ok(hosts) = std::fs::read_to_string("/etc/hosts") { + for line in hosts.lines() { + let line = line.trim(); + if line.is_empty() || line.starts_with('#') { + continue; + } + // /etc/hosts format: IP_ADDRESS CANONICAL_NAME [ALIASES...] 
+ let mut fields = line.split_whitespace(); + let _ip = fields.next(); + let names: Vec<&str> = fields.collect(); + if names.iter().any(|&n| n == short_hostname) { + if let Some(fqdn) = names.iter().find(|&&n| n.contains('.')) { + return fqdn.to_string(); + } + } } - }; - - unsafe { libc::freeaddrinfo(result) }; - - fqdn + } + eprintln!( + "warning: could not resolve FQDN for {:?} from /etc/hosts; \ + falling back to short hostname. GRPC host validation may not work correctly.", + short_hostname, + ); + short_hostname.to_string() } pub static VERSION: LazyLock = LazyLock::new(|| BUILD_INFO.human_version(None)); @@ -239,10 +228,8 @@ fn install_termination_signal_handlers() { ); unsafe { signal::sigaction(signum.try_into().unwrap(), &action) } .unwrap_or_else(|_| panic!("failed to uninstall handler for {}", signum)); - let ret = unsafe { libc::raise(signum) }; - if ret == -1 { - panic!("failed to re-raise signal {}", signum); - } + signal::raise(signum.try_into().unwrap()) + .unwrap_or_else(|_| panic!("failed to re-raise signal {}", signum)); } let action = signal::SigAction::new( From 240f44ab496d8f2470c55195935a48576b4ce04f Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Thu, 9 Apr 2026 21:20:36 -0700 Subject: [PATCH 17/19] Revert "clusterd: remove direct libc dependency" This reverts commit 534c26cb25477ca69b5cead7598d5858452b9550. 
--- Cargo.lock | 1 + src/clusterd/Cargo.toml | 1 + src/clusterd/src/lib.rs | 83 ++++++++++++++++++++++++----------------- 3 files changed, 50 insertions(+), 35 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 845f9eaa7864f..ba3626d368980 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6050,6 +6050,7 @@ dependencies = [ "futures", "hyper 1.9.0", "hyper-util", + "libc", "mz-alloc", "mz-alloc-default", "mz-build-info", diff --git a/src/clusterd/Cargo.toml b/src/clusterd/Cargo.toml index 06a086bda981e..56a73de0ec266 100644 --- a/src/clusterd/Cargo.toml +++ b/src/clusterd/Cargo.toml @@ -17,6 +17,7 @@ fail.workspace = true futures.workspace = true hyper.workspace = true hyper-util.workspace = true +libc.workspace = true mz-alloc = { path = "../alloc" } mz-alloc-default = { path = "../alloc-default", optional = true } mz-build-info = { path = "../build-info" } diff --git a/src/clusterd/src/lib.rs b/src/clusterd/src/lib.rs index 680d5d490af31..05e6af614f15c 100644 --- a/src/clusterd/src/lib.rs +++ b/src/clusterd/src/lib.rs @@ -47,42 +47,53 @@ mod usage_metrics; const BUILD_INFO: BuildInfo = build_info!(); -/// Resolves a short hostname to its FQDN by parsing `/etc/hosts`. +/// Resolves a short hostname to its FQDN using `getaddrinfo` with +/// `AI_CANONNAME`, equivalent to `hostname --fqdn`. Falls back to the +/// short hostname if DNS resolution fails. /// -/// In Kubernetes, the kubelet writes the pod's FQDN into `/etc/hosts` -/// for StatefulSet pods with hostname/subdomain configured, e.g.: -/// -/// ```text -/// 10.0.1.5 clusterd-0.svc.namespace.svc.cluster.local clusterd-0 -/// ``` -/// -/// This approach avoids FFI (`getaddrinfo`) and works even if CoreDNS -/// is temporarily unavailable at pod startup. Falls back to the short -/// hostname if no FQDN is found. +/// Note: `getaddrinfo` is a blocking call with no timeout. If DNS is +/// unavailable at pod startup (e.g., CoreDNS restart), this will block +/// the main thread indefinitely. 
In practice this is rare since CoreDNS +/// runs as a DaemonSet and is available before user pods start. fn resolve_fqdn(short_hostname: &str) -> String { - if let Ok(hosts) = std::fs::read_to_string("/etc/hosts") { - for line in hosts.lines() { - let line = line.trim(); - if line.is_empty() || line.starts_with('#') { - continue; - } - // /etc/hosts format: IP_ADDRESS CANONICAL_NAME [ALIASES...] - let mut fields = line.split_whitespace(); - let _ip = fields.next(); - let names: Vec<&str> = fields.collect(); - if names.iter().any(|&n| n == short_hostname) { - if let Some(fqdn) = names.iter().find(|&&n| n.contains('.')) { - return fqdn.to_string(); - } - } - } + use std::ffi::{CStr, CString}; + use std::ptr; + + let Ok(c_host) = CString::new(short_hostname) else { + return short_hostname.to_string(); + }; + + let mut hints: libc::addrinfo = unsafe { std::mem::zeroed() }; + hints.ai_flags = libc::AI_CANONNAME; + hints.ai_family = libc::AF_UNSPEC; + + let mut result: *mut libc::addrinfo = ptr::null_mut(); + + let rc = unsafe { libc::getaddrinfo(c_host.as_ptr(), ptr::null(), &hints, &mut result) }; + + if rc != 0 || result.is_null() { + eprintln!( + "warning: getaddrinfo failed for {:?} (rc={}); falling back to short hostname. \ + GRPC host validation may not work correctly.", + short_hostname, rc + ); + return short_hostname.to_string(); } - eprintln!( - "warning: could not resolve FQDN for {:?} from /etc/hosts; \ - falling back to short hostname. 
GRPC host validation may not work correctly.", - short_hostname, - ); - short_hostname.to_string() + + let fqdn = unsafe { + let info = &*result; + if info.ai_canonname.is_null() { + short_hostname.to_string() + } else { + CStr::from_ptr(info.ai_canonname) + .to_string_lossy() + .into_owned() + } + }; + + unsafe { libc::freeaddrinfo(result) }; + + fqdn } pub static VERSION: LazyLock<String> = LazyLock::new(|| BUILD_INFO.human_version(None)); @@ -228,8 +239,10 @@ fn install_termination_signal_handlers() { ); unsafe { signal::sigaction(signum.try_into().unwrap(), &action) } .unwrap_or_else(|_| panic!("failed to uninstall handler for {}", signum)); - signal::raise(signum.try_into().unwrap()) - .unwrap_or_else(|_| panic!("failed to re-raise signal {}", signum)); + let ret = unsafe { libc::raise(signum) }; + if ret == -1 { + panic!("failed to re-raise signal {}", signum); + } } let action = signal::SigAction::new( From 1980a5d53b1a7bafe28f49b660ee4cb5f6054259 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Thu, 9 Apr 2026 22:28:05 -0700 Subject: [PATCH 18/19] containers: disable PKCS#11 in openssh-static build AWS-LC does not provide RSA_meth_dup, RSA_meth_set1_name, or EC_KEY_METHOD_get_sign which ssh-pkcs11.c requires. Since we only need the ssh client for tunnels (no smartcard/PKCS#11 auth), disable it at configure time. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- misc/images/openssh-static/Dockerfile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/misc/images/openssh-static/Dockerfile b/misc/images/openssh-static/Dockerfile index 3410ecbc78e67..d4642a1e4cd0d 100644 --- a/misc/images/openssh-static/Dockerfile +++ b/misc/images/openssh-static/Dockerfile @@ -76,15 +76,18 @@ RUN git clone --depth 1 --branch ${OPENSSH_VERSION} https://github.com/openssh/o --with-ldflags=-static \ --without-pam \ --without-libedit \ + --disable-pkcs11 \ # AWS-LC does not define the legacy OpenSSL BN_FLG_CONSTTIME flag. # Setting it to 0 satisfies #ifdef checks in OpenSSH source code. # This is safe: AWS-LC handles constant-time bignum operations # internally and does not rely on this flag. + # --disable-pkcs11 avoids link errors from ssh-pkcs11.c calling + # RSA_meth_dup/EC_KEY_METHOD_get_sign which AWS-LC does not provide. && make -j"$(nproc)" ssh CFLAGS="-DBN_FLG_CONSTTIME=0" \ && strip ssh -# Verify the binary is statically linked and functional. -RUN file ssh | grep -q 'statically linked' \ +# Verify the binary is not dynamically linked and is functional. +RUN ! ldd ssh 2>/dev/null \ && ./ssh -V # Output stage: just the binary. From dfdbdf715fe531a94c746411e7e030f883fa378f Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Thu, 9 Apr 2026 21:19:33 -0700 Subject: [PATCH 19/19] clusterd: remove direct libc dependency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace getaddrinfo-based FQDN resolution with /etc/hosts parsing. In Kubernetes, the kubelet writes the pod's FQDN into /etc/hosts for StatefulSet pods, so we can read it directly — no FFI needed and no risk of blocking indefinitely on DNS. Replace libc::raise with nix::sys::signal::raise in the termination signal handler. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- Cargo.lock | 1 - src/clusterd/Cargo.toml | 1 - src/clusterd/src/lib.rs | 83 +++++++++++++++++------------------------ 3 files changed, 35 insertions(+), 50 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ba3626d368980..845f9eaa7864f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6050,7 +6050,6 @@ dependencies = [ "futures", "hyper 1.9.0", "hyper-util", - "libc", "mz-alloc", "mz-alloc-default", "mz-build-info", diff --git a/src/clusterd/Cargo.toml b/src/clusterd/Cargo.toml index 56a73de0ec266..06a086bda981e 100644 --- a/src/clusterd/Cargo.toml +++ b/src/clusterd/Cargo.toml @@ -17,7 +17,6 @@ fail.workspace = true futures.workspace = true hyper.workspace = true hyper-util.workspace = true -libc.workspace = true mz-alloc = { path = "../alloc" } mz-alloc-default = { path = "../alloc-default", optional = true } mz-build-info = { path = "../build-info" } diff --git a/src/clusterd/src/lib.rs b/src/clusterd/src/lib.rs index 05e6af614f15c..96c2f8ed9e19d 100644 --- a/src/clusterd/src/lib.rs +++ b/src/clusterd/src/lib.rs @@ -47,53 +47,42 @@ mod usage_metrics; const BUILD_INFO: BuildInfo = build_info!(); -/// Resolves a short hostname to its FQDN using `getaddrinfo` with -/// `AI_CANONNAME`, equivalent to `hostname --fqdn`. Falls back to the -/// short hostname if DNS resolution fails. +/// Resolves a short hostname to its FQDN by parsing `/etc/hosts`. /// -/// Note: `getaddrinfo` is a blocking call with no timeout. If DNS is -/// unavailable at pod startup (e.g., CoreDNS restart), this will block -/// the main thread indefinitely. In practice this is rare since CoreDNS -/// runs as a DaemonSet and is available before user pods start. 
+/// In Kubernetes, the kubelet writes the pod's FQDN into `/etc/hosts` +/// for StatefulSet pods with hostname/subdomain configured, e.g.: +/// +/// ```text +/// 10.0.1.5 clusterd-0.svc.namespace.svc.cluster.local clusterd-0 +/// ``` +/// +/// This approach avoids FFI (`getaddrinfo`) and works even if CoreDNS +/// is temporarily unavailable at pod startup. Falls back to the short +/// hostname if no FQDN is found. fn resolve_fqdn(short_hostname: &str) -> String { - use std::ffi::{CStr, CString}; - use std::ptr; - - let Ok(c_host) = CString::new(short_hostname) else { - return short_hostname.to_string(); - }; - - let mut hints: libc::addrinfo = unsafe { std::mem::zeroed() }; - hints.ai_flags = libc::AI_CANONNAME; - hints.ai_family = libc::AF_UNSPEC; - - let mut result: *mut libc::addrinfo = ptr::null_mut(); - - let rc = unsafe { libc::getaddrinfo(c_host.as_ptr(), ptr::null(), &hints, &mut result) }; - - if rc != 0 || result.is_null() { - eprintln!( - "warning: getaddrinfo failed for {:?} (rc={}); falling back to short hostname. \ - GRPC host validation may not work correctly.", - short_hostname, rc - ); - return short_hostname.to_string(); - } - - let fqdn = unsafe { - let info = &*result; - if info.ai_canonname.is_null() { - short_hostname.to_string() - } else { - CStr::from_ptr(info.ai_canonname) - .to_string_lossy() - .into_owned() + if let Ok(hosts) = std::fs::read_to_string("/etc/hosts") { + for line in hosts.lines() { + let line = line.trim(); + if line.is_empty() || line.starts_with('#') { + continue; + } + // /etc/hosts format: IP_ADDRESS CANONICAL_NAME [ALIASES...] 
+ let mut fields = line.split_whitespace(); + let _ip = fields.next(); + let names: Vec<&str> = fields.collect(); + if names.contains(&short_hostname) { + if let Some(fqdn) = names.iter().find(|&&n| n.contains('.')) { + return fqdn.to_string(); + } + } } - }; - - unsafe { libc::freeaddrinfo(result) }; - - fqdn + } + eprintln!( + "warning: could not resolve FQDN for {:?} from /etc/hosts; \ + falling back to short hostname. GRPC host validation may not work correctly.", + short_hostname, + ); + short_hostname.to_string() } pub static VERSION: LazyLock<String> = LazyLock::new(|| BUILD_INFO.human_version(None)); @@ -239,10 +228,8 @@ fn install_termination_signal_handlers() { ); unsafe { signal::sigaction(signum.try_into().unwrap(), &action) } .unwrap_or_else(|_| panic!("failed to uninstall handler for {}", signum)); - let ret = unsafe { libc::raise(signum) }; - if ret == -1 { - panic!("failed to re-raise signal {}", signum); - } + signal::raise(signum.try_into().unwrap()) + .unwrap_or_else(|_| panic!("failed to re-raise signal {}", signum)); } let action = signal::SigAction::new(