From 2dfe22fa078d197014b37a39db95f43e8c9814c9 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Tue, 14 Apr 2026 22:13:44 -0700 Subject: [PATCH] containers: switch environmentd and clusterd to distroless base image Replace Debian-based prod images with distroless. Remove shell entrypoint scripts (no shell in distroless). Add libeatmydata to distroless-prod-base for CI compatibility. Update clusterd mzcompose service to drop shell-dependent options. Distroless ships no init, so add a tini-static mzbuild image (static tini built from source, mirroring openssh-static) and copy it into distroless-prod-base. tini stays PID 1 to forward signals and reap zombies, notably the ssh subprocesses spawned by the SSH tunnel feature. Leaf images wrap their binary in ENTRYPOINT ["/usr/bin/tini", "--", ...]. The deleted environmentd entrypoint slept forever on graceful exit so a fenced-out generation would not be restarted. Distroless has no shell to do this, and a Kubernetes StatefulSet (restartPolicy: Always) would crash-loop a process that exits. Replace it with an in-process idle: when fenced out and supervised directly by Kubernetes, environmentd idles until the orchestrator deletes the StatefulSet (SIGTERM, then SIGKILL) instead of calling exit!(0). The all-in-one image keeps its entrypoint and exit-0 behavior. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- misc/images/distroless-prod-base/Dockerfile | 19 +++++++ misc/images/tini-static/Dockerfile | 49 +++++++++++++++++++ .../images/tini-static/mzbuild.yml | 18 +------ .../mzcompose/services/clusterd.py | 5 +- src/clusterd/ci/Dockerfile | 10 ++-- src/clusterd/ci/entrypoint.sh | 41 ---------------- src/environmentd/ci/Dockerfile | 10 ++-- src/environmentd/src/deployment/preflight.rs | 12 +++++ src/environmentd/src/environmentd/main.rs | 1 + src/environmentd/src/lib.rs | 6 +++ src/environmentd/src/test_util.rs | 1 + src/sqllogictest/src/runner.rs | 1 + 12 files changed, 107 insertions(+), 66 deletions(-) create mode 100644 misc/images/tini-static/Dockerfile rename src/environmentd/ci/entrypoint.sh => misc/images/tini-static/mzbuild.yml (57%) mode change 100755 => 100644 delete mode 100755 src/clusterd/ci/entrypoint.sh diff --git a/misc/images/distroless-prod-base/Dockerfile b/misc/images/distroless-prod-base/Dockerfile index a685f9b0e8136..967e571de5514 100644 --- a/misc/images/distroless-prod-base/Dockerfile +++ b/misc/images/distroless-prod-base/Dockerfile @@ -10,7 +10,26 @@ # This is a separate mzimage so that we don't have to re-install the apt things # every time we get a CI builder with a cold cache. +# Extract libeatmydata from a Debian image for CI use. eatmydata disables +# fsync for faster test execution. Activated via LD_PRELOAD=libeatmydata.so +# set as a container environment variable (must be set before process start). +FROM debian:13-slim AS eatmydata +RUN apt-get update && apt-get install -y --no-install-recommends eatmydata \ + && rm -rf /var/lib/apt/lists/* + +# Statically-linked `tini`, the PID 1 init for all distroless prod images. +# Distroless ships no init, so without this each leaf binary would run as PID 1 +# and have to forward signals and reap zombies (e.g. ssh tunnel subprocesses) +# itself. Leaf images opt in via ENTRYPOINT ["/usr/bin/tini", "--", ...]. 
+MZFROM tini-static AS tini + FROM gcr.io/distroless/cc-debian13:nonroot-28078d2e5e77671d2046dcc9e2c75334e31efa4d + +# Copy libeatmydata for CI performance optimization (no-op unless +# LD_PRELOAD=libeatmydata.so is set as a container env var). +COPY --from=eatmydata /usr/lib/*/libeatmydata.so /usr/lib/ +COPY --from=tini /output/tini /usr/bin/tini + USER nonroot ENV HOME=/home/nonroot ENTRYPOINT [] diff --git a/misc/images/tini-static/Dockerfile b/misc/images/tini-static/Dockerfile new file mode 100644 index 0000000000000..f6e0f9e9f7c87 --- /dev/null +++ b/misc/images/tini-static/Dockerfile @@ -0,0 +1,49 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# Build a statically-linked `tini` init binary. tini is the PID 1 we run in +# every production image: it forwards signals to the child and reaps orphaned +# zombies (e.g. ssh tunnel subprocesses). The static binary (~600KB, glibc) +# lets distroless images, which ship no init and no apt, keep using tini +# without a package manager. This replaces the apt-installed tini from the +# Debian-based prod-base. +# +# tini's CMake build emits both a dynamic `tini` and a static `tini-static` +# target. We keep only the static one. +# +# Usage: +# docker build -t tini-static . 
+# docker create --name extract tini-static +# docker cp extract:/output/tini ./tini +# docker rm extract + +FROM ubuntu:noble-20260210.1 AS builder + +ARG TINI_VERSION=v0.19.0 + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + cmake \ + git \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /build/tini +RUN git clone --depth 1 --branch ${TINI_VERSION} https://github.com/krallin/tini.git . \ + && cmake . \ + && make tini-static \ + && strip tini-static + +# Verify the binary is not dynamically linked and is functional. +RUN ! ldd tini-static 2>/dev/null \ + && ./tini-static --version + +# Output stage: just the binary. +FROM scratch +COPY --from=builder /build/tini/tini-static /output/tini diff --git a/src/environmentd/ci/entrypoint.sh b/misc/images/tini-static/mzbuild.yml old mode 100755 new mode 100644 similarity index 57% rename from src/environmentd/ci/entrypoint.sh rename to misc/images/tini-static/mzbuild.yml index ff2f2337c2404..11c37f89e4390 --- a/src/environmentd/ci/entrypoint.sh +++ b/misc/images/tini-static/mzbuild.yml @@ -1,5 +1,3 @@ -#!/usr/bin/env bash - # Copyright Materialize, Inc. and contributors. All rights reserved. # # Use of this software is governed by the Business Source License @@ -9,17 +7,5 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0. -set -euo pipefail - -if [ -z "${MZ_EAT_MY_DATA:-}" ]; then - unset LD_PRELOAD -else - export LD_PRELOAD=libeatmydata.so -fi - -if environmentd "$@"; then - echo "environmentd exited gracefully; sleeping forever" >&2 - sleep infinity -else - exit $? -fi +name: tini-static +description: Statically-linked tini init, the PID 1 used in distroless prod images. 
diff --git a/misc/python/materialize/mzcompose/services/clusterd.py b/misc/python/materialize/mzcompose/services/clusterd.py index 850043a6222f9..f96a0da900b37 100644 --- a/misc/python/materialize/mzcompose/services/clusterd.py +++ b/misc/python/materialize/mzcompose/services/clusterd.py @@ -47,13 +47,13 @@ def __init__( "CLUSTERD_USE_CTP=true", "MZ_SOFT_ASSERTIONS=1", "MZ_EAT_MY_DATA=1", + "LD_PRELOAD=libeatmydata.so", # Defaults that were previously set by the clusterd entrypoint.sh. "CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100", "CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101", "CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878", "CLUSTERD_SECRETS_READER=local-file", "CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets", - "LD_PRELOAD=libeatmydata.so", f"CLUSTERD_PERSIST_PUBSUB_URL=http://{mz_service}:6879", *environment_extra, ] @@ -90,6 +90,9 @@ def __init__( # Override the materialized entrypoint so that `clusterd` is invoked # via the command rather than via the entrypoint. This keeps # `c.exec()` working (it prepends the entrypoint to exec commands). + # Note: mzcompose uses the Ubuntu-based `materialized` image (with + # tini/bash), while production uses the distroless `clusterd` image. + # Keep this in mind when debugging CI-vs-prod discrepancies. config["entrypoint"] = ["tini", "--"] # Depending on the Docker Compose version, this may either work or be diff --git a/src/clusterd/ci/Dockerfile b/src/clusterd/ci/Dockerfile index 6df26f293a729..aa9764390f71a 100644 --- a/src/clusterd/ci/Dockerfile +++ b/src/clusterd/ci/Dockerfile @@ -7,10 +7,12 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0. 
-MZFROM prod-base +MZFROM openssh-static AS openssh -COPY clusterd entrypoint.sh /usr/local/bin/ +MZFROM distroless-prod-base -USER materialize +COPY clusterd /usr/local/bin/ +COPY --from=openssh /output/ssh /usr/bin/ssh -ENTRYPOINT ["tini", "--", "entrypoint.sh"] +# tini (from distroless-prod-base) is PID 1 so ssh tunnel subprocesses get reaped. +ENTRYPOINT ["/usr/bin/tini", "--", "/usr/local/bin/clusterd"] diff --git a/src/clusterd/ci/entrypoint.sh b/src/clusterd/ci/entrypoint.sh deleted file mode 100755 index 6c3b749ac2fa5..0000000000000 --- a/src/clusterd/ci/entrypoint.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash - -# Copyright Materialize, Inc. and contributors. All rights reserved. -# -# Use of this software is governed by the Business Source License -# included in the LICENSE file at the root of this repository. -# -# As of the Change Date specified in that file, in accordance with -# the Business Source License, use of this software will be governed -# by the Apache License, Version 2.0. - -set -euo pipefail - -# We pass default arguments as environment variables, and only if those -# environment variables do not already exist, to allow users to override these -# arguments when running the container via either environment variables or -# command-line arguments. -export CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=${CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR:-0.0.0.0:2100} -export CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=${CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR:-0.0.0.0:2101} -export CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=${CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR:-0.0.0.0:6878} -export CLUSTERD_SECRETS_READER=${CLUSTERD_SECRETS_READER:-local-file} -export CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=${CLUSTERD_SECRETS_READER_LOCAL_DIR:-/mzdata/secrets} - -if [[ "${KUBERNETES_SERVICE_HOST:-}" ]]; then - # Pass the host's FQDN as the host to be used for GRPC request validation - # only when running in Kubernetes. 
In other contexts (like when running - # locally, or in Docker), this is likely not desirable. - export CLUSTERD_GRPC_HOST=${CLUSTERD_GRPC_HOST:-$(hostname --fqdn)} - - # When running in Kubernetes, pass the StatefulSet replica's ordinal index - # as the process index. - export CLUSTERD_PROCESS=${CLUSTERD_PROCESS:-${HOSTNAME##*-}} -fi - -if [ -z "${MZ_EAT_MY_DATA:-}" ]; then - unset LD_PRELOAD -else - export LD_PRELOAD=libeatmydata.so -fi - -exec clusterd "$@" diff --git a/src/environmentd/ci/Dockerfile b/src/environmentd/ci/Dockerfile index 4ddef7ead92aa..d29a03f113bb2 100644 --- a/src/environmentd/ci/Dockerfile +++ b/src/environmentd/ci/Dockerfile @@ -7,10 +7,12 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0. -MZFROM prod-base +MZFROM openssh-static AS openssh -COPY environmentd entrypoint.sh /usr/local/bin/ +MZFROM distroless-prod-base -USER materialize +COPY environmentd /usr/local/bin/ +COPY --from=openssh /output/ssh /usr/bin/ssh -ENTRYPOINT ["tini", "--", "entrypoint.sh"] +# tini (from distroless-prod-base) is PID 1 so ssh tunnel subprocesses get reaped. +ENTRYPOINT ["/usr/bin/tini", "--", "/usr/local/bin/environmentd"] diff --git a/src/environmentd/src/deployment/preflight.rs b/src/environmentd/src/deployment/preflight.rs index 16a5885a52c87..0dbee4fd25d8f 100644 --- a/src/environmentd/src/deployment/preflight.rs +++ b/src/environmentd/src/deployment/preflight.rs @@ -41,6 +41,9 @@ pub struct PreflightInput { pub ddl_check_interval: Duration, pub panic_after_timeout: bool, pub bootstrap_args: BootstrapArgs, + /// Whether to idle in place, rather than exit, when fenced out by a newer + /// generation. See [`crate::Config::idle_when_fenced_out`]. + pub idle_when_fenced_out: bool, } /// Output of preflight checks. 
@@ -67,6 +70,7 @@ pub async fn preflight_0dt( ddl_check_interval, panic_after_timeout, bootstrap_args, + idle_when_fenced_out, }: PreflightInput, ) -> Result { info!(%deploy_generation, ?caught_up_max_wait, "performing 0dt preflight checks"); @@ -221,6 +225,14 @@ pub async fn preflight_0dt( read_only: false, caught_up_trigger: None, }) + } else if idle_when_fenced_out { + // Exiting under a Kubernetes StatefulSet (`restartPolicy: Always`) would + // restart the container straight back into this fenced-out state, a + // crash loop until the orchestrator deletes the StatefulSet. Idle until + // pod deletion's SIGTERM terminates us instead. + info!("this deployment has been fenced out; idling until terminated"); + std::future::pending::<()>().await; + unreachable!("pending() never resolves"); } else { exit!(0, "this deployment has been fenced out"); } diff --git a/src/environmentd/src/environmentd/main.rs b/src/environmentd/src/environmentd/main.rs index 489b63464507e..62f3442d47030 100644 --- a/src/environmentd/src/environmentd/main.rs +++ b/src/environmentd/src/environmentd/main.rs @@ -1103,6 +1103,7 @@ fn run(mut args: Args) -> Result<(), anyhow::Error> { secrets_controller, cloud_resource_controller, system_dyncfgs, + idle_when_fenced_out: matches!(args.orchestrator, OrchestratorKind::Kubernetes), // Storage options. storage_usage_collection_interval: args.storage_usage_collection_interval_sec, storage_usage_retention_period: args.storage_usage_retention_period, diff --git a/src/environmentd/src/lib.rs b/src/environmentd/src/lib.rs index 1ff9c51b2a20b..1c2b808becd0c 100644 --- a/src/environmentd/src/lib.rs +++ b/src/environmentd/src/lib.rs @@ -140,6 +140,11 @@ pub struct Config { pub cloud_resource_controller: Option>, /// The process-wide live system dyncfg set. pub system_dyncfgs: Arc, + /// Whether to idle in place, rather than exit, when this generation is + /// fenced out by a newer one. 
Set when a restart-on-exit process manager (a + /// Kubernetes StatefulSet) supervises the process directly, unset when a + /// supervising entrypoint sleeps on graceful exit instead. + pub idle_when_fenced_out: bool, // === Storage options. === /// The interval at which to collect storage usage information. @@ -654,6 +659,7 @@ impl Listeners { panic_after_timeout: enable_0dt_deployment_panic_after_timeout, bootstrap_args, ddl_check_interval: with_0dt_deployment_ddl_check_interval, + idle_when_fenced_out: config.idle_when_fenced_out, }; let PreflightOutput { openable_adapter_storage, diff --git a/src/environmentd/src/test_util.rs b/src/environmentd/src/test_util.rs index f060f22428d34..8c4a329afa886 100644 --- a/src/environmentd/src/test_util.rs +++ b/src/environmentd/src/test_util.rs @@ -886,6 +886,7 @@ impl Listeners { secrets_controller, cloud_resource_controller: None, system_dyncfgs, + idle_when_fenced_out: false, tls: config.tls, frontegg: config.frontegg, frontegg_oauth_issuer_url: None, diff --git a/src/sqllogictest/src/runner.rs b/src/sqllogictest/src/runner.rs index 0b919899ff6ca..4b72dc99200bf 100644 --- a/src/sqllogictest/src/runner.rs +++ b/src/sqllogictest/src/runner.rs @@ -1229,6 +1229,7 @@ impl<'a> RunnerInner<'a> { secrets_controller, cloud_resource_controller: None, system_dyncfgs, + idle_when_fenced_out: false, tls: None, frontegg: None, frontegg_oauth_issuer_url: None,