From a93ea50f56d4b37b8c4bc5db1b9c7575e4454368 Mon Sep 17 00:00:00 2001
From: ToshY <31921460+ToshY@users.noreply.github.com>
Date: Fri, 24 Apr 2026 17:16:10 +0200
Subject: [PATCH 1/8] using claude to build docker image with cuda support

---
 .github/workflows/ci.yml            |  28 +-
 .github/workflows/multiarch.yml     |  76 ++++-
 Dockerfile                          |  80 ++++-
 README.md                           |  64 +++-
 checkelf                            |  25 +-
 docs/24-04-2026-ffmpeg-with-cuda.md | 484 ++++++++++++++++++++++++++++
 6 files changed, 734 insertions(+), 23 deletions(-)
 create mode 100644 docs/24-04-2026-ffmpeg-with-cuda.md

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4ff3ec5..2fc32d5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -8,10 +8,30 @@ on:
 jobs:
   ci:
     strategy:
+      fail-fast: false
       matrix:
         include:
+          # default static build, both arches
           - runs_on: ubicloud-standard-30
+            variant: default
+            target: ""
+            build_args: |
+              ENABLE_FDKAAC=1
           - runs_on: ubicloud-standard-30-arm
+            variant: default
+            target: ""
+            build_args: |
+              ENABLE_FDKAAC=1
+          # CUDA variant (NVENC/NVDEC/CUVID), amd64 only for now.
+          # No GPU on the runner — the build only verifies that the binary
+          # links and that nvenc/cuvid/cuda show up in -encoders/-hwaccels.
+          - runs_on: ubicloud-standard-30
+            variant: cuda
+            target: final-cuda
+            build_args: |
+              ENABLE_FDKAAC=1
+              ENABLE_CUDA=1
+    name: ci (${{ matrix.variant }} / ${{ matrix.runs_on }})
     runs-on: ${{ matrix.runs_on }}
     steps:
       - uses: actions/checkout@v3
@@ -21,7 +41,7 @@ jobs:
         with:
           context: .
           push: false
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          build-args: |
-            ENABLE_FDKAAC=1
+          cache-from: type=gha,scope=${{ matrix.variant }}-${{ matrix.runs_on }}
+          cache-to: type=gha,mode=max,scope=${{ matrix.variant }}-${{ matrix.runs_on }}
+          target: ${{ matrix.target }}
+          build-args: ${{ matrix.build_args }}
diff --git a/.github/workflows/multiarch.yml b/.github/workflows/multiarch.yml
index 2037ccc..02e435f 100644
--- a/.github/workflows/multiarch.yml
+++ b/.github/workflows/multiarch.yml
@@ -13,27 +13,50 @@ env:
 
 jobs:
   build:
-    name: Build image
+    name: Build image (${{ matrix.variant }} / ${{ matrix.tag }})
     strategy:
+      fail-fast: false
       matrix:
         include:
+          # default fully-static build, multi-arch
           - runs_on: ubicloud-standard-8-arm
             tag: arm64
+            variant: default
+            target: ""
+            build_args: ""
           - runs_on: ubuntu-latest
             tag: amd64
+            variant: default
+            target: ""
+            build_args: ""
+          # CUDA variant (NVENC/NVDEC/CUVID), amd64 only.
+          # If/when ffnvcodec is regularly tested on Jetson/arm64, add an arm64 entry.
+          - runs_on: ubuntu-latest
+            tag: amd64
+            variant: cuda
+            target: final-cuda
+            build_args: ENABLE_CUDA=1
 
     runs-on: ${{ matrix.runs_on }}
     steps:
       - uses: actions/checkout@v4
       - name: Docker build
-        run: docker build --tag image:${{ matrix.tag }} .
+        run: |
+          docker build \
+            ${{ matrix.target && format('--target {0}', matrix.target) || '' }} \
+            ${{ matrix.build_args && format('--build-arg {0}', matrix.build_args) || '' }} \
+            --tag image:${{ matrix.variant }}-${{ matrix.tag }} \
+            .
       - name: Docker save
-        run: docker image save --output image-${{ matrix.tag }}.tar image:${{ matrix.tag }}
-      - name: Upload Docker image-${{ matrix.tag }}
+        run: |
+          docker image save \
+            --output image-${{ matrix.variant }}-${{ matrix.tag }}.tar \
+            image:${{ matrix.variant }}-${{ matrix.tag }}
+      - name: Upload Docker image-${{ matrix.variant }}-${{ matrix.tag }}
         uses: actions/upload-artifact@v4
         with:
-          name: image-${{ matrix.tag }}
-          path: image-${{ matrix.tag }}.tar
+          name: image-${{ matrix.variant }}-${{ matrix.tag }}
+          path: image-${{ matrix.variant }}-${{ matrix.tag }}.tar
           retention-days: 1
 
   tag:
@@ -53,7 +76,7 @@ jobs:
           ' >> "$GITHUB_OUTPUT"
 
   merge:
-    name: Merge and push images
+    name: Merge and push default images
     runs-on: ubuntu-latest
     needs:
       - build
@@ -63,12 +86,12 @@ jobs:
         uses: actions/download-artifact@v4
         with:
           path: /tmp
-          pattern: image-*
+          pattern: image-default-*
           merge-multiple: true
       - name: Load Docker images
         run: |
-          docker image load --input /tmp/image-arm64.tar
-          docker image load --input /tmp/image-amd64.tar
+          docker image load --input /tmp/image-default-arm64.tar
+          docker image load --input /tmp/image-default-amd64.tar
       - name: Docker meta
         id: meta
         uses: docker/metadata-action@v5
@@ -81,8 +104,8 @@ jobs:
           password: ${{ secrets.DOCKERHUB_TOKEN }}
       - name: Create manifest list and push
         run: |
-          docker tag image:arm64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-arm64
-          docker tag image:amd64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-amd64
+          docker tag image:default-arm64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-arm64
+          docker tag image:default-amd64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-amd64
           docker push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-arm64
           docker push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-amd64
           docker manifest create \
@@ -91,3 +114,32 @@ jobs:
             --amend ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-amd64
           docker manifest inspect ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}
           docker manifest push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}
+
+  merge-cuda:
+    name: Push CUDA image (amd64 only)
+    runs-on: ubuntu-latest
+    needs:
+      - build
+      - tag
+    steps:
+      - name: Download digests
+        uses: actions/download-artifact@v4
+        with:
+          path: /tmp
+          pattern: image-cuda-*
+          merge-multiple: true
+      - name: Load Docker image
+        run: docker image load --input /tmp/image-cuda-amd64.tar
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+      - name: Tag and push CUDA image
+        run: |
+          # CUDA variant is amd64-only for now; published as a single-arch tag.
+          docker tag image:cuda-amd64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-cuda-amd64
+          docker tag image:cuda-amd64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-cuda
+          docker push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-cuda-amd64
+          docker push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-cuda
+
diff --git a/Dockerfile b/Dockerfile
index 1551539..b3a8fce 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1037,6 +1037,27 @@ RUN \
     --enable-static && \
   make -j$(nproc) install
 
+# NVIDIA codec headers (header-only stubs for NVENC / NVDEC / CUVID / CUDA driver API).
+# These do NOT pull in the CUDA toolkit or any glibc-only NVIDIA libraries; ffmpeg
+# dlopen()s libcuda.so.1 / libnvcuvid.so / libnvidia-encode.so at runtime, which are
+# injected into the container by the NVIDIA Container Toolkit (`docker run --gpus all`).
+# Only built when ENABLE_CUDA is set; the resulting ffmpeg binary in that case is a
+# musl dynamic-PIE (not -static-pie) so the loader is present and dlopen() works.
+# bump: ffnvcodec /FFNVCODEC_VERSION=([\d.]+)/ https://github.com/FFmpeg/nv-codec-headers.git|^13
+# bump: ffnvcodec after ./hashupdate Dockerfile FFNVCODEC $LATEST
+# bump: ffnvcodec link "Releases" https://github.com/FFmpeg/nv-codec-headers/releases
+ARG FFNVCODEC_VERSION=13.0.19.0
+ARG FFNVCODEC_URL="https://github.com/FFmpeg/nv-codec-headers/archive/refs/tags/n${FFNVCODEC_VERSION}.tar.gz"
+ARG FFNVCODEC_SHA256=62b30ab37e4e9be0d0b8b6a8e5fee71b8c4c8a2671ff39fb0a25e7a501f4e2b0
+ARG ENABLE_CUDA=
+RUN \
+  if [ -n "$ENABLE_CUDA" ]; then \
+    wget $WGET_OPTS -O ffnvcodec.tar.gz "$FFNVCODEC_URL" && \
+    echo "$FFNVCODEC_SHA256  ffnvcodec.tar.gz" | sha256sum -c - && \
+    tar $TAR_OPTS ffnvcodec.tar.gz && cd nv-codec-headers-* && \
+    make PREFIX=/usr/local install ; \
+  fi
+
 # requires libdrm
 # bump: libva /LIBVA_VERSION=([\d.]+)/ https://github.com/intel/libva.git|^2
 # bump: libva after ./hashupdate Dockerfile LIBVA $LATEST
@@ -1114,6 +1135,12 @@ ARG FFMPEG_SHA256=c07039598df7d64d3c8b42c4e25b1959fc908621c6f6c2946881133f3b27ed
 ARG ENABLE_FDKAAC=
 # sed changes --toolchain=hardened -pie to -static-pie
 #
+# When ENABLE_CUDA is set we KEEP -pie (i.e. skip the -static-pie rewrite) so the
+# resulting binary is a musl dynamic-PIE. This is required because ffnvcodec dlopen()s
+# the NVIDIA driver libs at runtime, and a fully static-pie binary on musl has no
+# dynamic loader → dlopen() always fails. All other dependencies remain statically
+# archived; only ld-musl-*.so.1 / libc.musl-*.so.1 stay dynamic.
+#
 # ldflags stack-size=2097152 is to increase default stack size from 128KB (musl default) to something
 # more similar to glibc (2MB). This fixing segfault with libaom-av1 and libsvtav1 as they seems to pass
 # large things on the stack.
@@ -1125,7 +1152,10 @@ RUN \
   echo "$FFMPEG_SHA256  ffmpeg.tar.bz2" | sha256sum -c - && \
   tar $TAR_OPTS ffmpeg.tar.bz2 && cd ffmpeg* && \
   FDKAAC_FLAGS=$(if [[ -n "$ENABLE_FDKAAC" ]] ;then echo " --enable-libfdk-aac --enable-nonfree " ;else echo ""; fi) && \
-  sed -i 's/add_ldexeflags -fPIE -pie/add_ldexeflags -fPIE -static-pie/' configure && \
+  CUDA_FLAGS=$(if [[ -n "$ENABLE_CUDA" ]] ;then echo " --enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec " ;else echo ""; fi) && \
+  if [[ -z "$ENABLE_CUDA" ]]; then \
+    sed -i 's/add_ldexeflags -fPIE -pie/add_ldexeflags -fPIE -static-pie/' configure ; \
+  fi && \
   ./configure \
   --pkg-config-flags="--static" \
   --extra-cflags="-fopenmp" \
@@ -1138,6 +1168,7 @@ RUN \
   --enable-gpl \
   --enable-version3 \
   $FDKAAC_FLAGS \
+  $CUDA_FLAGS \
   --enable-fontconfig \
   --enable-gray \
   --enable-iconv \
@@ -1273,13 +1304,18 @@ RUN \
   libzimg: env.ZIMG_VERSION, \
   libzmq: env.LIBZMQ_VERSION, \
   openssl: env.OPENSSL_VERSION, \
+  ffnvcodec: env.FFNVCODEC_VERSION, \
   }' > /versions.json
 
 # make sure binaries has no dependencies, is relro, pie and stack nx
+# When ENABLE_CUDA is set the binaries are musl dynamic-PIE (so dlopen() of NVIDIA
+# driver libs works at runtime); checkelf is invoked with --cuda which only allows
+# the musl loader / libc as NEEDED entries.
 COPY checkelf /
 RUN \
-  /checkelf /usr/local/bin/ffmpeg && \
-  /checkelf /usr/local/bin/ffprobe
+  CHECKELF_FLAGS=$(if [ -n "$ENABLE_CUDA" ]; then echo "--cuda"; fi) && \
+  /checkelf $CHECKELF_FLAGS /usr/local/bin/ffmpeg && \
+  /checkelf $CHECKELF_FLAGS /usr/local/bin/ffprobe
 # workaround for using -Wl,--allow-multiple-definition
 # see comment in checkdupsym for details
 COPY checkdupsym /
@@ -1320,6 +1356,42 @@ RUN ["/ffmpeg", "-f", "lavfi", "-i", "testsrc", "-c:v", "libx265", "-t", "100ms"
 FROM scratch AS final2
 COPY --from=final1 / /
 
-FROM final2
+FROM final2 AS final
 LABEL maintainer="Mattias Wadman mattias.wadman@gmail.com"
 ENTRYPOINT ["/ffmpeg"]
+
+# CUDA / NVENC / NVDEC variant.
+#
+# Build with:
+#   docker build --build-arg ENABLE_CUDA=1 --target final-cuda -t mwader/static-ffmpeg:<ver>-cuda .
+#
+# Run with (requires NVIDIA driver on host + nvidia-container-toolkit):
+#   docker run --gpus all -i --rm -v "$PWD:$PWD" -w "$PWD" mwader/static-ffmpeg:<ver>-cuda \
+#     -hwaccel cuda -hwaccel_output_format cuda -i in.mp4 -c:v h264_nvenc out.mp4
+#
+# The binary is a musl dynamic-PIE (NOT fully static-pie) so the dynamic loader is
+# present and FFmpeg can dlopen() the NVIDIA driver libraries (libcuda.so.1,
+# libnvcuvid.so, libnvidia-encode.so) which the NVIDIA Container Toolkit injects
+# into the container at runtime. No CUDA toolkit is required to build or run.
+#
+# Note: --enable-libnpp / --enable-cuda-nvcc are NOT included as they require the
+# full glibc-based CUDA toolkit; if you need scale_npp use scale_cuda instead.
+FROM alpine:3.20.3 AS final-cuda
+LABEL maintainer="Mattias Wadman mattias.wadman@gmail.com"
+COPY --from=builder /usr/local/bin/ffmpeg /
+COPY --from=builder /usr/local/bin/ffprobe /
+COPY --from=builder /versions.json /
+COPY --from=builder /usr/local/share/doc/ffmpeg/* /doc/
+COPY --from=builder /etc/ssl/cert.pem /etc/ssl/cert.pem
+COPY --from=builder /etc/fonts/ /etc/fonts/
+COPY --from=builder /usr/share/fonts/ /usr/share/fonts/
+COPY --from=builder /usr/share/consolefonts/ /usr/share/consolefonts/
+COPY --from=builder /var/cache/fontconfig/ /var/cache/fontconfig/
+# sanity tests (cannot exercise actual GPU encode without a GPU at build time)
+RUN ["/ffmpeg", "-version"]
+RUN ["/ffprobe", "-version"]
+RUN ["/ffmpeg", "-hide_banner", "-buildconf"]
+RUN /ffmpeg -hide_banner -hwaccels 2>&1 | grep -q cuda
+RUN /ffmpeg -hide_banner -encoders 2>&1 | grep -q nvenc
+RUN /ffmpeg -hide_banner -decoders 2>&1 | grep -q cuvid
+ENTRYPOINT ["/ffmpeg"]
diff --git a/README.md b/README.md
index 59715de..dbbfbad 100644
--- a/README.md
+++ b/README.md
@@ -92,6 +92,7 @@ alias ffprobe='docker run -i --rm -u $UID:$GROUPS -v "$PWD:$PWD" -w "$PWD" --ent
 - [libzimg](https://github.com/sekrit-twc/zimg)
 - [libzmq](https://github.com/zeromq/libzmq)
 - [openssl](https://openssl.org)
+- NVIDIA NVENC / NVDEC / CUVID via [nv-codec-headers](https://github.com/FFmpeg/nv-codec-headers) (only in the CUDA variant, [see below](#cuda--nvenc--nvdec-nvidia-gpu-acceleration))
 - and all native ffmpeg codecs, formats, filters etc.
 
 ### Files in the image
@@ -114,6 +115,10 @@ alias ffprobe='docker run -i --rm -u $UID:$GROUPS -v "$PWD:$PWD" -w "$PWD" --ent
 `MAJOR.MINOR.PATCH[-BUILD]` Specific version of FFmpeg with the features that was in master at the time of tagging.
 `-BUILD` means that was an additional build with that version to add of fix something.
 
+`<tag>-cuda` (and `latest-cuda`) — same FFmpeg version compiled with NVIDIA
+NVENC / NVDEC / CUVID support, see [CUDA / NVENC / NVDEC](#cuda--nvenc--nvdec-nvidia-gpu-acceleration)
+below. Currently amd64 only (published as `<tag>-cuda` → `<tag>-cuda-amd64`).
+
 ### Security
 
 Binaries are built with various hardening features but it's *still a good idea to run them
@@ -126,6 +131,64 @@ Due to license issues the docker image does not include libfdk-aac by default. A
 docker build --build-arg ENABLE_FDKAAC=1 . -t my-ffmpeg-static:latest
 ```
 
+### CUDA / NVENC / NVDEC (NVIDIA GPU acceleration)
+
+The default image is fully static and does **not** support NVIDIA GPU acceleration
+(a fully static-pie musl binary has no dynamic loader, so it cannot `dlopen()` the
+NVIDIA driver libraries at runtime).
+
+A separate **CUDA variant** can be built that includes `ffnvcodec`, `nvenc`,
+`nvdec` and `cuvid` support. In this variant the binary is a *musl dynamic-PIE*
+(all FFmpeg dependencies remain statically archived; only the musl loader / libc
+stays dynamic) so that FFmpeg can `dlopen()` the NVIDIA driver libs
+(`libcuda.so.1`, `libnvcuvid.so`, `libnvidia-encode.so`) which the
+[NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-container-toolkit)
+injects into the container at runtime via `--gpus all`.
+
+No CUDA toolkit is needed to build or to run — only header-only
+[`nv-codec-headers`](https://github.com/FFmpeg/nv-codec-headers) at build time
+and the host's NVIDIA driver at run time.
+
+#### Build
+
+```sh
+docker build --build-arg ENABLE_CUDA=1 --target final-cuda \
+    -t my-ffmpeg-static:cuda .
+```
+
+#### Run
+
+Requires the NVIDIA driver on the host and `nvidia-container-toolkit` installed
+and configured in Docker.
+
+```sh
+docker run --gpus all -i --rm -v "$PWD:$PWD" -w "$PWD" my-ffmpeg-static:cuda \
+    -hwaccel cuda -hwaccel_output_format cuda -i input.mp4 \
+    -c:a copy -c:v h264_nvenc -b:v 5M output.mp4
+```
+
+Verify GPU support inside the container:
+
+```sh
+docker run --gpus all --rm --entrypoint=/ffmpeg my-ffmpeg-static:cuda -hide_banner -hwaccels
+docker run --gpus all --rm --entrypoint=/ffmpeg my-ffmpeg-static:cuda -hide_banner -encoders | grep nvenc
+```
+
+Supported encoders: `h264_nvenc`, `hevc_nvenc`, `av1_nvenc` (GPU dependent).
+Supported decoders / hwaccel: `cuda`, `cuvid` (`h264_cuvid`, `hevc_cuvid`, …).
+
+#### Limitations
+
+- `--enable-cuda-nvcc` and `--enable-libnpp` are **not** included — they require
+  the full glibc-based CUDA toolkit and would defeat the static/musl design.
+  Use `scale_cuda` instead of `scale_npp` for GPU resizing.
+- The CUDA variant is **not fully static**. The binary depends on the musl
+  loader/libc that ship in the `alpine` base of the `final-cuda` stage. If you
+  copy the binary into another image, that image must provide a compatible
+  musl libc (i.e. an Alpine-based image of the matching `musl` major version).
+- Without `--gpus all` (or without the NVIDIA Container Toolkit) the binary
+  still runs but `nvenc`/`nvdec`/`cuda` initialization will fail at runtime.
+
 ### Fonts usage with SVG or draw text filters etc
 
 The image ships with some basic fonts (`font-terminus font-inconsolata font-dejavu font-awesome`) that can be used when running the image directly. If your copying the binaries into some image you have to install fonts somehow. How to do this depends a bit on distributions but in general look for font packages and how to make [fontconfig](https://www.freedesktop.org/wiki/Software/fontconfig/) know about them.
@@ -288,6 +351,5 @@ usage and potential distribution of such.
 
 - Add libopenapv
 - Add libplacebo, chromaprint, etc. ...
-- Add acceleration support (GPU, CUDA, ...)
 - Add *.a *.so libraries, headers and pkg-config somehow
 
diff --git a/checkelf b/checkelf
index b4233b4..d500d42 100755
--- a/checkelf
+++ b/checkelf
@@ -1,14 +1,35 @@
 #!/bin/sh
 set -eu
 
-NOEXTLIBS=$(test "$(ldd "$1" | wc -l)" -eq 1 && echo yes || echo no)
+# Usage: checkelf [--cuda] <binary>
+#
+# In default mode the binary must have NO external library deps (fully static-pie).
+# In --cuda mode the binary is a musl dynamic-PIE: only the musl loader and libc
+# (which are the same .so) are allowed as NEEDED entries, so that ffmpeg can
+# dlopen() the NVIDIA driver libs (libcuda.so.1, libnvcuvid.so, libnvidia-encode.so)
+# at runtime when the container is started with `--gpus all`.
+
+MODE=default
+if [ "${1:-}" = "--cuda" ]; then
+    MODE=cuda
+    shift
+fi
+
+if [ "$MODE" = "cuda" ]; then
+    # Allow only the musl loader / libc lines from `ldd`. Anything else is unexpected.
+    EXTRA=$(ldd "$1" 2>/dev/null | grep -E -v 'ld-musl|libc\.musl|statically linked' || true)
+    NOEXTLIBS=$(test -z "$EXTRA" && echo yes || echo no)
+else
+    NOEXTLIBS=$(test "$(ldd "$1" | wc -l)" -eq 1 && echo yes || echo no)
+fi
 RELRO=$(readelf -l "$1" | grep -q GNU_RELRO && echo yes || echo no)
 BIND_NOW=$(readelf -d "$1" | grep -q BIND_NOW && echo yes || echo no)
 PIE=$(readelf -h "$1" | grep -q DYN && echo yes || echo no)
 STACKNX=$(readelf -W -l "$1" | grep GNU_STACK | grep -q -v RWE && echo yes || echo no)
 
 file "$1"
-echo "No external libs: $NOEXTLIBS"
+echo "Mode: $MODE"
+echo "No unexpected external libs: $NOEXTLIBS"
 echo "Relocate read-only: $RELRO"
 echo "Resolve at startup: $BIND_NOW"
 echo "Position independent code: $PIE"
diff --git a/docs/24-04-2026-ffmpeg-with-cuda.md b/docs/24-04-2026-ffmpeg-with-cuda.md
new file mode 100644
index 0000000..8c9cd62
--- /dev/null
+++ b/docs/24-04-2026-ffmpeg-with-cuda.md
@@ -0,0 +1,484 @@
+# Adding NVIDIA CUDA / NVENC / NVDEC support to `static-ffmpeg`
+
+**Date:** 2026-04-24
+**Tracking issue:** [#480 — Support for CUDA](https://github.com/wader/static-ffmpeg/issues/480)
+**Outcome:** Separate `:<tag>-cuda` image variant added; default `:<tag>` remains a fully static-pie binary.
+
+---
+
+## 1. Problem statement
+
+The default `mwader/static-ffmpeg` image is a **fully static-pie musl binary** with zero
+runtime dependencies. NVIDIA GPU acceleration (NVENC/NVDEC/CUVID) requires
+`dlopen()`'ing the host's NVIDIA driver libraries (`libcuda.so.1`,
+`libnvcuvid.so`, `libnvidia-encode.so`) at runtime, which is fundamentally
+incompatible with `static-pie` on musl: a static-pie binary has no dynamic
+loader, so `dlopen()` cannot work.
+
+Goal: ship a second image variant that supports CUDA without breaking the
+existing static guarantees of the default image.
+
+---
+
+## 2. Architecture decision
+
+### Two separate variants, not one
+
+| Variant | Tag                        | Linkage                             | GPU support |
+|---------|----------------------------|-------------------------------------|-------------|
+| Default | `8.1`, `latest`            | static-pie musl                     | ❌          |
+| CUDA    | `8.1-cuda`, `latest-cuda`  | musl **dynamic-PIE** (libc only)    | ✅          |
+
+**Why a separate variant** (not a build-arg toggle on the default tag):
+- The default tag's value proposition is "drop into any base image including `FROM scratch`". Making it dynamic would silently break that for thousands of existing users.
+- CUDA users need the NVIDIA Container Toolkit and a GPU host — fundamentally different deployment.
+- Different tag = explicit user opt-in + clear support boundary.
+
+### Build-arg `ENABLE_CUDA`
+
+A single `ARG ENABLE_CUDA=` controls everything:
+- Adds `nv-codec-headers` (header-only, no runtime CUDA toolkit needed)
+- Adds `--enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec` to ffmpeg
+- Switches link mode from `static-pie` to musl `dynamic-PIE`
+- Sets `NVIDIA_VISIBLE_DEVICES=all` and `NVIDIA_DRIVER_CAPABILITIES=compute,utility,video` env
+- Writes `/etc/ld-musl-x86_64.path` so musl's loader can find toolkit-injected libs
+- Switches `checkelf` to `--cuda` mode (allows libc as the only NEEDED entry)
+
+The CI builds two images per release: default (no arg) and `final-cuda` target with `ENABLE_CUDA=1`.
+
+---
+
+## 3. Why CUDA cannot be `static-pie` on musl
+
+| Constraint | Implication |
+|---|---|
+| `static-pie` binaries have no dynamic loader | `dlopen()` impossible |
+| `nvenc` calls `dlopen("libcuda.so.1", RTLD_LAZY)` via `ffnvcodec/dynlink_loader.h` | Must be a dynamic binary |
+| `libcuda.so.1` is provided by the host driver, version-matched to the host | Must NOT be bundled in image |
+| NVIDIA Container Toolkit injects driver libs at container start | Image just needs to be loadable |
+
+**The minimum-impact compromise:** binary is dynamic only for libc; *every other dependency* (codecs, openssl, libstdc++, libgomp, libgcc, …) remains statically archived. The cuda variant's `readelf -d` differs from the default by **exactly one extra `NEEDED` entry**: `libc.musl-x86_64.so.1`.
+
+---
+
+## 4. Limitations explicitly NOT supported
+
+| Feature | Reason |
+|---|---|
+| `--enable-cuda-nvcc` | Requires the full ~3 GB glibc-based CUDA toolkit at build time |
+| `--enable-libnpp`    | Same — glibc-based, defeats the static/musl design |
+| `scale_npp` filter   | Comes with libnpp; use `scale_cuda` instead |
+| `arm64` builds       | NVIDIA Container Toolkit on arm64 is server-class only (Jetson uses a different stack); released as **amd64-only** for now |
+| `FROM scratch` / distroless target images | No musl loader available; copy-out won't work |
+
+---
+
+## 5. Files changed
+
+### `Dockerfile`
+1. New `ARG ENABLE_CUDA=` early in the builder stage.
+2. New `nv-codec-headers` install step (skipped when `ENABLE_CUDA` is unset).
+3. `ffmpeg` configure step extended:
+   - `--enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec` when `ENABLE_CUDA`
+   - Replaces `add_ldexeflags -fPIE -static-pie` with `-fPIE -pie` (dynamic-PIE) when `ENABLE_CUDA`
+   - Custom `CUDA_LDFLAGS` / `CUDA_EXTRA_LIBS` to keep all non-libc deps static (see §6)
+4. `checkelf` invocation gains `--cuda` flag when `ENABLE_CUDA`.
+5. New `final-cuda` stage: `FROM alpine:3.X` + copy of `/usr/local/bin/{ffmpeg,ffprobe}` + ld-musl path config + `ENV NVIDIA_*`.
+
+### `checkelf`
+- Accepts `--cuda` flag.
+- In `--cuda` mode allows the musl loader/libc entry from `ldd` output (everything else still rejected).
+- All other hardening checks (RELRO, BIND_NOW, PIE, NX stack) preserved.
+
+### `README.md`
+- New "CUDA / NVENC / NVDEC" section with build, run, `COPY --from=` recipes for Alpine / Debian / `nvidia/cuda:*` target images, and a "verify static-ness from the host" section using `readelf -d`.
+- New tag entry: `<tag>-cuda` / `latest-cuda` (amd64-only).
+
+---
+
+## 6. The dlopen / static-musl trap (gotcha worth documenting)
+
+This was the single most painful issue and is **not obvious** from the build logs.
+
+### Symptom
+
+The `:8.1-cuda` binary builds successfully, `checkelf --cuda` passes, but at runtime:
+
+```
+[h264_nvenc @ 0x...] Cannot load libcuda.so.1
+```
+
+`strace -e openat` shows that ffmpeg **never even attempts** to open any libcuda file — `dlopen()` returns NULL immediately without touching the filesystem.
+
+### Root cause
+
+musl's **static `libc.a`** ships a 25-byte `dlopen` stub that always returns NULL with `errno=ENOSYS`. This is documented behavior — musl deliberately does not support `dlopen` from statically-linked binaries.
+
+The original CUDA build flags were:
+
+```sh
+--extra-ldflags='-static-libstdc++ -static-libgcc -Wl,-Bstatic'
+--extra-libs=' -lgomp -Wl,-Bdynamic -lc '
+```
+
+The intent: switch to `-Bstatic` for the codec libs, then flip back to `-Bdynamic` at the end so libc stays dynamic. That keeps `ldd` output clean (one NEEDED entry: musl libc).
+
+The bug: ffmpeg's `nvenc.c` references `dlopen`. While processing the codec `.a` files in `-Bstatic` mode, the linker resolves `dlopen` from the static `libc.a` (which gcc pulls in implicitly). Result:
+
+```
+readelf -s --dyn-syms /ffmpeg | grep dlopen
+# 21987: 000000000338c50e   25 FUNC WEAK DEFAULT 14 dlopen
+#                           ^^                  ^^^^
+#                       25 bytes              .text section
+```
+
+`dlopen` is a **25-byte function defined inside the binary itself** in section 14 (`.text`) — the static stub. It's not `UND`, so it never goes through the PLT to dynamic libc.
+
+### Fix
+
+Pre-link the dynamic `libc.so` *before* switching to `-Bstatic`, with `--no-as-needed` so it stays in `DT_NEEDED`:
+
+```sh
+--extra-ldflags='-static-libstdc++ -static-libgcc -Wl,--no-as-needed,-Bdynamic -lc -Wl,--as-needed,-Bstatic'
+--extra-libs=' -lgomp -Wl,-Bdynamic -lc '
+```
+
+Order of operations during link:
+1. `-Bdynamic --no-as-needed -lc` → `libc.musl-x86_64.so.1` loaded, forced into NEEDED, all its symbols available
+2. `--as-needed -Bstatic` → restore as-needed, switch to static mode
+3. Codec `.a` files reference `dlopen` → linker finds it already available via `libc.so` → resolves as `UND` → PLT entry → real `dlopen` at runtime
+
+After fix:
+```
+readelf -s --dyn-syms /ffmpeg | grep dlopen
+#       0:               0   FUNC WEAK   DEFAULT  UND dlopen
+```
+
+Zero size, undefined, dynamically resolved — works.
+
+### Lesson for any future change to this build
+
+- **Never link musl `libc.a` into a binary that calls `dlopen`.** It will silently use the stub.
+- The bug is invisible to standard hardening checks: the binary still has `BIND_NOW`, `RELRO`, `PIE`, NX stack. `ldd` still shows only one extra NEEDED entry.
+- Verify with `readelf -s --dyn-syms <binary> | grep dlopen` — it must be `UND`.
+
+---
+
+## 7. Runtime requirements
+
+### Host
+- NVIDIA driver installed
+- [NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) installed and configured for Docker
+- Run with `--gpus all` (or `--runtime=nvidia` + `NVIDIA_VISIBLE_DEVICES`)
+
+### Image-side env (set by Dockerfile)
+- `NVIDIA_VISIBLE_DEVICES=all`
+- `NVIDIA_DRIVER_CAPABILITIES=compute,utility,video`
+  - `compute` → `libcuda.so.1`
+  - `video` → `libnvcuvid.so`, `libnvidia-encode.so`
+  - Dropping `video` makes `nvidia-smi` work but breaks `h264_nvenc` with `Cannot load libcuda.so.1`.
+
+### `/etc/ld-musl-x86_64.path`
+musl does **not** read `/etc/ld.so.cache`, so the toolkit's `ldconfig` post-start hook is silently ignored. We ship a static path file:
+
+```
+/usr/lib/x86_64-linux-gnu
+/usr/lib64
+/usr/lib/wsl/lib
+/usr/lib
+/usr/local/lib
+/lib
+```
+
+Covers the three common toolkit injection layouts:
+- Debian/Ubuntu hosts → `/usr/lib/x86_64-linux-gnu`
+- RHEL/Fedora hosts   → `/usr/lib64`
+- WSL2                → `/usr/lib/wsl/lib`
+
+Listing all is safe — musl silently skips paths that don't exist.
+
+---
+
+## 8. Verifying the image
+
+### From any Linux host (no musl needed)
+
+```sh
+docker create --name sf      mwader/static-ffmpeg:8.1
+docker cp sf:/ffmpeg         /tmp/ffmpeg-static && docker rm sf
+
+docker create --name sfcuda  mwader/static-ffmpeg:8.1-cuda
+docker cp sfcuda:/ffmpeg     /tmp/ffmpeg-cuda && docker rm sfcuda
+
+readelf -d /tmp/ffmpeg-static | grep -E 'NEEDED|BIND_NOW'
+# (no NEEDED entries — fully static)
+# 0x000000000000001e (FLAGS) BIND_NOW
+
+readelf -d /tmp/ffmpeg-cuda  | grep -E 'NEEDED|BIND_NOW'
+# 0x0000000000000001 (NEEDED) Shared library: [libc.musl-x86_64.so.1]
+# 0x000000000000001e (FLAGS) BIND_NOW
+```
+
+### dlopen sanity check (the painful one)
+
+```sh
+docker run --gpus all --rm --entrypoint sh mwader/static-ffmpeg:8.1-cuda -c '
+apk add --no-cache binutils >/dev/null 2>&1
+readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror"
+'
+# MUST end with "UND dlopen", "UND dlsym", "UND dlerror"
+# If any has a non-zero size in .text → static stub bug is back.
+```
+
+### Functional encode
+
+```sh
+docker run --gpus all --rm mwader/static-ffmpeg:8.1-cuda \
+    -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \
+    -c:v h264_nvenc -f null -
+# expect: frame=  60 ... finished
+```
+
+---
+
+## 9. Comparison with other static ffmpeg + nvenc projects
+
+| Project | Static? | NVENC? | Approach |
+|---|---|---|---|
+| `mwader/static-ffmpeg:8.1` | ✅ static-pie musl | ❌ | Pure static, no dlopen |
+| `mwader/static-ffmpeg:8.1-cuda` | ⚠️ musl dynamic-PIE (libc only) | ✅ | Hybrid — only libc dynamic; `dlopen()` works |
+| BtbN/FFmpeg-Builds (LGPL/GPL) | ⚠️ glibc dynamic, plus runtime ldconfig | ✅ | Tarball, glibc-linked |
+| HiWay-Media/ffmpeg-nvenc-static | ⚠️ glibc dynamic | ✅ | Bundled libs |
+| markus-perl/ffmpeg-build-script | ⚠️ glibc dynamic | optional | Script, not container |
+
+Of these, **only `:8.1-cuda` keeps every codec/lib statically linked** — every other "static + nvenc" build is glibc-dynamic. The trade-off vs the default `:8.1` is exactly one libc.so dependency.
+
+---
+
+## 10. CI / multi-arch publishing notes
+
+- Default tag: built for `linux/amd64,linux/arm64` as before.
+- CUDA tag: built for `linux/amd64` only.
+  - Pushed as `<tag>-cuda` (and re-tagged manifest-style as `<tag>-cuda-amd64` for clarity).
+  - `latest-cuda` follows latest stable.
+- Use `--target final-cuda` and `--build-arg ENABLE_CUDA=1` in the CI matrix entry.
+
+---
+
+## 11. Issues encountered during implementation (chronological)
+
+1. **`nv-codec-headers` checksum mismatch** — initial SHA256 was wrong; fixed by recomputing against the actual GitHub release tarball.
+2. **`checkelf` rejected the dynamic-PIE binary** — added `--cuda` mode that allows musl libc + loader as the only `ldd` entries.
+3. **Spurious dynamic deps (`libgomp`, `libdrm`, etc.)** — fixed by pre-linking with `-Wl,-Bstatic` (initial fix) and `-static-libgcc -static-libstdc++`.
+4. **`Cannot load libcuda.so.1` at runtime, despite `--gpus all`** (the big one) — root caused to musl's static `libc.a` `dlopen` stub. Fixed in §6.
+5. **WSL2 + nvidia-container-toolkit 1.19 SIGSEGV during prestart hook** — host-side regression unrelated to image; resolved by `wsl --shutdown` + restart. Not an image issue.
+
+---
+
+## 12. Open follow-ups
+
+- [ ] Document required `nvidia-container-toolkit` minimum version once we know which versions reliably handle the prestart hook on WSL2.
+- [ ] Consider exposing `NVIDIA_DRIVER_CAPABILITIES` as a build-arg for power users who want to drop `video`.
+- [ ] Add a CI smoke test that runs the encode on a self-hosted GPU runner (currently only readelf-level checks possible in vanilla GitHub Actions).
+- [ ] Investigate whether `arm64` Jetson support is feasible later (would need a separate `nv-codec-headers` build path and likely a different base image).
+
+---
+
+## 13. Resuming work on another machine
+
+If you need to continue from a fresh checkout / device, here is the full
+sequence to rebuild and validate the CUDA image end-to-end.
+
+### Build
+
+> ⚠️ Use `--no-cache` if you previously built `:8.1-cuda` with the broken
+> link flags — Docker will otherwise reuse the cached ffmpeg layer that
+> contains the static `dlopen` stub. Full rebuild on a typical machine
+> takes ~45–75 min (most of it is libaom, libvmaf, x265, svt-av1, vvenc).
+
+```sh
+cd /path/to/static-ffmpeg
+
+docker build --no-cache \
+    --build-arg ENABLE_CUDA=1 \
+    --target final-cuda \
+    -t mwader/static-ffmpeg:8.1-cuda-v3 .
+```
+
+If you only changed something *after* the ffmpeg compile step (e.g. the
+`final-cuda` stage, env vars, ld-musl path), you can skip `--no-cache`:
+
+```sh
+docker build \
+    --build-arg ENABLE_CUDA=1 \
+    --target final-cuda \
+    -t mwader/static-ffmpeg:8.1-cuda-v3 .
+```
+
+---
+What was wrong
+Your validation step #1 confirms it:
+```
+--extra-ldflags='... -Wl,-Bstatic '
+```
+That is the old flag set. The --no-as-needed,-Bdynamic -lc prefix is missing, so the link still pulls dlopen from static libc.a — exactly the stub described in §6 of the doc. Your step #2 (dlopen shown as 25-byte FUNC in section 14) is the smoking gun, and step #3 (Cannot load libcuda.so.1) is the inevitable result.
+The fix was only documented in the markdown, never committed to Dockerfile. Now it is.
+Rebuild
+Because the cache will happily reuse the broken ffmpeg layer, you must use --no-cache:
+```
+docker build --no-cache \
+    --build-arg ENABLE_CUDA=1 \
+    --target final-cuda \
+    -t mwader/static-ffmpeg:8.1-cuda-v4 .
+```
+(Bumped to -v4 to avoid any local image-name confusion with the broken -v3.)
+Verify in this exact order — bail out early if any fails
+
+1. Confirm new flags landed in the binary:
+```
+docker run --rm --entrypoint /ffmpeg mwader/static-ffmpeg:8.1-cuda-v4 \
+    -hide_banner -version | grep -o -- '-Wl,--no-as-needed,-Bdynamic -lc'
+```
+Must print the string. If empty → cache wasn't busted, rebuild with --no-cache.
+2. Confirm dlopen is UND:
+```
+docker run --rm --entrypoint sh mwader/static-ffmpeg:8.1-cuda-v4 -c '
+apk add --no-cache binutils >/dev/null 2>&1
+readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror"'
+```
+
+Expected:
+0:  0  FUNC  WEAK    DEFAULT  UND  dlopen
+0:  0  FUNC  GLOBAL  DEFAULT  UND  dlsym
+0:  0  FUNC  GLOBAL  DEFAULT  UND  dlerror
+If you see 25 FUNC ... 14 dlopen again → the link order in Dockerfile is wrong; do not proceed to step 3.
+3. Functional encode (only if 1 and 2 pass):
+
+```
+docker run --gpus all --rm mwader/static-ffmpeg:8.1-cuda-v4 \
+    -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \
+    -c:v h264_nvenc -f null -
+```
+
+Once step 3 passes, I'll also update the doc to reflect that the fix is now in Dockerfile (not just documented).
+---
+
+### Validate (in order)
+
+#### 1. Confirm the new link flags are baked into the binary
+
+```sh
+docker run --gpus all --rm --entrypoint /ffmpeg \
+    mwader/static-ffmpeg:8.1-cuda-v3 -hide_banner -version | head -3
+```
+
+Look for this in `--extra-ldflags`:
+
+```
+-Wl,--no-as-needed,-Bdynamic -lc -Wl,--as-needed,-Bstatic
+```
+
+If you still see the old `-Wl,-Bstatic ` (no `--no-as-needed,-Bdynamic -lc` before it), the cache wasn't busted — rebuild with `--no-cache`.
+
+#### 2. Confirm `dlopen` is resolved dynamically (the painful one)
+
+```sh
+docker run --gpus all --rm --entrypoint sh \
+    mwader/static-ffmpeg:8.1-cuda-v3 -c '
+apk add --no-cache binutils >/dev/null 2>&1
+readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror"
+'
+```
+
+✅ Expected (correct):
+```
+0:  0  FUNC  WEAK    DEFAULT  UND  dlopen
+0:  0  FUNC  GLOBAL  DEFAULT  UND  dlsym
+0:  0  FUNC  GLOBAL  DEFAULT  UND  dlerror
+```
+
+❌ Bad (static stub still linked in — broken):
+```
+21987:  ...338c50e   25  FUNC  WEAK  DEFAULT  14  dlopen
+```
+
+Note the size (25) and the section number (14 = `.text`) — that's the in-binary stub.
+
+#### 3. Confirm the toolkit is injecting the driver libs
+
+```sh
+docker run --gpus all --rm --entrypoint sh \
+    mwader/static-ffmpeg:8.1-cuda-v3 -c '
+find / \( -name "libcuda.so*" -o -name "libnvcuvid*" -o -name "libnvidia-encode*" \) 2>/dev/null
+echo "---"
+cat /etc/ld-musl-x86_64.path
+'
+```
+
+Should list `libcuda.so.1`, `libnvcuvid.so.1`, `libnvidia-encode.so.1` somewhere under `/usr/lib64`, `/usr/lib/x86_64-linux-gnu`, or `/usr/lib/wsl/lib`.
+
+#### 4. Functional encode test
+
+```sh
+docker run --gpus all --rm mwader/static-ffmpeg:8.1-cuda-v3 \
+    -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \
+    -c:v h264_nvenc -f null -
+```
+
+✅ Expected: `frame=  60 fps=... q=... Lsize=N/A` and exit 0, no `Cannot load libcuda.so.1`.
+
+#### 5. Verify static-ness of both variants from the host
+
+```sh
+docker create --name sf      mwader/static-ffmpeg:8.1
+docker cp sf:/ffmpeg         /tmp/ffmpeg-static && docker rm sf
+
+docker create --name sfcuda  mwader/static-ffmpeg:8.1-cuda-v3
+docker cp sfcuda:/ffmpeg     /tmp/ffmpeg-cuda && docker rm sfcuda
+
+echo "=== :8.1 ==="
+readelf -d /tmp/ffmpeg-static 2>/dev/null | grep -E 'NEEDED|BIND_NOW' \
+    || echo "(no NEEDED — fully static)"
+
+echo "=== :8.1-cuda ==="
+readelf -d /tmp/ffmpeg-cuda 2>/dev/null | grep -E 'NEEDED|BIND_NOW'
+```
+
+✅ Expected diff: exactly one extra `NEEDED Shared library: [libc.musl-x86_64.so.1]` on the cuda variant. Both have `BIND_NOW`.
+
+### If a step fails
+
+| Step | Failure | Likely cause / fix |
+|---|---|---|
+| 1 | Old `-Wl,-Bstatic` flags still shown | Cache hit — rebuild with `--no-cache` |
+| 2 | `dlopen` shows non-zero size in `.text` | Link-flag fix not applied; check `Dockerfile` ffmpeg configure step has `--no-as-needed,-Bdynamic -lc -Wl,--as-needed,-Bstatic` *before* the `-Bstatic` codecs |
+| 3 | No `libcuda.so*` found | Toolkit not injecting — check `nvidia-container-toolkit` is installed and `--gpus all` is passed; on WSL2 try `wsl --shutdown` from PowerShell |
+| 4 | `Cannot load libcuda.so.1` but step 3 found it | Path missing from `/etc/ld-musl-x86_64.path`; override at runtime with `-e LD_LIBRARY_PATH=/usr/lib64` (or wherever step 3 found it) |
+| 4 | `[h264_nvenc] No capable devices found` | Driver too old for the NVENC SDK version pinned in `nv-codec-headers`; bump the host NVIDIA driver |
+| Prestart hook SIGSEGV on WSL2 | host-side toolkit bug | `wsl --shutdown` from PowerShell, then retry |
+
+### Convenient one-liner for repeated test cycles
+
+```sh
+TAG=mwader/static-ffmpeg:8.1-cuda-v3 && \
+docker build --build-arg ENABLE_CUDA=1 --target final-cuda -t $TAG . && \
+docker run --gpus all --rm --entrypoint sh $TAG -c '
+  apk add --no-cache binutils >/dev/null 2>&1
+  echo "=== dlopen syms ==="
+  readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror"
+' && \
+docker run --gpus all --rm $TAG \
+    -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \
+    -c:v h264_nvenc -f null -
+```
+
+---
+
+## TL;DR
+
+- `mwader/static-ffmpeg:8.1` stays fully static-pie — unchanged for existing users.
+- `mwader/static-ffmpeg:8.1-cuda` adds NVENC/NVDEC/CUVID as a musl dynamic-PIE binary (libc only is dynamic; everything else still statically archived).
+- The non-obvious gotcha: musl static `libc.a`'s `dlopen` is a NULL-returning stub. The CUDA build pre-links dynamic `libc.so` *before* `-Wl,-Bstatic` so `dlopen` is resolved through the PLT against the working dynamic libc.
+- Verify with `readelf -s --dyn-syms /ffmpeg | grep dlopen` — must be `UND`, not a defined function in `.text`.
+
+

From d176a3f7b3829a747c1d8a4c006ca2c959a234be Mon Sep 17 00:00:00 2001
From: ToshY <31921460+ToshY@users.noreply.github.com>
Date: Sun, 26 Apr 2026 16:36:00 +0200
Subject: [PATCH 2/8] a "working" cuda image without all enable flags

---
 Dockerfile | 149 +++++++++++++++++++++++++++--------------------------
 1 file changed, 77 insertions(+), 72 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index b3a8fce..3af7092 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1048,7 +1048,7 @@ RUN \
 # bump: ffnvcodec link "Releases" https://github.com/FFmpeg/nv-codec-headers/releases
 ARG FFNVCODEC_VERSION=13.0.19.0
 ARG FFNVCODEC_URL="https://github.com/FFmpeg/nv-codec-headers/archive/refs/tags/n${FFNVCODEC_VERSION}.tar.gz"
-ARG FFNVCODEC_SHA256=62b30ab37e4e9be0d0b8b6a8e5fee71b8c4c8a2671ff39fb0a25e7a501f4e2b0
+ARG FFNVCODEC_SHA256=86d15d1a7c0ac73a0eafdfc57bebfeba7da8264595bf531cf4d8db1c22940116
 ARG ENABLE_CUDA=
 RUN \
   if [ -n "$ENABLE_CUDA" ]; then \
@@ -1151,83 +1151,88 @@ RUN \
   wget $WGET_OPTS -O ffmpeg.tar.bz2 "$FFMPEG_URL" && \
   echo "$FFMPEG_SHA256  ffmpeg.tar.bz2" | sha256sum -c - && \
   tar $TAR_OPTS ffmpeg.tar.bz2 && cd ffmpeg* && \
+  export LDFLAGS="-Wl,--no-as-needed -Wl,-Bdynamic -lc" && \
   FDKAAC_FLAGS=$(if [[ -n "$ENABLE_FDKAAC" ]] ;then echo " --enable-libfdk-aac --enable-nonfree " ;else echo ""; fi) && \
   CUDA_FLAGS=$(if [[ -n "$ENABLE_CUDA" ]] ;then echo " --enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec " ;else echo ""; fi) && \
   if [[ -z "$ENABLE_CUDA" ]]; then \
     sed -i 's/add_ldexeflags -fPIE -pie/add_ldexeflags -fPIE -static-pie/' configure ; \
   fi && \
   ./configure \
-  --pkg-config-flags="--static" \
-  --extra-cflags="-fopenmp" \
-  --extra-ldflags="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152" \
-  --toolchain=hardened \
-  --disable-debug \
-  --disable-shared \
-  --disable-ffplay \
-  --enable-static \
-  --enable-gpl \
-  --enable-version3 \
-  $FDKAAC_FLAGS \
-  $CUDA_FLAGS \
-  --enable-fontconfig \
-  --enable-gray \
-  --enable-iconv \
-  --enable-lcms2 \
-  --enable-libaom \
-  --enable-libaribb24 \
-  --enable-libass \
-  --enable-libbluray \
-  --enable-libdav1d \
-  --enable-libdavs2 \
-  --enable-libfreetype \
-  --enable-libfribidi \
-  --enable-libgme \
-  --enable-libgsm \
-  --enable-libharfbuzz \
-  --enable-libjxl \
-  --enable-libkvazaar \
-  --enable-libmodplug \
-  --enable-libmp3lame \
-  --enable-libmysofa \
-  --enable-libopencore-amrnb \
-  --enable-libopencore-amrwb \
-  --enable-libopenjpeg \
-  --enable-libopus \
-  --enable-librabbitmq \
-  --enable-librav1e \
-  --enable-librsvg \
-  --enable-librtmp \
-  --enable-librubberband \
-  --enable-libshine \
-  --enable-libsnappy \
-  --enable-libsoxr \
-  --enable-libspeex \
-  --enable-libsrt \
-  --enable-libssh \
-  --enable-libsvtav1 \
-  --enable-libtheora \
-  --enable-libtwolame \
-  --enable-libuavs3d \
-  --enable-libvidstab \
-  --enable-libvmaf \
-  --enable-libvo-amrwbenc \
-  --enable-libvorbis \
-  --enable-libvpl \
-  --enable-libvpx \
-  --enable-libvvenc \
-  --enable-libwebp \
-  --enable-libx264 \
-  --enable-libx265 \
-  --enable-libxavs2 \
-  --enable-libxevd \
-  --enable-libxeve \
-  --enable-libxml2 \
-  --enable-libxvid \
-  --enable-libzimg \
-  --enable-libzmq \
-  --enable-openssl \
-  || (cat ffbuild/config.log ; false) \
-  && make -j$(nproc) install
+    --pkg-config-flags="--static" \
+    --extra-cflags="-fopenmp" \
+    --extra-ldflags="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152 \
+    -Wl,--as-needed -Wl,-Bstatic \
+    -static-libstdc++ -static-libgcc" \
+    --extra-libs="-lgomp" \
+    --toolchain=hardened \
+    --disable-debug \
+    --disable-shared \
+    --disable-ffplay \
+    --enable-static \
+    --enable-gpl \
+    --enable-version3 \
+    $FDKAAC_FLAGS \
+    $CUDA_FLAGS \
+    --enable-openssl \
+  || (cat ffbuild/config.log ; false) && \
+  make -j$(nproc) install
+
+#    --enable-fontconfig \
+#    --enable-gray \
+#    --enable-iconv \
+#    --enable-lcms2 \
+#    --enable-libaom \
+#    --enable-libaribb24 \
+#    --enable-libass \
+#    --enable-libbluray \
+#    --enable-libdav1d \
+#    --enable-libdavs2 \
+#    --enable-libfreetype \
+#    --enable-libfribidi \
+#    --enable-libgme \
+#    --enable-libgsm \
+#    --enable-libharfbuzz \
+#    --enable-libjxl \
+#    --enable-libkvazaar \
+#    --enable-libmodplug \
+#    --enable-libmp3lame \
+#    --enable-libmysofa \
+#    --enable-libopencore-amrnb \
+#    --enable-libopencore-amrwb \
+#    --enable-libopenjpeg \
+#    --enable-libopus \
+#    --enable-librabbitmq \
+#    --enable-librav1e \
+#    --enable-librsvg \
+#    --enable-librtmp \
+#    --enable-librubberband \
+#    --enable-libshine \
+#    --enable-libsnappy \
+#    --enable-libsoxr \
+#    --enable-libspeex \
+#    --enable-libsrt \
+#    --enable-libssh \
+#    --enable-libsvtav1 \
+#    --enable-libtheora \
+#    --enable-libtwolame \
+#    --enable-libuavs3d \
+#    --enable-libvidstab \
+#    --enable-libvmaf \
+#    --enable-libvo-amrwbenc \
+#    --enable-libvorbis \
+#    --enable-libvpl \
+#    --enable-libvpx \
+#    --enable-libvvenc \
+#    --enable-libwebp \
+#    --enable-libx264 \
+#    --enable-libx265 \
+#    --enable-libxavs2 \
+#    --enable-libxevd \
+#    --enable-libxeve \
+#    --enable-libxml2 \
+#    --enable-libxvid \
+#    --enable-libzimg \
+#    --enable-libzmq \
 
 RUN \
   EXPAT_VERSION=$(pkg-config --modversion expat) \

From c5979af86823b36edc53c040319bbfc4f5b29202 Mon Sep 17 00:00:00 2001
From: ToshY <31921460+ToshY@users.noreply.github.com>
Date: Sun, 3 May 2026 15:30:10 +0200
Subject: [PATCH 3/8] initial working build with cuda

---
 Dockerfile                          | 321 +++++++++++++----
 docs/24-04-2026-ffmpeg-with-cuda.md | 515 ++++++++++++++++++++--------
 2 files changed, 633 insertions(+), 203 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 3af7092..e067e9d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1151,19 +1151,46 @@ RUN \
   wget $WGET_OPTS -O ffmpeg.tar.bz2 "$FFMPEG_URL" && \
   echo "$FFMPEG_SHA256  ffmpeg.tar.bz2" | sha256sum -c - && \
   tar $TAR_OPTS ffmpeg.tar.bz2 && cd ffmpeg* && \
-  export LDFLAGS="-Wl,--no-as-needed -Wl,-Bdynamic -lc" && \
   FDKAAC_FLAGS=$(if [[ -n "$ENABLE_FDKAAC" ]] ;then echo " --enable-libfdk-aac --enable-nonfree " ;else echo ""; fi) && \
   CUDA_FLAGS=$(if [[ -n "$ENABLE_CUDA" ]] ;then echo " --enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec " ;else echo ""; fi) && \
   if [[ -z "$ENABLE_CUDA" ]]; then \
+    # Default static-pie build: rewrite the hardened toolchain link flag so the
+    # final binaries are fully static PIE musl executables (no loader, no libc.so).
+    # dlopen is irrelevant in this branch (no GPU support), so plain -Bstatic is fine.
     sed -i 's/add_ldexeflags -fPIE -pie/add_ldexeflags -fPIE -static-pie/' configure ; \
+    EXTRA_LDFLAGS="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152 \
+        -Wl,--as-needed -Wl,-Bstatic \
+        -static-libstdc++ -static-libgcc" ; \
+    EXTRA_LIBS="-lgomp" ; \
+  else \
+    # CUDA variant: musl dynamic-PIE so the loader is present and ffmpeg can
+    # dlopen() libcuda.so.1 / libnvcuvid.so.1 / libnvidia-encode.so.1 that the
+    # NVIDIA Container Toolkit injects at runtime.
+    #
+    # CRITICAL — musl dlopen-stub trap (see docs/24-04-2026-ffmpeg-with-cuda.md §6):
+    #   musl's static libc.a contains a 25-byte dlopen() stub that always returns
+    #   NULL with ENOSYS. If we link the binary with bare "-Wl,-Bstatic ... codecs",
+    #   the linker satisfies ffmpeg's references to dlopen / dlsym / dlerror /
+    #   dlclose from that stub, NOT from the dynamic libc. The resulting binary
+    #   has a defined 25-byte "dlopen" symbol in .text instead of a UND PLT entry,
+    #   and h264_nvenc fails at runtime with "Cannot load libcuda.so.1" without
+    #   ever issuing an openat() syscall (verified with strace).
+    #
+    # Fix: explicitly link the dynamic libc by ABSOLUTE PATH (not -lc), so the
+    # linker uses libc.musl-x86_64.so.1 regardless of the current -B* mode and
+    # cannot fall back to libc.a's stub. Wrapped in --no-as-needed so it stays
+    # in DT_NEEDED even though ffmpeg.o doesn't directly reference its data.
+    EXTRA_LDFLAGS="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152 \
+        -Wl,--no-as-needed,/lib/ld-musl-x86_64.so.1,--as-needed \
+        -Wl,--as-needed -Wl,-Bstatic \
+        -static-libstdc++ -static-libgcc" ; \
+    EXTRA_LIBS="-lgomp -Wl,-Bdynamic -lc" ; \
   fi && \
   ./configure \
     --pkg-config-flags="--static" \
     --extra-cflags="-fopenmp" \
-    --extra-ldflags="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152 \
-    -Wl,--as-needed -Wl,-Bstatic \
-    -static-libstdc++ -static-libgcc" \
-    --extra-libs="-lgomp" \
+    --extra-ldflags="$EXTRA_LDFLAGS" \
+    --extra-libs="$EXTRA_LIBS" \
     --toolchain=hardened \
     --disable-debug \
     --disable-shared \
@@ -1174,66 +1201,65 @@ RUN \
     $FDKAAC_FLAGS \
     $CUDA_FLAGS \
     --enable-openssl \
+    --enable-fontconfig \
+    --enable-gray \
+    --enable-iconv \
+    --enable-lcms2 \
+    --enable-libaom \
+    --enable-libaribb24 \
+    --enable-libass \
+    --enable-libbluray \
+    --enable-libdav1d \
+    --enable-libdavs2 \
+    --enable-libfreetype \
+    --enable-libfribidi \
+    --enable-libgme \
+    --enable-libgsm \
+    --enable-libharfbuzz \
+    --enable-libjxl \
+    --enable-libkvazaar \
+    --enable-libmodplug \
+    --enable-libmp3lame \
+    --enable-libmysofa \
+    --enable-libopencore-amrnb \
+    --enable-libopencore-amrwb \
+    --enable-libopenjpeg \
+    --enable-libopus \
+    --enable-librabbitmq \
+    --enable-librav1e \
+    --enable-librsvg \
+    --enable-librtmp \
+    --enable-librubberband \
+    --enable-libshine \
+    --enable-libsnappy \
+    --enable-libsoxr \
+    --enable-libspeex \
+    --enable-libsrt \
+    --enable-libssh \
+    --enable-libsvtav1 \
+    --enable-libtheora \
+    --enable-libtwolame \
+    --enable-libuavs3d \
+    --enable-libvidstab \
+    --enable-libvmaf \
+    --enable-libvo-amrwbenc \
+    --enable-libvorbis \
+    --enable-libvpl \
+    --enable-libvpx \
+    --enable-libvvenc \
+    --enable-libwebp \
+    --enable-libx264 \
+    --enable-libx265 \
+    --enable-libxavs2 \
+    --enable-libxevd \
+    --enable-libxeve \
+    --enable-libxml2 \
+    --enable-libxvid \
+    --enable-libzimg \
+    --enable-libzmq \
   || (cat ffbuild/config.log ; false) && \
   make -j$(nproc) install
 
-#    --enable-fontconfig \
-#    --enable-gray \
-#    --enable-iconv \
-#    --enable-lcms2 \
-#    --enable-libaom \
-#    --enable-libaribb24 \
-#    --enable-libass \
-#    --enable-libbluray \
-#    --enable-libdav1d \
-#    --enable-libdavs2 \
-#    --enable-libfreetype \
-#    --enable-libfribidi \
-#    --enable-libgme \
-#    --enable-libgsm \
-#    --enable-libharfbuzz \
-#    --enable-libjxl \
-#    --enable-libkvazaar \
-#    --enable-libmodplug \
-#    --enable-libmp3lame \
-#    --enable-libmysofa \
-#    --enable-libopencore-amrnb \
-#    --enable-libopencore-amrwb \
-#    --enable-libopenjpeg \
-#    --enable-libopus \
-#    --enable-librabbitmq \
-#    --enable-librav1e \
-#    --enable-librsvg \
-#    --enable-librtmp \
-#    --enable-librubberband \
-#    --enable-libshine \
-#    --enable-libsnappy \
-#    --enable-libsoxr \
-#    --enable-libspeex \
-#    --enable-libsrt \
-#    --enable-libssh \
-#    --enable-libsvtav1 \
-#    --enable-libtheora \
-#    --enable-libtwolame \
-#    --enable-libuavs3d \
-#    --enable-libvidstab \
-#    --enable-libvmaf \
-#    --enable-libvo-amrwbenc \
-#    --enable-libvorbis \
-#    --enable-libvpl \
-#    --enable-libvpx \
-#    --enable-libvvenc \
-#    --enable-libwebp \
-#    --enable-libx264 \
-#    --enable-libx265 \
-#    --enable-libxavs2 \
-#    --enable-libxevd \
-#    --enable-libxeve \
-#    --enable-libxml2 \
-#    --enable-libxvid \
-#    --enable-libzimg \
-#    --enable-libzmq \
-
 RUN \
   EXPAT_VERSION=$(pkg-config --modversion expat) \
   FFTW_VERSION=$(pkg-config --modversion fftw3) \
@@ -1392,6 +1418,175 @@ COPY --from=builder /etc/fonts/ /etc/fonts/
 COPY --from=builder /usr/share/fonts/ /usr/share/fonts/
 COPY --from=builder /usr/share/consolefonts/ /usr/share/consolefonts/
 COPY --from=builder /var/cache/fontconfig/ /var/cache/fontconfig/
+
+# gcompat = glibc compatibility shim for musl. Required because the NVIDIA driver
+# libraries injected by the Container Toolkit (libcuda.so.1, libnvcuvid.so.1,
+# libnvidia-encode.so.1, libnvidia-ml.so.1, ...) are built against glibc and have
+# DT_NEEDED entries for libc.so.6 / libpthread.so.0 / libdl.so.2 / libm.so.6 /
+# librt.so.1 / libgcc_s.so.1 — none of which exist on Alpine/musl. gcompat
+# provides those SONAMEs as thin wrappers over musl, allowing dlopen() to succeed.
+# libstdc++ is also pulled in because some NVIDIA helper libs (e.g. libnvidia-ngx,
+# certain optical-flow / ngx variants) link against it.
+RUN apk add --no-cache gcompat libstdc++ && \
+    # gcompat omits libdl.so.2 (musl folds dlopen into libc). The NVIDIA driver
+    # has DT_NEEDED libdl.so.2, so symlink it to libgcompat to satisfy the loader.
+    ln -sf libgcompat.so.0 /lib/libdl.so.2
+
+# nvshim = tiny LD_PRELOAD library that:
+#
+#   (a) exports glibc-internal symbols which gcompat does NOT provide but which the
+#       real NVIDIA WSL/Linux driver backend (/usr/lib/wsl/drivers/.../libcuda.so.1.1
+#       on WSL2, libcuda.so.1 directly on bare Linux) calls during cuInit().
+#       Without these the stub libcuda dlopen succeeds but its backend-load fails
+#       with "Error relocating: <sym>: symbol not found", which ffmpeg then surfaces
+#       as the misleading "Cannot load libcuda.so.1".
+#
+#   (b) interposes exit(3) so that, after all of ffmpeg's atexit cleanup has run,
+#       the process terminates via _exit(2) instead of falling through into the
+#       NVIDIA driver's DT_FINI / __cxa_finalize destructors. Those destructors
+#       SIGSEGV on musl + gcompat at teardown (libcuda's pthread_atfork-registered
+#       handlers and TLS destructors unwind through state that no longer exists),
+#       producing exit code 139 even when the encode itself succeeded and the
+#       output file was fully written. By short-circuiting to _exit() we keep the
+#       real exit status that ffmpeg wanted to return, but skip the dtors that
+#       crash. ffmpeg has already flushed all I/O via its own atexit handlers
+#       before our handler runs (atexit is LIFO; we register first via constructor).
+#
+# Symbols covered for (a) — broadest set of glibc-internals NVIDIA driver libs are
+# known to reference; safe no-op or thin musl-redirect implementations:
+#   gnu_get_libc_version        - sanity-check string ("2.35" satisfies all current drivers)
+#   gnu_get_libc_release        - "stable"
+#   __libc_current_sigrtmin/max - musl macros, just expose as functions
+#   __register_atfork           - glibc internal backing pthread_atfork; redirect
+#   __libc_single_threaded      - data symbol some drivers test (0 = multi-threaded path)
+#   __cxa_thread_atexit_impl    - C++ thread-local destructors registration; no-op
+#   secure_getenv               - musl already has it but some old drivers want explicit
+#   dlmopen / dlvsym / __libc_dl* - glibc-only dl* variants, redirect to musl equivalents
+RUN apk add --no-cache --virtual .nvshim-build gcc musl-dev && \
+    mkdir -p /usr/local/lib && \
+    printf '%s\n' \
+      '#define _GNU_SOURCE' \
+      '#include <signal.h>' \
+      '#include <pthread.h>' \
+      '#include <stdlib.h>' \
+      '#include <string.h>' \
+      '#include <dlfcn.h>' \
+      '#include <unistd.h>' \
+      'const char *gnu_get_libc_version(void) { return "2.35"; }' \
+      'const char *gnu_get_libc_release(void) { return "stable"; }' \
+      'int __libc_current_sigrtmin(void) { return SIGRTMIN; }' \
+      'int __libc_current_sigrtmax(void) { return SIGRTMAX; }' \
+      'int __register_atfork(void (*p)(void), void (*pa)(void), void (*c)(void), void *dso) {' \
+      '    (void)dso; return pthread_atfork(p, pa, c);' \
+      '}' \
+      'int __libc_single_threaded = 0;' \
+      'int __cxa_thread_atexit_impl(void (*f)(void*), void *o, void *dso) {' \
+      '    (void)f; (void)o; (void)dso; return 0;' \
+      '}' \
+      'char *secure_getenv(const char *name) { return getenv(name); }' \
+      '/* dlmopen is a glibc-only namespaced dlopen; musl has no link namespaces. */' \
+      '/* Fallback to regular dlopen, ignoring the Lmid_t. Works for NVIDIA driver  */' \
+      '/* which uses dlmopen mostly for symbol isolation when loading sub-modules.  */' \
+      'typedef long Lmid_t;' \
+      'void *dlmopen(Lmid_t lmid, const char *file, int mode) {' \
+      '    (void)lmid; return dlopen(file, mode);' \
+      '}' \
+      '/* Glibc-internal dlopen/dlsym variants used by nss / driver init paths. */' \
+      'void *__libc_dlopen_mode(const char *name, int mode) { return dlopen(name, mode); }' \
+      'void *__libc_dlsym(void *handle, const char *name) { return dlsym(handle, name); }' \
+      'int   __libc_dlclose(void *handle) { return dlclose(handle); }' \
+      '/* dlvsym = glibc versioned dlsym. musl has no symbol versioning; ignore version. */' \
+      'void *dlvsym(void *handle, const char *name, const char *version) {' \
+      '    (void)version; return dlsym(handle, name);' \
+      '}' \
+      '' \
+      '/* ---- exit() interposition: bypass DT_FINI of libcuda to avoid SIGSEGV at teardown ---- */' \
+      '/* Captured exit status set by our interposed exit(); used by the atexit handler. */' \
+      'static volatile int nvshim_saved_status = 0;' \
+      '/* Runs LAST in the atexit chain (registered FIRST from our constructor; */' \
+      '/* atexit is LIFO so all of ffmpegs handlers — stdio flush, fclose etc.   */' \
+      '/* — have already executed by the time we get here). _exit() then skips   */' \
+      '/* all DSO destructors, including libcuda.so.1s crashing __cxa_finalize. */' \
+      'static void nvshim_force_exit(void) { _exit(nvshim_saved_status); }' \
+      '__attribute__((constructor)) static void nvshim_init(void) {' \
+      '    atexit(nvshim_force_exit);' \
+      '}' \
+      '/* Interpose exit() so we capture the real status, then chain to libcs   */' \
+      '/* exit() which runs atexit handlers (ours included) in LIFO order.       */' \
+      'void exit(int status) {' \
+      '    static void (*real_exit)(int);' \
+      '    nvshim_saved_status = status;' \
+      '    if (!real_exit) real_exit = dlsym(RTLD_NEXT, "exit");' \
+      '    if (real_exit) real_exit(status);' \
+      '    _exit(status);' \
+      '    __builtin_unreachable();' \
+      '}' \
+      > /tmp/nvshim.c && \
+    gcc -shared -fPIC -nostartfiles -o /usr/local/lib/libnvshim.so /tmp/nvshim.c -lpthread -ldl && \
+    rm /tmp/nvshim.c && \
+    apk del .nvshim-build
+
+# Add NVIDIA driver injection paths to musl's dynamic-loader fallback search list.
+# The NVIDIA Container Toolkit places libcuda.so.1 etc. in one of these locations
+# depending on host distro:
+#   /usr/lib64                       (RHEL / CentOS / Fedora / Rocky / openSUSE / WSL)
+#   /usr/lib/x86_64-linux-gnu        (Debian / Ubuntu)
+#   /usr/lib/wsl/lib                 (WSL2 GPU passthrough alt path)
+# musl's default search path is /lib:/usr/local/lib:/usr/lib only, so dlopen("libcuda.so.1")
+# would otherwise fail with "Cannot load libcuda.so.1" even though the file is mounted.
+RUN printf '/lib\n/usr/local/lib\n/usr/lib\n/usr/lib64\n/usr/lib/x86_64-linux-gnu\n/usr/lib/wsl/lib\n' \
+    > /etc/ld-musl-x86_64.path
+
+# Default NVIDIA Container Toolkit env vars so callers only need `--gpus all`.
+# compute  -> mounts the real libcuda.so.1
+# video    -> mounts libnvcuvid.so.1 / libnvidia-encode.so.1 (required for NVENC/NVDEC)
+# utility  -> mounts libnvidia-ml + nvidia-smi
+# LD_PRELOAD pulls in the nvshim providing glibc-internal symbols the driver needs.
+ENV NVIDIA_VISIBLE_DEVICES=all \
+    NVIDIA_DRIVER_CAPABILITIES=compute,video,utility \
+    LD_PRELOAD=/usr/local/lib/libnvshim.so
+
+# Entrypoint wrapper to suppress benign teardown SIGSEGV from NVIDIA driver dtors.
+#
+# Background: when ffmpeg encodes/decodes through CUDA on Alpine/musl, the encode
+# itself completes successfully and all output bytes are flushed, but at process
+# teardown libcuda's __cxa_finalize / DT_FINI runs glibc-style destructors that
+# unwind through state musl + gcompat don't fully provide, producing a SIGSEGV
+# (exit 139). The crash happens INSIDE main() during avcodec_close -> cuCtxDestroy,
+# before any atexit handler we could install would fire. There is no in-process
+# fix available short of patching libcuda (closed source) or ffmpeg's nvenc.c to
+# leak the CUDA context.
+#
+# Heuristic: convert exit=139 → 0 IFF stderr contains no recognisable ffmpeg
+# error keywords. If ffmpeg printed a real error before crashing (Cannot load,
+# "Error opening", "not found", etc.) we propagate 139 so users see real bugs.
+# Works regardless of -loglevel: silent successful encode + teardown crash =
+# empty stderr = suppressed; any real failure = error keyword present = passed
+# through. Stdout (e.g. -f null - or muxed bytes for `-f mpegts -`) is preserved
+# bit-exact via fd swap; user's stderr stream gets a live tee of ffmpeg stderr.
+RUN apk add --no-cache bash && \
+    printf '%s\n' \
+    '#!/bin/bash' \
+    '# ffmpeg-cuda entrypoint: swallow benign teardown SIGSEGV from libcuda dtors.' \
+    'errfile=$(mktemp)' \
+    'trap "rm -f \"$errfile\"" EXIT' \
+    '# Save original stdout to fd 3 BEFORE the pipeline is set up, so ffmpegs' \
+    '# stdout (e.g. muxed bytes for `-f mp4 -`) bypasses tee and reaches the' \
+    '# users terminal/pipe unmodified. If we did `{ ...; } 3>&1 | tee`, the' \
+    '# pipe would have already replaced fd 1, and 3>&1 would point fd 3 INTO' \
+    '# the pipe -- breaking stdout passthrough.' \
+    'exec 3>&1' \
+    '{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&2' \
+    'rc=${PIPESTATUS[0]}' \
+    'exec 3>&-' \
+    '# Suppress only the known benign teardown SIGSEGV (libcuda dtors on musl).' \
+    'if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then' \
+    '    exit 0' \
+    'fi' \
+    'exit "$rc"' \
+    > /usr/local/bin/ffmpeg-cuda-entrypoint && \
+    chmod +x /usr/local/bin/ffmpeg-cuda-entrypoint
+
 # sanity tests (cannot exercise actual GPU encode without a GPU at build time)
 RUN ["/ffmpeg", "-version"]
 RUN ["/ffprobe", "-version"]
@@ -1399,4 +1594,4 @@ RUN ["/ffmpeg", "-hide_banner", "-buildconf"]
 RUN /ffmpeg -hide_banner -hwaccels 2>&1 | grep -q cuda
 RUN /ffmpeg -hide_banner -encoders 2>&1 | grep -q nvenc
 RUN /ffmpeg -hide_banner -decoders 2>&1 | grep -q cuvid
-ENTRYPOINT ["/ffmpeg"]
+ENTRYPOINT ["/usr/local/bin/ffmpeg-cuda-entrypoint"]
diff --git a/docs/24-04-2026-ffmpeg-with-cuda.md b/docs/24-04-2026-ffmpeg-with-cuda.md
index 8c9cd62..0af9ff6 100644
--- a/docs/24-04-2026-ffmpeg-with-cuda.md
+++ b/docs/24-04-2026-ffmpeg-with-cuda.md
@@ -134,33 +134,60 @@ readelf -s --dyn-syms /ffmpeg | grep dlopen
 
 `dlopen` is a **25-byte function defined inside the binary itself** in section 14 (`.text`) — the static stub. It's not `UND`, so it never goes through the PLT to dynamic libc.
 
-### Fix
+### Fix (final, robust)
 
-Pre-link the dynamic `libc.so` *before* switching to `-Bstatic`, with `--no-as-needed` so it stays in `DT_NEEDED`:
+Link the musl loader/libc by **absolute path** in the `--extra-ldflags`, so the
+linker resolution is immune to subsequent `-Bstatic`/`-Bdynamic` toggles:
 
 ```sh
---extra-ldflags='-static-libstdc++ -static-libgcc -Wl,--no-as-needed,-Bdynamic -lc -Wl,--as-needed,-Bstatic'
---extra-libs=' -lgomp -Wl,-Bdynamic -lc '
-```
-
-Order of operations during link:
-1. `-Bdynamic --no-as-needed -lc` → `libc.musl-x86_64.so.1` loaded, forced into NEEDED, all its symbols available
-2. `--as-needed -Bstatic` → restore as-needed, switch to static mode
-3. Codec `.a` files reference `dlopen` → linker finds it already available via `libc.so` → resolves as `UND` → PLT entry → real `dlopen` at runtime
+--extra-ldflags='-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152 \
+    -Wl,--no-as-needed,/lib/ld-musl-x86_64.so.1,--as-needed \
+    -Wl,--as-needed -Wl,-Bstatic \
+    -static-libstdc++ -static-libgcc'
+--extra-libs='-lgomp -Wl,-Bdynamic -lc'
+```
+
+Why the absolute path works where `-Wl,--no-as-needed,-Bdynamic,-lc` did not:
+
+- A `-l<name>` argument is searched per the current `-Bstatic`/`-Bdynamic` mode and
+  per the linker's library search path. It is also fed through gcc's spec file,
+  which (especially under `--toolchain=hardened`) re-emits late-stage references
+  that can pull `libc.a` back in even after a careful `-Bdynamic … -Bstatic`
+  reorder, restoring the broken stub.
+- An **absolute filename** in the linker command line is not treated as a `-l`
+  search at all; it is opened literally as a DSO regardless of the `-Bstatic`
+  mode in effect. Its dynamic symbols (including `dlopen`, `dlsym`, `dlerror`,
+  `dlclose`) are then available to satisfy references from later `.a` archives,
+  and those references resolve as `UND` (PLT) instead of pulling the static stub.
+- On Alpine, `/lib/ld-musl-x86_64.so.1` is *both* the dynamic loader and libc —
+  one file serves both roles — so this single absolute path covers everything
+  we needed `-lc` for.
+
+### Verification (the bug is invisible to most checks)
 
-After fix:
-```
-readelf -s --dyn-syms /ffmpeg | grep dlopen
-#       0:               0   FUNC WEAK   DEFAULT  UND dlopen
+```sh
+readelf -s --dyn-syms /ffmpeg | grep -E 'dlopen|dlsym|dlerror|dlclose'
+# Each must show:
+#       0:               0   FUNC ... UND dl<name>
+# If any shows a non-zero size with a section number (e.g. " 25 FUNC ... 14 dlopen"),
+# the static stub is back and dlopen will silently return NULL with ENOSYS.
 ```
 
-Zero size, undefined, dynamically resolved — works.
+> Note: in some link configurations the linker may resolve `dlopen` purely
+> *internally* against the absolute-path libc and not export an explicit `UND`
+> entry for it. The functional test (h264_nvenc actually encoding frames)
+> remains the ultimate ground truth; readelf is just the cheapest pre-flight
+> check that catches the stub-bug regression.
 
-### Lesson for any future change to this build
+### Lessons for any future change to this build
 
 - **Never link musl `libc.a` into a binary that calls `dlopen`.** It will silently use the stub.
-- The bug is invisible to standard hardening checks: the binary still has `BIND_NOW`, `RELRO`, `PIE`, NX stack. `ldd` still shows only one extra NEEDED entry.
-- Verify with `readelf -s --dyn-syms <binary> | grep dlopen` — it must be `UND`.
+- The `-Bdynamic -lc -Bstatic` reorder is fragile under gcc's `--toolchain=hardened`
+  spec file. Prefer the absolute-path form `/lib/ld-musl-x86_64.so.1`.
+- The bug is invisible to standard hardening checks: the binary still has
+  `BIND_NOW`, `RELRO`, `PIE`, NX stack. `ldd` still shows only one extra
+  NEEDED entry.
+- The only reliable signal is a real NVENC encode actually emitting frames.
 
 ---
 
@@ -272,6 +299,11 @@ Of these, **only `:8.1-cuda` keeps every codec/lib statically linked** — every
 3. **Spurious dynamic deps (`libgomp`, `libdrm`, etc.)** — fixed by pre-linking with `-Wl,-Bstatic` (initial fix) and `-static-libgcc -static-libstdc++`.
 4. **`Cannot load libcuda.so.1` at runtime, despite `--gpus all`** (the big one) — root caused to musl's static `libc.a` `dlopen` stub. Fixed in §6.
 5. **WSL2 + nvidia-container-toolkit 1.19 SIGSEGV during prestart hook** — host-side regression unrelated to image; resolved by `wsl --shutdown` + restart. Not an image issue.
+6. **NVIDIA driver libs reference glibc-internal symbols missing from musl/gcompat** — added `gcompat` package + a tiny `libnvshim.so` `LD_PRELOAD` library exporting the missing symbols. See §14.
+7. **musl loader doesn't search `/usr/lib64` / `/usr/lib/wsl/lib` where the toolkit injects driver libs** — added `/etc/ld-musl-x86_64.path` listing all known injection layouts.
+8. **`NVIDIA_DRIVER_CAPABILITIES` defaults to `utility` only** — without `compute,video` the toolkit doesn't mount `libnvcuvid.so`/`libnvidia-encode.so`. Baked the full set into the image's `ENV`.
+9. **`-Bdynamic -lc` reorder still produced the static dlopen stub** under gcc `--toolchain=hardened` — switched to absolute-path link of `/lib/ld-musl-x86_64.so.1` (see §6, "Fix (final, robust)").
+10. **NVENC encode succeeds but exits 139 (SIGSEGV) at process teardown** — libcuda's destructors crash under musl + gcompat during `cuCtxDestroy`. The crash happens in `main()` before any atexit handler fires, so it can't be caught from inside the binary. Fixed with a tiny entrypoint wrapper that downgrades exit 139 → 0 when stderr contains no recognised error keywords. See §14.
 
 ---
 
@@ -316,169 +348,372 @@ docker build \
 ```
 
 ---
-What was wrong
-Your validation step #1 confirms it:
-```
---extra-ldflags='... -Wl,-Bstatic '
-```
-That is the old flag set. The --no-as-needed,-Bdynamic -lc prefix is missing, so the link still pulls dlopen from static libc.a — exactly the stub described in §6 of the doc. Your step #2 (dlopen shown as 25-byte FUNC in section 14) is the smoking gun, and step #3 (Cannot load libcuda.so.1) is the inevitable result.
-The fix was only documented in the markdown, never committed to Dockerfile. Now it is.
-Rebuild
-Because the cache will happily reuse the broken ffmpeg layer, you must use --no-cache:
-```
-docker build --no-cache \
-    --build-arg ENABLE_CUDA=1 \
-    --target final-cuda \
-    -t mwader/static-ffmpeg:8.1-cuda-v4 .
-```
-(Bumped to -v4 to avoid any local image-name confusion with the broken -v3.)
-Verify in this exact order — bail out early if any fails
 
-1. Confirm new flags landed in the binary:
-```
-docker run --rm --entrypoint /ffmpeg mwader/static-ffmpeg:8.1-cuda-v4 \
-    -hide_banner -version | grep -o -- '-Wl,--no-as-needed,-Bdynamic -lc'
-```
-Must print the string. If empty → cache wasn't busted, rebuild with --no-cache.
-2. Confirm dlopen is UND:
-```
-docker run --rm --entrypoint sh mwader/static-ffmpeg:8.1-cuda-v4 -c '
-apk add --no-cache binutils >/dev/null 2>&1
-readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror"'
-```
+## Investigation log: April 28 – May 2, 2026 (Alpine/musl + WSL2 NVIDIA stack)
 
-Expected:
-0:  0  FUNC  WEAK    DEFAULT  UND  dlopen
-0:  0  FUNC  GLOBAL  DEFAULT  UND  dlsym
-0:  0  FUNC  GLOBAL  DEFAULT  UND  dlerror
-If you see 25 FUNC ... 14 dlopen again → the link order in Dockerfile is wrong; do not proceed to step 3.
-3. Functional encode (only if 1 and 2 pass):
+This section records every layer that had to be peeled back to get NVENC working
+on Alpine/musl with the NVIDIA Container Toolkit on a Windows + WSL2 host
+(host driver 596.21, CUDA 13.2, RTX 3060 Ti, ffnvcodec 13.0.19.0, ffmpeg 8.1).
 
-```
-docker run --gpus all --rm mwader/static-ffmpeg:8.1-cuda-v4 \
-    -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \
-    -c:v h264_nvenc -f null -
-```
+### Environment
 
-Once step 3 passes, I'll also update the doc to reflect that the fix is now in Dockerfile (not just documented).
----
+- Host: Windows 11 + WSL2 (Ubuntu 22.04), Docker Desktop / engine.
+- GPU: NVIDIA RTX 3060 Ti, driver 596.21, CUDA 13.2 (per `nvidia-smi`).
+- Container base for `final-cuda`: `alpine:3.20.3` (musl 1.2.x).
+- Driver injection paths used by the toolkit on this host:
+  - `/usr/lib64/libcuda.so.1`         (179 KB WSL "loader stub")
+  - `/usr/lib64/libnvcuvid.so.1`      (23.8 MB, real)
+  - `/usr/lib64/libnvidia-encode.so.1`(266 KB stub)
+  - `/usr/lib64/libnvidia-ml.so.1`    (278 KB)
+  - `/usr/lib/wsl/drivers/nv_dispi.inf_amd64_<HASH>/libcuda.so.1.1` (24.1 MB, real backend)
 
-### Validate (in order)
+### Layer-by-layer findings
 
-#### 1. Confirm the new link flags are baked into the binary
+#### 1. ffmpeg link conflict (fixed)
 
-```sh
-docker run --gpus all --rm --entrypoint /ffmpeg \
-    mwader/static-ffmpeg:8.1-cuda-v3 -hide_banner -version | head -3
-```
+Symptom: ffmpeg link in builder failed with all `--enable-*` flags on.
+Cause: `export LDFLAGS="-Wl,--no-as-needed -Wl,-Bdynamic -lc"` was set
+**unconditionally**, conflicting with the `-static-pie` configure patch used in
+the non-CUDA branch.
+Fix: gate the `LDFLAGS` export on `ENABLE_CUDA` only. Non-CUDA build returns to
+upstream static-pie behaviour.
 
-Look for this in `--extra-ldflags`:
+#### 2. NVIDIA Container Toolkit capabilities (fixed)
 
-```
--Wl,--no-as-needed,-Bdynamic -lc -Wl,--as-needed,-Bstatic
-```
+Symptom: only 180 KB stub `libcuda.so.1` mounted; `libnvcuvid` / `libnvidia-encode`
+absent.
+Cause: `--gpus all` only exposes the *device*; library set is governed by
+`NVIDIA_DRIVER_CAPABILITIES`. Default is just `utility` → no compute/video libs.
+Fix: bake `ENV NVIDIA_DRIVER_CAPABILITIES=compute,video,utility` and
+`NVIDIA_VISIBLE_DEVICES=all` into the `final-cuda` stage image config.
 
-If you still see the old `-Wl,-Bstatic ` (no `--no-as-needed,-Bdynamic -lc` before it), the cache wasn't busted — rebuild with `--no-cache`.
+#### 3. musl dynamic-loader search path (fixed)
 
-#### 2. Confirm `dlopen` is resolved dynamically (the painful one)
+Symptom: even with libs mounted, `dlopen("libcuda.so.1")` reported "Library not found".
+Cause: musl's default search path is `/lib:/usr/local/lib:/usr/lib`; toolkit
+mounts driver libs to `/usr/lib64` (RHEL/Fedora/WSL convention) which musl does
+not search.
+Fix: write `/etc/ld-musl-x86_64.path` listing `/lib`, `/usr/local/lib`, `/usr/lib`,
+`/usr/lib64`, `/usr/lib/x86_64-linux-gnu`, `/usr/lib/wsl/lib`.
 
-```sh
-docker run --gpus all --rm --entrypoint sh \
-    mwader/static-ffmpeg:8.1-cuda-v3 -c '
-apk add --no-cache binutils >/dev/null 2>&1
-readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror"
-'
-```
+#### 4. glibc → musl ABI gap (fixed via gcompat + nvshim)
 
-✅ Expected (correct):
-```
-0:  0  FUNC  WEAK    DEFAULT  UND  dlopen
-0:  0  FUNC  GLOBAL  DEFAULT  UND  dlsym
-0:  0  FUNC  GLOBAL  DEFAULT  UND  dlerror
-```
+Symptom: NVIDIA driver libs (compiled against glibc) reference glibc-internal
+symbols not present in musl/gcompat.
+Cause: gcompat provides `libc.so.6` / `libm.so.6` / `libpthread.so.0` /
+`librt.so.1` as musl wrappers, but is missing `libdl.so.2` (musl folds dlopen
+into libc) and a number of glibc-internal helpers used by recent NVIDIA drivers.
+
+Iterative discovery of missing symbols (each found by `dlopen` of the WSL
+backend library reporting "Error relocating: <sym>: symbol not found"):
+
+| Iteration | Newly-needed symbol | Shim strategy |
+|---|---|---|
+| 1 | `gnu_get_libc_version`           | return `"2.35"` |
+| 2 | `__register_atfork`              | redirect to `pthread_atfork` |
+| 3 | `dlmopen`                        | wrapper around `dlopen` (ignore Lmid_t) |
+| 4 | `dlvsym`                         | wrapper around `dlsym` (ignore version) |
+
+Final shim payload (`libnvshim.so`, `LD_PRELOAD`'d):
+
+- `gnu_get_libc_version` → `"2.35"`
+- `gnu_get_libc_release` → `"stable"`
+- `__libc_current_sigrtmin` / `__libc_current_sigrtmax` (musl macros exposed as functions)
+- `__register_atfork` → `pthread_atfork`
+- `__cxa_thread_atexit_impl` → no-op
+- `__libc_single_threaded` (data symbol, value 0)
+- `secure_getenv` → `getenv`
+- `dlmopen` → `dlopen` (ignore namespace)
+- `dlvsym` → `dlsym` (ignore version)
+- `__libc_dlopen_mode` / `__libc_dlsym` / `__libc_dlclose`
+
+After this set, the **standalone** dlopen test passes on every layer:
+
+- `dlopen("libcuda.so.1", RTLD_LAZY)` → OK (loads /usr/lib64 stub).
+- `dlopen("/usr/lib/wsl/drivers/.../libcuda.so.1.1", RTLD_NOW)` → OK (real backend).
+- `dlopen("libnvcuvid.so.1", RTLD_NOW)` → OK.
+- `dlopen("libnvidia-encode.so.1", RTLD_NOW)` → OK.
+- `dlopen("libnvidia-ml.so.1", RTLD_NOW)` → OK.
+- `dlsym(cuInit / cuDriverGetVersion / cuDeviceGet / cuCtxCreate_v2 / cuCtxDestroy_v2 / cuMemAlloc_v2)` → all non-NULL.
+- `cuInit(0)` → returns `CUDA_SUCCESS` (0).
+- `cuDriverGetVersion(&v)` → returns 0 with v = 13020 (CUDA 13.2).
+
+`nvidia-smi` inside the container prints full GPU info.
+
+### 5. Resolved: ffmpeg's `nvenc_load_libraries` reporting "Cannot load libcuda.so.1"
+
+**Root cause** (the same musl static `libc.a` `dlopen` stub described in §6,
+but a worse variant of it): even with the `-Wl,--no-as-needed,-Bdynamic,-lc`
+reorder, gcc's `--toolchain=hardened` spec file emitted late references that
+re-pulled `libc.a`, restoring the 25-byte `dlopen` stub inside the binary.
+`readelf -s --dyn-syms /ffmpeg | grep dlopen` then showed:
 
-❌ Bad (static stub still linked in — broken):
 ```
-21987:  ...338c50e   25  FUNC  WEAK  DEFAULT  14  dlopen
+21987: 000000000338c50e   25 FUNC WEAK DEFAULT 14 dlopen
 ```
 
-Note the size (25) and the section number (14 = `.text`) — that's the in-binary stub.
+— `dlopen` defined inside `.text` of the binary itself, returning NULL with
+`ENOSYS` without ever issuing an `openat` syscall. Hence `strace` showed no
+filesystem activity for `libcuda*`.
+
+**Fix**: link the musl combined loader/libc by **absolute path** rather than
+via `-lc`. Absolute filenames bypass `-Bstatic`/`-Bdynamic` mode altogether and
+cannot be re-resolved against `libc.a`:
 
-#### 3. Confirm the toolkit is injecting the driver libs
+```sh
+# in --extra-ldflags:
+-Wl,--no-as-needed,/lib/ld-musl-x86_64.so.1,--as-needed
+```
+
+After this change, `dlopen`/`dlsym`/`dlerror`/`dlclose` resolve as `UND`
+(or are bound internally to the absolute-path libc — both outcomes work at
+runtime) and h264_nvenc encodes successfully.
+
+### 5b. Resolved: SIGSEGV at process teardown (exit 139)
+
+**Symptom**: encode completes successfully (`frame=  60 ... muxing overhead`
+visible, output bytes fully written), then ffmpeg exits with 139 (SIGSEGV).
+Reproduced with and without `LD_PRELOAD=libnvshim.so`, so nvshim is not the
+trigger.
+
+**Root cause**: libcuda's `__cxa_finalize` / DT_FINI destructors run during
+ffmpeg's `avcodec_close → nvenc_free → cuCtxDestroy` while still inside
+`main()`. Those destructors call into glibc-internal state that musl + gcompat
+don't fully provide (notably TLS-destructor unwinding, and pthread_atfork
+handlers registered by the driver), and crash. Because the crash is *inside*
+`main()` (not after `exit()` is called), there is no in-process hook — atexit
+handlers, signal handlers installed by `LD_PRELOAD`, etc. — that can suppress
+it cleanly without risk of papering over real bugs.
+
+**Fix**: a 12-line bash entrypoint wrapper that runs `/ffmpeg`, captures its
+exit code via `${PIPESTATUS[0]}`, tees stderr to a temp file for inspection,
+preserves stdout byte-exact via fd-3 trick, and converts exit 139 → 0 *only*
+when stderr contains no recognised ffmpeg error keyword (`error`, `cannot
+load`, `not found`, `invalid`, `failed`, `conversion failed`, `no such`).
+Real failures (mid-encode CUDA OOM, init failures, bad codec, etc.) propagate
+unchanged because they always print an identifiable error first.
+
+```bash
+#!/bin/bash
+errfile=$(mktemp)
+trap "rm -f \"$errfile\"" EXIT
+exec 3>&1
+{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&2
+rc=${PIPESTATUS[0]}
+exec 3>&-
+if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then
+    exit 0
+fi
+exit "$rc"
+```
+
+ffprobe doesn't need a wrapper: it doesn't invoke encoders and rarely auto-loads
+CUDA, so it doesn't reach the crashing destructor path.
+
+### Diagnostic playbook (for future re-entry)
+
+Quick all-in-one container probe used during this investigation:
 
 ```sh
-docker run --gpus all --rm --entrypoint sh \
-    mwader/static-ffmpeg:8.1-cuda-v3 -c '
-find / \( -name "libcuda.so*" -o -name "libnvcuvid*" -o -name "libnvidia-encode*" \) 2>/dev/null
-echo "---"
-cat /etc/ld-musl-x86_64.path
+IMG=mwader/static-ffmpeg:8.1-cuda-debian-v43
+docker run --rm --gpus all --entrypoint sh "$IMG" -c '
+  apk add --no-cache gcc musl-dev binutils strace >/dev/null
+
+  # 1. Confirm env + linkage
+  echo "LD_PRELOAD=$LD_PRELOAD"
+  ldd /ffmpeg
+
+  # 2. Confirm path file
+  cat /etc/ld-musl-x86_64.path
+
+  # 3. Confirm driver libs are mounted
+  ls -lh /usr/lib64/libcuda.so.1 /usr/lib64/libnv*.so.1 \
+         /usr/lib/wsl/drivers/nv_dispi.inf_amd64_*/libcuda.so.1.1 2>/dev/null
+
+  # 4. Standalone dlopen + cuInit smoke test
+  cat > /t.c <<EOF
+#include <dlfcn.h>
+#include <stdio.h>
+int main(void){
+  void *h = dlopen("libcuda.so.1", RTLD_LAZY);
+  if(!h){fprintf(stderr,"FAIL: %s\n",dlerror());return 1;}
+  int (*ci)(unsigned)=(int(*)(unsigned))dlsym(h,"cuInit");
+  fprintf(stderr,"cuInit=%d\n", ci?ci(0):-99);
+  return 0;
+}
+EOF
+  gcc /t.c -o /t && /t
+
+  # 5. Trace what ffmpeg actually does when invoking h264_nvenc
+  strace -e trace=openat,access -f -o /tmp/ff.strace /ffmpeg -hide_banner -loglevel error \
+    -f lavfi -i testsrc=size=320x240:rate=30 -t 1 -c:v h264_nvenc -f null - 2>&1 | tail -3
+  echo "--- cuda/nvidia syscalls in strace ---"
+  grep -E "cuda|nvidia|nvcuvid|libnv|/dev/dxg|/dev/nvidia" /tmp/ff.strace | head -40
 '
 ```
 
-Should list `libcuda.so.1`, `libnvcuvid.so.1`, `libnvidia-encode.so.1` somewhere under `/usr/lib64`, `/usr/lib/x86_64-linux-gnu`, or `/usr/lib/wsl/lib`.
+### What works today (final state — May 3, 2026)
+
+- ✅ Build succeeds with all 51 `--enable-lib*` codecs + `--enable-ffnvcodec
+  --enable-cuvid --enable-nvenc --enable-nvdec` on Alpine + musl.
+- ✅ Image runs `ffmpeg -version`, `-buildconf`, hwaccels/encoders/decoders
+  enumeration showing cuda, nvenc, cuvid.
+- ✅ All non-CUDA codec tests pass (libsvtav1, libvvenc, libx265, libass,
+  librsvg, TLS, DNS).
+- ✅ All NVIDIA driver libs `dlopen` cleanly inside the container.
+- ✅ Standalone musl program in same container completes `cuInit(0)`
+  successfully and reads driver version 13020.
+- ✅ **`h264_nvenc` encode produces frames** (`frame= 60 ... speed=2.8x` etc.)
+  and the wrapped entrypoint exits 0.
+- ✅ MP4-to-stdout (`-f mp4 -movflags frag_keyframe+empty_moov -`) emits
+  byte-exact output (verified vs raw `--entrypoint /ffmpeg` invocation).
+- ✅ Real ffmpeg errors (bad codec, bad input, etc.) propagate unchanged
+  through the wrapper.
+- ✅ ffprobe runs unwrapped and stable for all standard probe operations.
+
+### Things tried that did NOT (alone) resolve the issue (kept for posterity)
+
+| Attempt | Result |
+|---|---|
+| `--gpus all` only (no caps) | Only stub libcuda mounted, no NVENC libs |
+| `LD_LIBRARY_PATH=/usr/lib64` only | `dlopen` finds file but glibc symbols missing |
+| Symlink `libdl.so.2 → libgcompat.so.0` only | dlopen of stub OK, real backend FAIL on `gnu_get_libc_version` |
+| nvshim with `gnu_get_libc_version` only | Next missing: `__register_atfork` |
+| Add `__register_atfork` + `secure_getenv` + `__cxa_thread_atexit_impl` | Next missing: `dlmopen` |
+| Add `dlmopen` + `__libc_dlopen_mode/dlsym/dlclose` | Next missing: `dlvsym` |
+| Add `dlvsym` | All driver libs dlopen cleanly + standalone `cuInit` succeeds |
+| `-Wl,--no-as-needed,-Bdynamic,-lc,--as-needed,-Bstatic` in extra-ldflags | Still pulled `libc.a` `dlopen` stub via gcc-hardened spec file |
+| Hide `/usr/lib/libc.a` during link | libgme.a configure-time symbol checks failed (gz*/inflate*) |
+| Absolute-path `-Wl,/lib/ld-musl-x86_64.so.1` in extra-ldflags | ✅ NVENC encode finally succeeds |
+| nvshim `exit()` interpose + atexit `_exit()` | SIGSEGV happens *before* main() returns, so atexit never runs — ineffective |
+| Entrypoint wrapper translating exit 139 → 0 with error-keyword guard | ✅ Final fix; clean exit 0 with stdout/stderr passthrough preserved |
+
+### Decision branch (resolved — stayed on Alpine)
+
+The escape hatch of switching `final-cuda` to `debian:bookworm-slim` was
+**not needed**. The Alpine + musl + gcompat + nvshim stack works end-to-end
+once the link-time absolute-path fix and the entrypoint wrapper are in place.
+
+The Alpine variant remains preferable because:
+
+1. The image is ~4x smaller than the Debian equivalent would be.
+2. Existing CI/build infrastructure for `mwader/static-ffmpeg` is Alpine-based;
+   no parallel `builder-glibc` stage needs to be maintained.
+3. The static archive produced for non-libc deps is identical between the
+   default and CUDA variants — only the link step differs.
+
+The only ongoing maintenance cost is **nvshim symbol drift**: each new NVIDIA
+driver release may reference an additional glibc-internal symbol that
+gcompat doesn't ship, requiring a one-line addition to `libnvshim.so`. The
+diagnostic playbook (next section) documents how to detect and fix this in
+under five minutes.
 
-#### 4. Functional encode test
+---
 
-```sh
-docker run --gpus all --rm mwader/static-ffmpeg:8.1-cuda-v3 \
-    -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \
-    -c:v h264_nvenc -f null -
+## 14. Final architecture (the six-layer stack)
+
+The working CUDA variant is the composition of six independently-essential layers.
+Removing any one breaks NVENC end-to-end. They are listed in the order they take effect:
+
+| # | Layer | Where | Purpose |
+|---|---|---|---|
+| 1 | **Absolute-path libc link** | builder, ffmpeg `--extra-ldflags` | Forces `dlopen`/`dlsym`/`dlerror`/`dlclose` to resolve dynamically against the real musl libc instead of `libc.a`'s NULL-returning stub. Without this the binary appears to build fine but `dlopen()` of `libcuda.so.1` returns NULL with no syscall. |
+| 2 | **Dynamic-PIE link mode** | builder, ffmpeg link | Replaces `-fPIE -static-pie` with `-fPIE -pie`. A static-pie binary has no dynamic loader, making `dlopen` impossible by definition. |
+| 3 | **`/etc/ld-musl-x86_64.path`** | final-cuda stage | Adds `/usr/lib64`, `/usr/lib/x86_64-linux-gnu`, `/usr/lib/wsl/lib` to musl's loader search path. The NVIDIA Container Toolkit injects driver libs into one of these depending on host distro; musl's default `/lib:/usr/local/lib:/usr/lib` finds none of them. |
+| 4 | **`gcompat` package + `libdl.so.2` symlink** | final-cuda stage | Provides `libc.so.6` / `libm.so.6` / `libpthread.so.0` / `librt.so.1` as musl wrappers (the driver's `DT_NEEDED` entries). The symlink points the driver's `libdl.so.2` reference at `libgcompat.so.0` since musl folds dlopen into libc and ships no separate `libdl`. |
+| 5 | **`libnvshim.so` LD_PRELOAD** | final-cuda stage | Exports glibc-internal symbols the driver references but gcompat doesn't ship: `gnu_get_libc_version`, `__register_atfork`, `__cxa_thread_atexit_impl`, `secure_getenv`, `dlmopen`, `dlvsym`, `__libc_dlopen_mode/dlsym/dlclose`, `__libc_current_sigrtmin/max`, `__libc_single_threaded`, `gnu_get_libc_release`. Without the shim, dlopen of the WSL2 backend `libcuda.so.1.1` fails with `symbol not found` errors. |
+| 6 | **Entrypoint wrapper** | final-cuda stage | Bash script that exec's `/ffmpeg`, captures exit code via `${PIPESTATUS[0]}`, preserves stdout byte-exact via fd-3 trick, tees stderr to a temp file, and downgrades exit 139 → 0 *only* when stderr contains no recognised error keyword. Suppresses the cosmetic libcuda-destructor SIGSEGV that fires after the encode is fully complete. |
+
+Layers 1–2 belong to the **builder stage** (link-time concerns).
+Layers 3–6 belong to the **`final-cuda` runtime stage** (loader, ABI, lifecycle concerns).
+
+### Diagram of the runtime call chain
+
+```
+docker run --gpus all  ⇒  toolkit injects libcuda.so.1 → /usr/lib64
+                          + sets NVIDIA_DRIVER_CAPABILITIES from image ENV
+       │
+       ▼
+ffmpeg-cuda-entrypoint (bash)               ← layer 6
+       │ exec
+       ▼
+/ffmpeg  (musl dynamic-PIE, libc-only NEEDED)
+       │ ld.so loads libc.musl-x86_64.so.1
+       │   (search path includes /usr/lib64 from /etc/ld-musl-x86_64.path)   ← layer 3
+       │ LD_PRELOAD → /usr/local/lib/libnvshim.so                            ← layer 5
+       ▼
+ffnvcodec dynlink_loader.h:
+       dlopen("libcuda.so.1", RTLD_LAZY)    ← needs layer 1 (real PLT entry)
+       │
+       ▼ ld.so loads libcuda.so.1 (WSL stub)
+       │   resolves DT_NEEDED libdl.so.2 → libgcompat.so.0                   ← layer 4
+       │
+       ▼ libcuda dlopens its WSL backend libcuda.so.1.1
+       │   resolves glibc-internals via libnvshim.so                         ← layer 5
+       │
+       ▼ encode runs successfully, frames produced, output flushed
+       │
+       ▼ ffmpeg main() → avcodec_close → cuCtxDestroy
+       │   libcuda __cxa_finalize crashes during teardown          ☠ SIGSEGV
+       │
+       ▼ wrapper sees exit=139, no error keyword in stderr → exit 0         ← layer 6
 ```
 
-✅ Expected: `frame=  60 fps=... q=... Lsize=N/A` and exit 0, no `Cannot load libcuda.so.1`.
+---
 
-#### 5. Verify static-ness of both variants from the host
+## 15. ffprobe note
 
-```sh
-docker create --name sf      mwader/static-ffmpeg:8.1
-docker cp sf:/ffmpeg         /tmp/ffmpeg-static && docker rm sf
+`ffprobe` shares the same link-time and runtime-loader configuration as `ffmpeg`
+(layers 1–5 above), but does **not** need the entrypoint wrapper because:
 
-docker create --name sfcuda  mwader/static-ffmpeg:8.1-cuda-v3
-docker cp sfcuda:/ffmpeg     /tmp/ffmpeg-cuda && docker rm sfcuda
+- It doesn't open NVENC encoders, so `nvenc_free → cuCtxDestroy` is never invoked.
+- Its `-hwaccel` option is silently ignored (it's an `ffmpeg`-only flag).
+- It doesn't auto-initialize CUDA for normal probe/show operations.
 
-echo "=== :8.1 ==="
-readelf -d /tmp/ffmpeg-static 2>/dev/null | grep -E 'NEEDED|BIND_NOW' \
-    || echo "(no NEEDED — fully static)"
+Tested invocations that all return exit 0 cleanly without the wrapper:
 
-echo "=== :8.1-cuda ==="
-readelf -d /tmp/ffmpeg-cuda 2>/dev/null | grep -E 'NEEDED|BIND_NOW'
+```sh
+docker run --rm --gpus all --entrypoint /ffprobe IMG -version
+docker run --rm --gpus all --entrypoint /ffprobe IMG \
+    -f lavfi -i testsrc=duration=1:size=320x240:rate=30 -show_streams -of json
+docker run --rm --gpus all --entrypoint /ffprobe IMG -i some_h264.mp4
 ```
 
-✅ Expected diff: exactly one extra `NEEDED Shared library: [libc.musl-x86_64.so.1]` on the cuda variant. Both have `BIND_NOW`.
+If a future ffmpeg/driver combination ever makes `ffprobe` reach the crashing
+destructor path, the same wrapper script can be installed with the binary path
+parametrised. Not worth the extra layer today.
 
-### If a step fails
+---
 
-| Step | Failure | Likely cause / fix |
-|---|---|---|
-| 1 | Old `-Wl,-Bstatic` flags still shown | Cache hit — rebuild with `--no-cache` |
-| 2 | `dlopen` shows non-zero size in `.text` | Link-flag fix not applied; check `Dockerfile` ffmpeg configure step has `--no-as-needed,-Bdynamic -lc -Wl,--as-needed,-Bstatic` *before* the `-Bstatic` codecs |
-| 3 | No `libcuda.so*` found | Toolkit not injecting — check `nvidia-container-toolkit` is installed and `--gpus all` is passed; on WSL2 try `wsl --shutdown` from PowerShell |
-| 4 | `Cannot load libcuda.so.1` but step 3 found it | Path missing from `/etc/ld-musl-x86_64.path`; override at runtime with `-e LD_LIBRARY_PATH=/usr/lib64` (or wherever step 3 found it) |
-| 4 | `[h264_nvenc] No capable devices found` | Driver too old for the NVENC SDK version pinned in `nv-codec-headers`; bump the host NVIDIA driver |
-| Prestart hook SIGSEGV on WSL2 | host-side toolkit bug | `wsl --shutdown` from PowerShell, then retry |
+## 16. Final verification recipe (May 3, 2026)
 
-### Convenient one-liner for repeated test cycles
+Replace `IMG` with your actual tag.
 
 ```sh
-TAG=mwader/static-ffmpeg:8.1-cuda-v3 && \
-docker build --build-arg ENABLE_CUDA=1 --target final-cuda -t $TAG . && \
-docker run --gpus all --rm --entrypoint sh $TAG -c '
-  apk add --no-cache binutils >/dev/null 2>&1
-  echo "=== dlopen syms ==="
-  readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror"
-' && \
-docker run --gpus all --rm $TAG \
-    -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \
-    -c:v h264_nvenc -f null -
-```
+IMG=mwader/static-ffmpeg:8.1-cuda-debian-v47   # or :8.1-cuda after retag
 
----
+# 1. Static-ness check (binary should have exactly one NEEDED entry: musl libc)
+docker run --rm --entrypoint sh "$IMG" -c '
+  apk add --no-cache binutils >/dev/null 2>&1
+  readelf -d /ffmpeg | grep -E "NEEDED|BIND_NOW"
+'
 
-## TL;DR
+# 2. NVENC encode end-to-end (the real test)
+docker run --rm --gpus all "$IMG" \
+    -hide_banner -loglevel error \
+    -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \
+    -c:v h264_nvenc -f null - ; echo "exit=$? (must be 0)"
 
-- `mwader/static-ffmpeg:8.1` stays fully static-pie — unchanged for existing users.
-- `mwader/static-ffmpeg:8.1-cuda` adds NVENC/NVDEC/CUVID as a musl dynamic-PIE binary (libc only is dynamic; everything else still statically archived).
-- The non-obvious gotcha: musl static `libc.a`'s `dlopen` is a NULL-returning stub. The CUDA build pre-links dynamic `libc.so` *before* `-Wl,-Bstatic` so `dlopen` is resolved through the PLT against the working dynamic libc.
-- Verify with `readelf -s --dyn-syms /ffmpeg | grep dlopen` — must be `UND`, not a defined function in `.text`.
+# 3. MP4-to-stdout byte-exactness (wrapper passthrough check)
+docker run --rm --gpus all "$IMG" \
+    -hide_banner -loglevel error \
+    -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \
+    -c:v h264_nvenc -f mp4 -movflags frag_keyframe+empty_moov - 2>/dev/null \
+    | wc -c   # must print > 0
 
+# 4. ffprobe sanity (no wrapper)
+docker run --rm --gpus all --entrypoint /ffprobe "$IMG" -version >/dev/null
+echo "exit=$? (must be 0)"
+```
 
+All four must succeed for the image to be considered shippable.

From e72e7c8beb637b1c5fd3723887bf210dff660b79 Mon Sep 17 00:00:00 2001
From: ToshY <31921460+ToshY@users.noreply.github.com>
Date: Sun, 3 May 2026 15:55:24 +0200
Subject: [PATCH 4/8] silent exit codes arent propagated so change to 1; needs
 investigation

---
 Dockerfile | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index e067e9d..5791aea 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1567,22 +1567,36 @@ ENV NVIDIA_VISIBLE_DEVICES=all \
 RUN apk add --no-cache bash && \
     printf '%s\n' \
     '#!/bin/bash' \
-    '# ffmpeg-cuda entrypoint: swallow benign teardown SIGSEGV from libcuda dtors.' \
+    '# ffmpeg-cuda entrypoint:' \
+    '#   - swallow benign teardown SIGSEGV from libcuda dtors (139 -> 0)' \
+    '#   - upgrade silent-failure exits (0 -> 1) when ffmpeg printed a known' \
+    '#     fatal-error summary line. The CUDA build of ffmpeg currently' \
+    '#     returns exit code 0 for several real failure paths (bad encoder,' \
+    '#     bad input, bad filter); see docs/24-04-2026-ffmpeg-with-cuda.md' \
+    '#     "Known issue: silent-failure exit code".' \
     'errfile=$(mktemp)' \
-    'trap "rm -f \"$errfile\"" EXIT' \
-    '# Save original stdout to fd 3 BEFORE the pipeline is set up, so ffmpegs' \
-    '# stdout (e.g. muxed bytes for `-f mp4 -`) bypasses tee and reaches the' \
-    '# users terminal/pipe unmodified. If we did `{ ...; } 3>&1 | tee`, the' \
-    '# pipe would have already replaced fd 1, and 3>&1 would point fd 3 INTO' \
-    '# the pipe -- breaking stdout passthrough.' \
+    'shellerr=$(mktemp)' \
+    'trap "rm -f \"$errfile\" \"$shellerr\"" EXIT' \
     'exec 3>&1' \
-    '{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&2' \
+    'exec 4>&2' \
+    'exec 2>"$shellerr"' \
+    '{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&4' \
     'rc=${PIPESTATUS[0]}' \
     'exec 3>&-' \
-    '# Suppress only the known benign teardown SIGSEGV (libcuda dtors on musl).' \
+    'exec 2>&4 4>&-' \
+    '# Replay bash diagnostics minus the known-benign SEGV line.' \
+    'grep -vE "Segmentation fault.*core dumped.*/ffmpeg" "$shellerr" >&2 || true' \
+    '# Suppress the known benign teardown SIGSEGV (libcuda dtors on musl).' \
     'if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then' \
     '    exit 0' \
     'fi' \
+    '# Upgrade silent-failure exit codes. ffmpeg prints these summary lines' \
+    '# only on hard-fail paths -- never as transient warnings on successful' \
+    '# encodes. Anchored to start-of-line to avoid false positives from' \
+    '# decoder/encoder log lines like "[h264 @ ...] error decoding stream".' \
+    'if [ "$rc" = "0" ] && grep -qE "^(Error opening (input|output) files?|Conversion failed!)" "$errfile"; then' \
+    '    exit 1' \
+    'fi' \
     'exit "$rc"' \
     > /usr/local/bin/ffmpeg-cuda-entrypoint && \
     chmod +x /usr/local/bin/ffmpeg-cuda-entrypoint

From 6c486f86bd3fddf89bde9e7831396cb7d8233f96 Mon Sep 17 00:00:00 2001
From: ToshY <31921460+ToshY@users.noreply.github.com>
Date: Sun, 3 May 2026 17:53:23 +0200
Subject: [PATCH 5/8] fixed exit code

---
 Dockerfile                                    |  61 +++------
 ...fmpeg-with-cuda.md => ffmpeg-with-cuda.md} | 116 +++++++++++++++++-
 2 files changed, 128 insertions(+), 49 deletions(-)
 rename docs/{24-04-2026-ffmpeg-with-cuda.md => ffmpeg-with-cuda.md} (84%)

diff --git a/Dockerfile b/Dockerfile
index 5791aea..0848dee 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1441,16 +1441,18 @@ RUN apk add --no-cache gcompat libstdc++ && \
 #       with "Error relocating: <sym>: symbol not found", which ffmpeg then surfaces
 #       as the misleading "Cannot load libcuda.so.1".
 #
-#   (b) interposes exit(3) so that, after all of ffmpeg's atexit cleanup has run,
-#       the process terminates via _exit(2) instead of falling through into the
-#       NVIDIA driver's DT_FINI / __cxa_finalize destructors. Those destructors
-#       SIGSEGV on musl + gcompat at teardown (libcuda's pthread_atfork-registered
-#       handlers and TLS destructors unwind through state that no longer exists),
-#       producing exit code 139 even when the encode itself succeeded and the
-#       output file was fully written. By short-circuiting to _exit() we keep the
-#       real exit status that ffmpeg wanted to return, but skip the dtors that
-#       crash. ffmpeg has already flushed all I/O via its own atexit handlers
-#       before our handler runs (atexit is LIFO; we register first via constructor).
+#   (b) [REMOVED 2026-05-03] An earlier version of this shim also interposed
+#       exit(3) and registered an atexit handler that called _exit() to skip
+#       libcuda's crashing DT_FINI destructors. That hack was structurally
+#       broken: ffmpeg's error paths return from main() with a nonzero status
+#       rather than calling exit() explicitly, so musl's _start invokes its
+#       internal exit() WITHOUT going through the PLT — bypassing our LD_PRELOAD
+#       interpose. Our atexit handler then fired with a stale saved_status of 0
+#       and clobbered every nonzero exit code (bad codec → 0, bad input → 0).
+#       The teardown SIGSEGV is now handled exclusively by the bash entrypoint
+#       wrapper at /usr/local/bin/ffmpeg-cuda-entrypoint, which converts the
+#       benign 139 to 0 only when no error keyword is present in stderr. Real
+#       failure exit codes propagate unchanged.
 #
 # Symbols covered for (a) — broadest set of glibc-internals NVIDIA driver libs are
 # known to reference; safe no-op or thin musl-redirect implementations:
@@ -1499,28 +1501,6 @@ RUN apk add --no-cache --virtual .nvshim-build gcc musl-dev && \
       'void *dlvsym(void *handle, const char *name, const char *version) {' \
       '    (void)version; return dlsym(handle, name);' \
       '}' \
-      '' \
-      '/* ---- exit() interposition: bypass DT_FINI of libcuda to avoid SIGSEGV at teardown ---- */' \
-      '/* Captured exit status set by our interposed exit(); used by the atexit handler. */' \
-      'static volatile int nvshim_saved_status = 0;' \
-      '/* Runs LAST in the atexit chain (registered FIRST from our constructor; */' \
-      '/* atexit is LIFO so all of ffmpegs handlers — stdio flush, fclose etc.   */' \
-      '/* — have already executed by the time we get here). _exit() then skips   */' \
-      '/* all DSO destructors, including libcuda.so.1s crashing __cxa_finalize. */' \
-      'static void nvshim_force_exit(void) { _exit(nvshim_saved_status); }' \
-      '__attribute__((constructor)) static void nvshim_init(void) {' \
-      '    atexit(nvshim_force_exit);' \
-      '}' \
-      '/* Interpose exit() so we capture the real status, then chain to libcs   */' \
-      '/* exit() which runs atexit handlers (ours included) in LIFO order.       */' \
-      'void exit(int status) {' \
-      '    static void (*real_exit)(int);' \
-      '    nvshim_saved_status = status;' \
-      '    if (!real_exit) real_exit = dlsym(RTLD_NEXT, "exit");' \
-      '    if (real_exit) real_exit(status);' \
-      '    _exit(status);' \
-      '    __builtin_unreachable();' \
-      '}' \
       > /tmp/nvshim.c && \
     gcc -shared -fPIC -nostartfiles -o /usr/local/lib/libnvshim.so /tmp/nvshim.c -lpthread -ldl && \
     rm /tmp/nvshim.c && \
@@ -1567,13 +1547,9 @@ ENV NVIDIA_VISIBLE_DEVICES=all \
 RUN apk add --no-cache bash && \
     printf '%s\n' \
     '#!/bin/bash' \
-    '# ffmpeg-cuda entrypoint:' \
-    '#   - swallow benign teardown SIGSEGV from libcuda dtors (139 -> 0)' \
-    '#   - upgrade silent-failure exits (0 -> 1) when ffmpeg printed a known' \
-    '#     fatal-error summary line. The CUDA build of ffmpeg currently' \
-    '#     returns exit code 0 for several real failure paths (bad encoder,' \
-    '#     bad input, bad filter); see docs/24-04-2026-ffmpeg-with-cuda.md' \
-    '#     "Known issue: silent-failure exit code".' \
+    '# ffmpeg-cuda entrypoint: swallow benign teardown SIGSEGV from libcuda dtors' \
+    '# (exit 139 -> 0) only when no error keyword appears in stderr. Real failure' \
+    '# exit codes (1, 8, 254, ...) propagate unchanged.' \
     'errfile=$(mktemp)' \
     'shellerr=$(mktemp)' \
     'trap "rm -f \"$errfile\" \"$shellerr\"" EXIT' \
@@ -1590,13 +1566,6 @@ RUN apk add --no-cache bash && \
     'if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then' \
     '    exit 0' \
     'fi' \
-    '# Upgrade silent-failure exit codes. ffmpeg prints these summary lines' \
-    '# only on hard-fail paths -- never as transient warnings on successful' \
-    '# encodes. Anchored to start-of-line to avoid false positives from' \
-    '# decoder/encoder log lines like "[h264 @ ...] error decoding stream".' \
-    'if [ "$rc" = "0" ] && grep -qE "^(Error opening (input|output) files?|Conversion failed!)" "$errfile"; then' \
-    '    exit 1' \
-    'fi' \
     'exit "$rc"' \
     > /usr/local/bin/ffmpeg-cuda-entrypoint && \
     chmod +x /usr/local/bin/ffmpeg-cuda-entrypoint
diff --git a/docs/24-04-2026-ffmpeg-with-cuda.md b/docs/ffmpeg-with-cuda.md
similarity index 84%
rename from docs/24-04-2026-ffmpeg-with-cuda.md
rename to docs/ffmpeg-with-cuda.md
index 0af9ff6..30279f5 100644
--- a/docs/24-04-2026-ffmpeg-with-cuda.md
+++ b/docs/ffmpeg-with-cuda.md
@@ -304,6 +304,7 @@ Of these, **only `:8.1-cuda` keeps every codec/lib statically linked** — every
 8. **`NVIDIA_DRIVER_CAPABILITIES` defaults to `utility` only** — without `compute,video` the toolkit doesn't mount `libnvcuvid.so`/`libnvidia-encode.so`. Baked the full set into the image's `ENV`.
 9. **`-Bdynamic -lc` reorder still produced the static dlopen stub** under gcc `--toolchain=hardened` — switched to absolute-path link of `/lib/ld-musl-x86_64.so.1` (see §6, "Fix (final, robust)").
 10. **NVENC encode succeeds but exits 139 (SIGSEGV) at process teardown** — libcuda's destructors crash under musl + gcompat during `cuCtxDestroy`. The crash happens in `main()` before any atexit handler fires, so it can't be caught from inside the binary. Fixed with a tiny entrypoint wrapper that downgrades exit 139 → 0 when stderr contains no recognised error keywords. See §14.
+11. **All ffmpeg errors silently exit 0 (bad codec, bad input, bad filter)** — root caused to a `_exit` interposer in `libnvshim.so` that always called `syscall(SYS_exit_group, 0)` regardless of the status it received (or had a bug that lost the argument). Verified via an `LD_PRELOAD` `dladdr` tracer: every `_exit` call resolved to `dso=/usr/local/lib/libnvshim.so`. **Fix**: removed the `_exit`/`exit` interposers from `libnvshim.so` entirely — they were never needed for the glibc→musl ABI shim, only the original (mistaken) attempt to suppress the teardown SEGV from inside the process. Real ffmpeg exit codes (`8` for bad codec, `254` for bad input, `8` for bad filter) now propagate identically to the non-CUDA `:8.1` image. See §5c.
 
 ---
 
@@ -510,6 +511,108 @@ exit "$rc"
 ffprobe doesn't need a wrapper: it doesn't invoke encoders and rarely auto-loads
 CUDA, so it doesn't reach the crashing destructor path.
 
+### 5c. Resolved: ffmpeg silently exits 0 on every error path
+
+**Symptom**: every fatal-error invocation of the CUDA build returned exit code
+`0` to the shell, despite ffmpeg printing the correct error messages on stderr.
+Verified against the non-CUDA `:8.1` baseline:
+
+| Scenario                               | non-CUDA `:8.1` | CUDA (broken) | CUDA (fixed) |
+|----------------------------------------|-----------------|---------------|--------------|
+| `-c:v this_codec_does_not_exist`       | `8`             | `0` ❌        | `8` ✅       |
+| `-i /no/such/file.mp4`                 | `254`           | `0` ❌        | `254` ✅     |
+| `-vf this_filter_does_not_exist`       | `8`             | `0` ❌        | `8` ✅       |
+| Successful encode                      | `0`             | `0` ✅        | `0` ✅       |
+| Successful encode (post-teardown SEGV) | n/a             | `139` (raw)   | `0` (wrapped) |
+
+This was masked at first because the wrapper grew an "upgrade exit 0 → 1 when
+stderr matches a fatal-error keyword" branch. That made T3 pass with a
+plausible-looking exit `1`, but it was a workaround, not a fix — and the wrong
+exit code (`1` instead of `8`/`254`) broke any caller that switched on the
+specific code.
+
+**Root-cause discovery**: an `LD_PRELOAD` `dladdr` tracer interposing `_exit`
+revealed that on every code path — bad-codec, bad-input, even successful
+`-version` — the call to `_exit` came from `libnvshim.so`:
+
+```
+[exittrace] _exit(0) ra=0x...  dso=/usr/local/lib/libnvshim.so
+```
+
+`libnvshim.so` had been given an `_exit` interposer (and at one point an
+`exit` interposer too) as part of the earlier-but-abandoned attempt to suppress
+the teardown SIGSEGV from inside the process. The interposer always invoked
+`syscall(SYS_exit_group, 0)` — i.e. it dropped ffmpeg's real exit status on
+the floor, hard-coding `0`. None of the standard ELF / readelf / `nm` checks
+flag this: the interposer is in a separately-loaded DSO, not in `/ffmpeg`, and
+musl's PLT happily binds `_exit` to whichever DSO comes first in symbol search
+order — `LD_PRELOAD` always wins.
+
+**Fix**: drop the `_exit` (and `exit`) overrides from `libnvshim.so` entirely.
+They were never needed for any glibc→musl ABI gap (those are all the symbol
+list documented in §4 — `gnu_get_libc_version`, `__register_atfork`,
+`dlmopen`, `dlvsym`, etc.). Process-lifecycle suppression belongs in the
+out-of-process bash wrapper (§5b), where it can read the real exit status via
+`${PIPESTATUS[0]}` and pattern-match on the actual error keywords.
+
+After removing the interposers, all standard ffmpeg exit codes match the
+non-CUDA build byte-for-byte, and the wrapper script collapses back to its
+minimal form:
+
+```bash
+#!/bin/bash
+errfile=$(mktemp)
+shellerr=$(mktemp)
+trap "rm -f \"$errfile\" \"$shellerr\"" EXIT
+exec 3>&1
+exec 4>&2
+exec 2>"$shellerr"
+{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&4
+rc=${PIPESTATUS[0]}
+exec 3>&-
+exec 2>&4 4>&-
+grep -vE "Segmentation fault.*core dumped.*/ffmpeg" "$shellerr" >&2 || true
+# Suppress *only* the known-benign teardown SIGSEGV from libcuda dtors.
+# Real failure exit codes (1, 8, 254, ...) propagate unchanged.
+if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then
+    exit 0
+fi
+exit "$rc"
+```
+
+**Lesson**: `LD_PRELOAD` shims should be the *minimum* symbol set that closes
+the glibc→musl ABI gap. Any process-lifecycle hook (exit, signal, atexit) added
+to such a shim will silently apply to *every* call from the host program, not
+just the one CUDA-driver call you were trying to fix. Keep lifecycle policy
+out-of-process.
+
+**Diagnostic recipe** (reuse this for any future "wrong exit code" regression):
+
+```sh
+docker run --rm --gpus all --entrypoint sh "$IMG" -c '
+  apk add --no-cache gcc musl-dev binutils >/dev/null
+  cat > /tmp/t.c <<EOF
+#define _GNU_SOURCE
+#include <dlfcn.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+__attribute__((noreturn)) void _exit(int s){
+  void *ra=__builtin_return_address(0); Dl_info i={0}; dladdr(ra,&i);
+  dprintf(2,"[trace] _exit(%d) dso=%s\n",s,i.dli_fname?i.dli_fname:"?");
+  syscall(SYS_exit_group,s); __builtin_unreachable();
+}
+EOF
+  gcc -O0 -fPIC -shared -o /tmp/t.so /tmp/t.c -ldl
+  LD_PRELOAD="/tmp/t.so:${LD_PRELOAD}" /ffmpeg -hide_banner -loglevel error \
+    -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \
+    -c:v this_codec_does_not_exist -f null -
+'
+# The traced _exit must show dso=/lib/ld-musl-x86_64.so.1 (i.e. real libc),
+# NOT dso=/usr/local/lib/libnvshim.so. If it shows nvshim, the interposer
+# regression is back.
+```
+
 ### Diagnostic playbook (for future re-entry)
 
 Quick all-in-one container probe used during this investigation:
@@ -585,8 +688,8 @@ EOF
 | `-Wl,--no-as-needed,-Bdynamic,-lc,--as-needed,-Bstatic` in extra-ldflags | Still pulled `libc.a` `dlopen` stub via gcc-hardened spec file |
 | Hide `/usr/lib/libc.a` during link | libgme.a configure-time symbol checks failed (gz*/inflate*) |
 | Absolute-path `-Wl,/lib/ld-musl-x86_64.so.1` in extra-ldflags | ✅ NVENC encode finally succeeds |
-| nvshim `exit()` interpose + atexit `_exit()` | SIGSEGV happens *before* main() returns, so atexit never runs — ineffective |
-| Entrypoint wrapper translating exit 139 → 0 with error-keyword guard | ✅ Final fix; clean exit 0 with stdout/stderr passthrough preserved |
+| nvshim `exit()` interpose + atexit `_exit()` | SIGSEGV happens *before* main() returns, so atexit never runs — ineffective. **Worse**: leaving the `_exit` interposer in the shim silently swallowed *every* ffmpeg exit code (always returned 0). See §5c. |
+| Entrypoint wrapper translating exit 139 → 0 with error-keyword guard | ✅ Final fix; clean exit 0 with stdout/stderr passthrough preserved, real exit codes (8/254/…) propagate unchanged |
 
 ### Decision branch (resolved — stayed on Alpine)
 
@@ -621,7 +724,7 @@ Removing any one breaks NVENC end-to-end. They are listed in the order they take
 | 2 | **Dynamic-PIE link mode** | builder, ffmpeg link | Replaces `-fPIE -static-pie` with `-fPIE -pie`. A static-pie binary has no dynamic loader, making `dlopen` impossible by definition. |
 | 3 | **`/etc/ld-musl-x86_64.path`** | final-cuda stage | Adds `/usr/lib64`, `/usr/lib/x86_64-linux-gnu`, `/usr/lib/wsl/lib` to musl's loader search path. The NVIDIA Container Toolkit injects driver libs into one of these depending on host distro; musl's default `/lib:/usr/local/lib:/usr/lib` finds none of them. |
 | 4 | **`gcompat` package + `libdl.so.2` symlink** | final-cuda stage | Provides `libc.so.6` / `libm.so.6` / `libpthread.so.0` / `librt.so.1` as musl wrappers (the driver's `DT_NEEDED` entries). The symlink points the driver's `libdl.so.2` reference at `libgcompat.so.0` since musl folds dlopen into libc and ships no separate `libdl`. |
-| 5 | **`libnvshim.so` LD_PRELOAD** | final-cuda stage | Exports glibc-internal symbols the driver references but gcompat doesn't ship: `gnu_get_libc_version`, `__register_atfork`, `__cxa_thread_atexit_impl`, `secure_getenv`, `dlmopen`, `dlvsym`, `__libc_dlopen_mode/dlsym/dlclose`, `__libc_current_sigrtmin/max`, `__libc_single_threaded`, `gnu_get_libc_release`. Without the shim, dlopen of the WSL2 backend `libcuda.so.1.1` fails with `symbol not found` errors. |
+| 5 | **`libnvshim.so` LD_PRELOAD** | final-cuda stage | Exports glibc-internal symbols the driver references but gcompat doesn't ship: `gnu_get_libc_version`, `__register_atfork`, `__cxa_thread_atexit_impl`, `secure_getenv`, `dlmopen`, `dlvsym`, `__libc_dlopen_mode/dlsym/dlclose`, `__libc_current_sigrtmin/max`, `__libc_single_threaded`, `gnu_get_libc_release`. Without the shim, dlopen of the WSL2 backend `libcuda.so.1.1` fails with `symbol not found` errors. **Must NOT export `exit`/`_exit`/`_Exit`** — see §5c; interposing those swallows ffmpeg's real exit status. |
 | 6 | **Entrypoint wrapper** | final-cuda stage | Bash script that exec's `/ffmpeg`, captures exit code via `${PIPESTATUS[0]}`, preserves stdout byte-exact via fd-3 trick, tees stderr to a temp file, and downgrades exit 139 → 0 *only* when stderr contains no recognised error keyword. Suppresses the cosmetic libcuda-destructor SIGSEGV that fires after the encode is fully complete. |
 
 Layers 1–2 belong to the **builder stage** (link-time concerns).
@@ -714,6 +817,13 @@ docker run --rm --gpus all "$IMG" \
 # 4. ffprobe sanity (no wrapper)
 docker run --rm --gpus all --entrypoint /ffprobe "$IMG" -version >/dev/null
 echo "exit=$? (must be 0)"
+
+# 5. Exit-code parity vs non-CUDA :8.1 (regression guard for §5c)
+docker run --rm --gpus all "$IMG" -hide_banner -loglevel error \
+    -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \
+    -c:v this_codec_does_not_exist -f null - ; echo "exit=$? (must be 8)"
+docker run --rm --gpus all "$IMG" -hide_banner -loglevel error \
+    -i /no/such/file.mp4 -f null - ; echo "exit=$? (must be 254)"
 ```
 
 All four must succeed for the image to be considered shippable.

From e0b1099ab36789872ab963dbee07f04827ec1c6f Mon Sep 17 00:00:00 2001
From: ToshY <31921460+ToshY@users.noreply.github.com>
Date: Sun, 3 May 2026 20:40:45 +0200
Subject: [PATCH 6/8] working cuda and non-cuda build

---
 Dockerfile               |  221 +++-----
 docs/ffmpeg-with-cuda.md | 1034 ++++++++++++++------------------------
 2 files changed, 452 insertions(+), 803 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 0848dee..746e9aa 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1037,12 +1037,10 @@ RUN \
     --enable-static && \
   make -j$(nproc) install
 
-# NVIDIA codec headers (header-only stubs for NVENC / NVDEC / CUVID / CUDA driver API).
-# These do NOT pull in the CUDA toolkit or any glibc-only NVIDIA libraries; ffmpeg
-# dlopen()s libcuda.so.1 / libnvcuvid.so / libnvidia-encode.so at runtime, which are
-# injected into the container by the NVIDIA Container Toolkit (`docker run --gpus all`).
-# Only built when ENABLE_CUDA is set; the resulting ffmpeg binary in that case is a
-# musl dynamic-PIE (not -static-pie) so the loader is present and dlopen() works.
+# NVIDIA codec headers (header-only; no CUDA toolkit needed). ffmpeg dlopen()s the
+# real driver libs (libcuda / libnvcuvid / libnvidia-encode) at runtime, injected
+# by the NVIDIA Container Toolkit. Only built when ENABLE_CUDA is set.
+# See docs/ffmpeg-with-cuda.md.
 # bump: ffnvcodec /FFNVCODEC_VERSION=([\d.]+)/ https://github.com/FFmpeg/nv-codec-headers.git|^13
 # bump: ffnvcodec after ./hashupdate Dockerfile FFNVCODEC $LATEST
 # bump: ffnvcodec link "Releases" https://github.com/FFmpeg/nv-codec-headers/releases
@@ -1133,20 +1131,16 @@ ARG FFMPEG_VERSION=8.1
 ARG FFMPEG_URL="https://ffmpeg.org/releases/ffmpeg-$FFMPEG_VERSION.tar.bz2"
 ARG FFMPEG_SHA256=c07039598df7d64d3c8b42c4e25b1959fc908621c6f6c2946881133f3b27eda2
 ARG ENABLE_FDKAAC=
-# sed changes --toolchain=hardened -pie to -static-pie
+# sed changes --toolchain=hardened -pie to -static-pie (default build only).
 #
-# When ENABLE_CUDA is set we KEEP -pie (i.e. skip the -static-pie rewrite) so the
-# resulting binary is a musl dynamic-PIE. This is required because ffnvcodec dlopen()s
-# the NVIDIA driver libs at runtime, and a fully static-pie binary on musl has no
-# dynamic loader → dlopen() always fails. All other dependencies remain statically
-# archived; only ld-musl-*.so.1 / libc.musl-*.so.1 stay dynamic.
+# CUDA variant: keep -pie (musl dynamic-PIE) so ffnvcodec can dlopen() the
+# NVIDIA driver libs. All other deps stay statically archived; only the musl
+# loader/libc is dynamic. See docs/ffmpeg-with-cuda.md.
 #
-# ldflags stack-size=2097152 is to increase default stack size from 128KB (musl default) to something
-# more similar to glibc (2MB). This fixing segfault with libaom-av1 and libsvtav1 as they seems to pass
-# large things on the stack.
-#
-# ldfalgs -Wl,--allow-multiple-definition is a workaround for linking with multiple rust staticlib to
-# not cause collision in toolchain symbols, see comment in checkdupsym script for details.
+# ldflags stack-size=2097152 raises musl's 128KB default to ~glibc 2MB
+# (libaom/libsvtav1 pass large objects on the stack).
+# ldflags --allow-multiple-definition works around rust staticlib toolchain
+# symbol collisions (see checkdupsym).
 RUN \
   wget $WGET_OPTS -O ffmpeg.tar.bz2 "$FFMPEG_URL" && \
   echo "$FFMPEG_SHA256  ffmpeg.tar.bz2" | sha256sum -c - && \
@@ -1154,32 +1148,17 @@ RUN \
   FDKAAC_FLAGS=$(if [[ -n "$ENABLE_FDKAAC" ]] ;then echo " --enable-libfdk-aac --enable-nonfree " ;else echo ""; fi) && \
   CUDA_FLAGS=$(if [[ -n "$ENABLE_CUDA" ]] ;then echo " --enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec " ;else echo ""; fi) && \
   if [[ -z "$ENABLE_CUDA" ]]; then \
-    # Default static-pie build: rewrite the hardened toolchain link flag so the
-    # final binaries are fully static PIE musl executables (no loader, no libc.so).
-    # dlopen is irrelevant in this branch (no GPU support), so plain -Bstatic is fine.
+    # Default: fully static-pie musl binary, no loader, no dlopen.
     sed -i 's/add_ldexeflags -fPIE -pie/add_ldexeflags -fPIE -static-pie/' configure ; \
-    EXTRA_LDFLAGS="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152 \
-        -Wl,--as-needed -Wl,-Bstatic \
-        -static-libstdc++ -static-libgcc" ; \
-    EXTRA_LIBS="-lgomp" ; \
+    EXTRA_LDFLAGS="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152" ; \
+    EXTRA_LIBS="" ; \
   else \
-    # CUDA variant: musl dynamic-PIE so the loader is present and ffmpeg can
-    # dlopen() libcuda.so.1 / libnvcuvid.so.1 / libnvidia-encode.so.1 that the
-    # NVIDIA Container Toolkit injects at runtime.
-    #
-    # CRITICAL — musl dlopen-stub trap (see docs/24-04-2026-ffmpeg-with-cuda.md §6):
-    #   musl's static libc.a contains a 25-byte dlopen() stub that always returns
-    #   NULL with ENOSYS. If we link the binary with bare "-Wl,-Bstatic ... codecs",
-    #   the linker satisfies ffmpeg's references to dlopen / dlsym / dlerror /
-    #   dlclose from that stub, NOT from the dynamic libc. The resulting binary
-    #   has a defined 25-byte "dlopen" symbol in .text instead of a UND PLT entry,
-    #   and h264_nvenc fails at runtime with "Cannot load libcuda.so.1" without
-    #   ever issuing an openat() syscall (verified with strace).
-    #
-    # Fix: explicitly link the dynamic libc by ABSOLUTE PATH (not -lc), so the
-    # linker uses libc.musl-x86_64.so.1 regardless of the current -B* mode and
-    # cannot fall back to libc.a's stub. Wrapped in --no-as-needed so it stays
-    # in DT_NEEDED even though ffmpeg.o doesn't directly reference its data.
+    # CUDA: musl dynamic-PIE. Link the dynamic libc by ABSOLUTE PATH (not -lc)
+    # to avoid musl's libc.a 25-byte dlopen() stub that always returns NULL —
+    # gcc's hardened toolchain can otherwise resolve dlopen/dlsym/dlerror from
+    # the static archive even when -Bdynamic is requested, breaking nvenc with
+    # a silent "Cannot load libcuda.so.1" (no openat syscall fires).
+    # See docs/ffmpeg-with-cuda.md (P1).
     EXTRA_LDFLAGS="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152 \
         -Wl,--no-as-needed,/lib/ld-musl-x86_64.so.1,--as-needed \
         -Wl,--as-needed -Wl,-Bstatic \
@@ -1338,10 +1317,9 @@ RUN \
   ffnvcodec: env.FFNVCODEC_VERSION, \
   }' > /versions.json
 
-# make sure binaries has no dependencies, is relro, pie and stack nx
-# When ENABLE_CUDA is set the binaries are musl dynamic-PIE (so dlopen() of NVIDIA
-# driver libs works at runtime); checkelf is invoked with --cuda which only allows
-# the musl loader / libc as NEEDED entries.
+# make sure binaries has no dependencies, is relro, pie and stack nx.
+# CUDA build is musl dynamic-PIE; --cuda allows the musl loader/libc as the
+# only NEEDED entry.
 COPY checkelf /
 RUN \
   CHECKELF_FLAGS=$(if [ -n "$ENABLE_CUDA" ]; then echo "--cuda"; fi) && \
@@ -1391,24 +1369,20 @@ FROM final2 AS final
 LABEL maintainer="Mattias Wadman mattias.wadman@gmail.com"
 ENTRYPOINT ["/ffmpeg"]
 
-# CUDA / NVENC / NVDEC variant.
-#
-# Build with:
-#   docker build --build-arg ENABLE_CUDA=1 --target final-cuda -t mwader/static-ffmpeg:<ver>-cuda .
+# CUDA / NVENC / NVDEC variant. See docs/ffmpeg-with-cuda.md for full design.
 #
-# Run with (requires NVIDIA driver on host + nvidia-container-toolkit):
-#   docker run --gpus all -i --rm -v "$PWD:$PWD" -w "$PWD" mwader/static-ffmpeg:<ver>-cuda \
-#     -hwaccel cuda -hwaccel_output_format cuda -i in.mp4 -c:v h264_nvenc out.mp4
+# Build:  docker build --build-arg ENABLE_CUDA=1 --target final-cuda -t mwader/static-ffmpeg:<ver>-cuda .
+# Run:    docker run --gpus all --rm mwader/static-ffmpeg:<ver>-cuda \
+#             -hwaccel cuda -hwaccel_output_format cuda -i in.mp4 -c:v h264_nvenc out.mp4
 #
-# The binary is a musl dynamic-PIE (NOT fully static-pie) so the dynamic loader is
-# present and FFmpeg can dlopen() the NVIDIA driver libraries (libcuda.so.1,
-# libnvcuvid.so, libnvidia-encode.so) which the NVIDIA Container Toolkit injects
-# into the container at runtime. No CUDA toolkit is required to build or run.
+# Requires NVIDIA driver on host + nvidia-container-toolkit. The binary is a musl
+# dynamic-PIE so the loader is present and the NVIDIA driver libs (libcuda.so.1,
+# libnvcuvid.so, libnvidia-encode.so) injected by the toolkit can be dlopen()'d.
+# No CUDA toolkit needed at build or run time.
 #
-# Note: --enable-libnpp / --enable-cuda-nvcc are NOT included as they require the
-# full glibc-based CUDA toolkit; if you need scale_npp use scale_cuda instead.
-FROM alpine:3.20.3 AS final-cuda
-LABEL maintainer="Mattias Wadman mattias.wadman@gmail.com"
+# --enable-libnpp / --enable-cuda-nvcc are NOT included (require glibc CUDA toolkit).
+# Use scale_cuda instead of scale_npp.
+FROM alpine:3.20.3 AS final-cuda1
 COPY --from=builder /usr/local/bin/ffmpeg /
 COPY --from=builder /usr/local/bin/ffprobe /
 COPY --from=builder /versions.json /
@@ -1419,51 +1393,24 @@ COPY --from=builder /usr/share/fonts/ /usr/share/fonts/
 COPY --from=builder /usr/share/consolefonts/ /usr/share/consolefonts/
 COPY --from=builder /var/cache/fontconfig/ /var/cache/fontconfig/
 
-# gcompat = glibc compatibility shim for musl. Required because the NVIDIA driver
-# libraries injected by the Container Toolkit (libcuda.so.1, libnvcuvid.so.1,
-# libnvidia-encode.so.1, libnvidia-ml.so.1, ...) are built against glibc and have
+# gcompat: glibc->musl shim. NVIDIA driver libs are built against glibc and have
 # DT_NEEDED entries for libc.so.6 / libpthread.so.0 / libdl.so.2 / libm.so.6 /
-# librt.so.1 / libgcc_s.so.1 — none of which exist on Alpine/musl. gcompat
-# provides those SONAMEs as thin wrappers over musl, allowing dlopen() to succeed.
-# libstdc++ is also pulled in because some NVIDIA helper libs (e.g. libnvidia-ngx,
-# certain optical-flow / ngx variants) link against it.
+# librt.so.1 — gcompat provides those SONAMEs as musl wrappers. libstdc++ is
+# pulled in for NVIDIA helper libs (e.g. libnvidia-ngx). gcompat omits libdl.so.2
+# (musl folds dlopen into libc) so symlink it manually.
 RUN apk add --no-cache gcompat libstdc++ && \
-    # gcompat omits libdl.so.2 (musl folds dlopen into libc). The NVIDIA driver
-    # has DT_NEEDED libdl.so.2, so symlink it to libgcompat to satisfy the loader.
     ln -sf libgcompat.so.0 /lib/libdl.so.2
 
-# nvshim = tiny LD_PRELOAD library that:
-#
-#   (a) exports glibc-internal symbols which gcompat does NOT provide but which the
-#       real NVIDIA WSL/Linux driver backend (/usr/lib/wsl/drivers/.../libcuda.so.1.1
-#       on WSL2, libcuda.so.1 directly on bare Linux) calls during cuInit().
-#       Without these the stub libcuda dlopen succeeds but its backend-load fails
-#       with "Error relocating: <sym>: symbol not found", which ffmpeg then surfaces
-#       as the misleading "Cannot load libcuda.so.1".
-#
-#   (b) [REMOVED 2026-05-03] An earlier version of this shim also interposed
-#       exit(3) and registered an atexit handler that called _exit() to skip
-#       libcuda's crashing DT_FINI destructors. That hack was structurally
-#       broken: ffmpeg's error paths return from main() with a nonzero status
-#       rather than calling exit() explicitly, so musl's _start invokes its
-#       internal exit() WITHOUT going through the PLT — bypassing our LD_PRELOAD
-#       interpose. Our atexit handler then fired with a stale saved_status of 0
-#       and clobbered every nonzero exit code (bad codec → 0, bad input → 0).
-#       The teardown SIGSEGV is now handled exclusively by the bash entrypoint
-#       wrapper at /usr/local/bin/ffmpeg-cuda-entrypoint, which converts the
-#       benign 139 to 0 only when no error keyword is present in stderr. Real
-#       failure exit codes propagate unchanged.
+# nvshim: tiny LD_PRELOAD library exporting glibc-internal symbols that gcompat
+# does NOT provide but the real NVIDIA driver backend calls during cuInit().
+# Without these, the stub libcuda dlopens fine but its backend fails with
+# "Error relocating: <sym>: symbol not found", which ffmpeg surfaces as the
+# misleading "Cannot load libcuda.so.1".
 #
-# Symbols covered for (a) — broadest set of glibc-internals NVIDIA driver libs are
-# known to reference; safe no-op or thin musl-redirect implementations:
-#   gnu_get_libc_version        - sanity-check string ("2.35" satisfies all current drivers)
-#   gnu_get_libc_release        - "stable"
-#   __libc_current_sigrtmin/max - musl macros, just expose as functions
-#   __register_atfork           - glibc internal backing pthread_atfork; redirect
-#   __libc_single_threaded      - data symbol some drivers test (0 = multi-threaded path)
-#   __cxa_thread_atexit_impl    - C++ thread-local destructors registration; no-op
-#   secure_getenv               - musl already has it but some old drivers want explicit
-#   dlmopen / dlvsym / __libc_dl* - glibc-only dl* variants, redirect to musl equivalents
+# IMPORTANT: this shim must NOT interpose exit / _exit / _Exit. Doing so
+# silently swallows ffmpeg's real exit codes (every error returns 0).
+# Process-lifecycle policy belongs in the bash entrypoint wrapper below.
+# See docs/ffmpeg-with-cuda.md (P6).
 RUN apk add --no-cache --virtual .nvshim-build gcc musl-dev && \
     mkdir -p /usr/local/lib && \
     printf '%s\n' \
@@ -1486,18 +1433,16 @@ RUN apk add --no-cache --virtual .nvshim-build gcc musl-dev && \
       '    (void)f; (void)o; (void)dso; return 0;' \
       '}' \
       'char *secure_getenv(const char *name) { return getenv(name); }' \
-      '/* dlmopen is a glibc-only namespaced dlopen; musl has no link namespaces. */' \
-      '/* Fallback to regular dlopen, ignoring the Lmid_t. Works for NVIDIA driver  */' \
-      '/* which uses dlmopen mostly for symbol isolation when loading sub-modules.  */' \
+      '/* dlmopen: glibc-only namespaced dlopen; musl has no link namespaces. */' \
       'typedef long Lmid_t;' \
       'void *dlmopen(Lmid_t lmid, const char *file, int mode) {' \
       '    (void)lmid; return dlopen(file, mode);' \
       '}' \
-      '/* Glibc-internal dlopen/dlsym variants used by nss / driver init paths. */' \
+      '/* glibc-internal dl* variants used by nss / driver init. */' \
       'void *__libc_dlopen_mode(const char *name, int mode) { return dlopen(name, mode); }' \
       'void *__libc_dlsym(void *handle, const char *name) { return dlsym(handle, name); }' \
       'int   __libc_dlclose(void *handle) { return dlclose(handle); }' \
-      '/* dlvsym = glibc versioned dlsym. musl has no symbol versioning; ignore version. */' \
+      '/* dlvsym: glibc versioned dlsym; musl has no symbol versioning. */' \
       'void *dlvsym(void *handle, const char *name, const char *version) {' \
       '    (void)version; return dlsym(handle, name);' \
       '}' \
@@ -1506,50 +1451,22 @@ RUN apk add --no-cache --virtual .nvshim-build gcc musl-dev && \
     rm /tmp/nvshim.c && \
     apk del .nvshim-build
 
-# Add NVIDIA driver injection paths to musl's dynamic-loader fallback search list.
-# The NVIDIA Container Toolkit places libcuda.so.1 etc. in one of these locations
-# depending on host distro:
-#   /usr/lib64                       (RHEL / CentOS / Fedora / Rocky / openSUSE / WSL)
-#   /usr/lib/x86_64-linux-gnu        (Debian / Ubuntu)
-#   /usr/lib/wsl/lib                 (WSL2 GPU passthrough alt path)
-# musl's default search path is /lib:/usr/local/lib:/usr/lib only, so dlopen("libcuda.so.1")
-# would otherwise fail with "Cannot load libcuda.so.1" even though the file is mounted.
+# musl loader fallback search path. The NVIDIA Container Toolkit injects driver
+# libs into one of these depending on host distro; musl's defaults
+# (/lib:/usr/local/lib:/usr/lib) miss all three.
 RUN printf '/lib\n/usr/local/lib\n/usr/lib\n/usr/lib64\n/usr/lib/x86_64-linux-gnu\n/usr/lib/wsl/lib\n' \
     > /etc/ld-musl-x86_64.path
 
-# Default NVIDIA Container Toolkit env vars so callers only need `--gpus all`.
-# compute  -> mounts the real libcuda.so.1
-# video    -> mounts libnvcuvid.so.1 / libnvidia-encode.so.1 (required for NVENC/NVDEC)
-# utility  -> mounts libnvidia-ml + nvidia-smi
-# LD_PRELOAD pulls in the nvshim providing glibc-internal symbols the driver needs.
-ENV NVIDIA_VISIBLE_DEVICES=all \
-    NVIDIA_DRIVER_CAPABILITIES=compute,video,utility \
-    LD_PRELOAD=/usr/local/lib/libnvshim.so
 
-# Entrypoint wrapper to suppress benign teardown SIGSEGV from NVIDIA driver dtors.
-#
-# Background: when ffmpeg encodes/decodes through CUDA on Alpine/musl, the encode
-# itself completes successfully and all output bytes are flushed, but at process
-# teardown libcuda's __cxa_finalize / DT_FINI runs glibc-style destructors that
-# unwind through state musl + gcompat don't fully provide, producing a SIGSEGV
-# (exit 139). The crash happens INSIDE main() during avcodec_close -> cuCtxDestroy,
-# before any atexit handler we could install would fire. There is no in-process
-# fix available short of patching libcuda (closed source) or ffmpeg's nvenc.c to
-# leak the CUDA context.
-#
-# Heuristic: convert exit=139 → 0 IFF stderr contains no recognisable ffmpeg
-# error keywords. If ffmpeg printed a real error before crashing (Cannot load,
-# "Error opening", "not found", etc.) we propagate 139 so users see real bugs.
-# Works regardless of -loglevel: silent successful encode + teardown crash =
-# empty stderr = suppressed; any real failure = error keyword present = passed
-# through. Stdout (e.g. -f null - or muxed bytes for `-f mpegts -`) is preserved
-# bit-exact via fd swap; user's stderr stream gets a live tee of ffmpeg stderr.
+# Entrypoint wrapper: convert the benign teardown SIGSEGV (139 -> 0) that
+# libcuda's __cxa_finalize triggers under musl + gcompat. The crash happens
+# inside main() after the encode is complete and all output is flushed, so
+# no in-process hook can suppress it. Heuristic: only downgrade 139 when
+# stderr contains no recognisable error keyword. Real failure exit codes
+# (1, 8, 254, ...) propagate unchanged. See docs/ffmpeg-with-cuda.md (P5).
 RUN apk add --no-cache bash && \
     printf '%s\n' \
     '#!/bin/bash' \
-    '# ffmpeg-cuda entrypoint: swallow benign teardown SIGSEGV from libcuda dtors' \
-    '# (exit 139 -> 0) only when no error keyword appears in stderr. Real failure' \
-    '# exit codes (1, 8, 254, ...) propagate unchanged.' \
     'errfile=$(mktemp)' \
     'shellerr=$(mktemp)' \
     'trap "rm -f \"$errfile\" \"$shellerr\"" EXIT' \
@@ -1560,9 +1477,7 @@ RUN apk add --no-cache bash && \
     'rc=${PIPESTATUS[0]}' \
     'exec 3>&-' \
     'exec 2>&4 4>&-' \
-    '# Replay bash diagnostics minus the known-benign SEGV line.' \
     'grep -vE "Segmentation fault.*core dumped.*/ffmpeg" "$shellerr" >&2 || true' \
-    '# Suppress the known benign teardown SIGSEGV (libcuda dtors on musl).' \
     'if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then' \
     '    exit 0' \
     'fi' \
@@ -1570,11 +1485,25 @@ RUN apk add --no-cache bash && \
     > /usr/local/bin/ffmpeg-cuda-entrypoint && \
     chmod +x /usr/local/bin/ffmpeg-cuda-entrypoint
 
-# sanity tests (cannot exercise actual GPU encode without a GPU at build time)
+# sanity tests (cannot exercise actual GPU encode without a GPU at build time).
+# LD_PRELOAD set inline since the env is only declared in the final stage below.
 RUN ["/ffmpeg", "-version"]
 RUN ["/ffprobe", "-version"]
 RUN ["/ffmpeg", "-hide_banner", "-buildconf"]
 RUN /ffmpeg -hide_banner -hwaccels 2>&1 | grep -q cuda
 RUN /ffmpeg -hide_banner -encoders 2>&1 | grep -q nvenc
 RUN /ffmpeg -hide_banner -decoders 2>&1 | grep -q cuvid
+
+# clamp all files into one layer
+FROM scratch AS final-cuda2
+COPY --from=final-cuda1 / /
+
+FROM final-cuda2 AS final-cuda
+LABEL maintainer="Mattias Wadman mattias.wadman@gmail.com"
+# Default toolkit env so callers only need `--gpus all`.
+#   compute -> libcuda.so.1 ; video -> libnvcuvid + libnvidia-encode (NVENC/NVDEC) ;
+#   utility -> libnvidia-ml + nvidia-smi.
+ENV NVIDIA_VISIBLE_DEVICES=all \
+    NVIDIA_DRIVER_CAPABILITIES=compute,video,utility \
+    LD_PRELOAD=/usr/local/lib/libnvshim.so
 ENTRYPOINT ["/usr/local/bin/ffmpeg-cuda-entrypoint"]
diff --git a/docs/ffmpeg-with-cuda.md b/docs/ffmpeg-with-cuda.md
index 30279f5..e1286dc 100644
--- a/docs/ffmpeg-with-cuda.md
+++ b/docs/ffmpeg-with-cuda.md
@@ -1,213 +1,161 @@
 # Adding NVIDIA CUDA / NVENC / NVDEC support to `static-ffmpeg`
 
-**Date:** 2026-04-24
+**Date:** 2026-04-24 → 2026-05-03
 **Tracking issue:** [#480 — Support for CUDA](https://github.com/wader/static-ffmpeg/issues/480)
-**Outcome:** Separate `:<tag>-cuda` image variant added; default `:<tag>` remains a fully static-pie binary.
+**Outcome:** Separate `:<tag>-cuda` image variant; default `:<tag>` remains a fully static-pie binary.
 
 ---
 
-## 1. Problem statement
+## TL;DR
 
-The default `mwader/static-ffmpeg` image is a **fully static-pie musl binary** with zero
-runtime dependencies. NVIDIA GPU acceleration (NVENC/NVDEC/CUVID) requires
-`dlopen()`'ing the host's NVIDIA driver libraries (`libcuda.so.1`,
-`libnvcuvid.so`, `libnvidia-encode.so`) at runtime, which is fundamentally
-incompatible with `static-pie` on musl: a static-pie binary has no dynamic
-loader, so `dlopen()` cannot work.
-
-Goal: ship a second image variant that supports CUDA without breaking the
-existing static guarantees of the default image.
+| | Default `:8.1` | CUDA `:8.1-cuda` |
+|---|---|---|
+| Linkage | static-pie musl | musl **dynamic-PIE** (libc only) |
+| `readelf -d` NEEDED | (none) | exactly one: `libc.musl-x86_64.so.1` |
+| GPU | ❌ | ✅ NVENC / NVDEC / CUVID |
+| Arch | amd64 + arm64 | amd64 only |
+| Base image | scratch | alpine |
+| ffmpeg exit codes | upstream | identical to upstream |
+
+The CUDA variant works on Alpine + musl by combining six independently-essential
+layers (link-time + runtime). Each layer fixes one specific failure mode that
+appeared during development. The layers are summarized below; full
+problem → cause → fix sections follow.
+
+| # | Layer | Stage | Fixes |
+|---|---|---|---|
+| 1 | Absolute-path link of `/lib/ld-musl-x86_64.so.1` | builder | dlopen returning NULL silently (P1) |
+| 2 | Dynamic-PIE link mode (`-fPIE -pie`, not `-static-pie`) | builder | dlopen impossible on static-pie (P1) |
+| 3 | `/etc/ld-musl-x86_64.path` listing toolkit injection dirs | runtime | musl can't find `/usr/lib64`, `/usr/lib/wsl/lib` (P3) |
+| 4 | `gcompat` package + `libdl.so.2 → libgcompat.so.0` symlink | runtime | NVIDIA driver libs need `libc.so.6` / `libdl.so.2` (P4) |
+| 5 | `libnvshim.so` LD_PRELOAD (ABI-shim symbols only) | runtime | glibc-internal symbols missing from gcompat (P4) |
+| 6 | Bash entrypoint wrapper (139 → 0 only) | runtime | benign teardown SIGSEGV from libcuda dtors (P5) |
 
 ---
 
-## 2. Architecture decision
+## 1. Architecture decision
 
 ### Two separate variants, not one
 
-| Variant | Tag                        | Linkage                             | GPU support |
-|---------|----------------------------|-------------------------------------|-------------|
-| Default | `8.1`, `latest`            | static-pie musl                     | ❌          |
-| CUDA    | `8.1-cuda`, `latest-cuda`  | musl **dynamic-PIE** (libc only)    | ✅          |
-
-**Why a separate variant** (not a build-arg toggle on the default tag):
-- The default tag's value proposition is "drop into any base image including `FROM scratch`". Making it dynamic would silently break that for thousands of existing users.
-- CUDA users need the NVIDIA Container Toolkit and a GPU host — fundamentally different deployment.
-- Different tag = explicit user opt-in + clear support boundary.
+- The default `mwader/static-ffmpeg` is a fully static-pie musl binary that drops into `FROM scratch`. We must not silently break that for existing users.
+- CUDA requires `dlopen()` of host driver libraries → fundamentally incompatible with `static-pie` on musl (no dynamic loader).
+- CUDA users need the NVIDIA Container Toolkit and a GPU host — different deployment.
+- → Different tag = explicit user opt-in + clear support boundary.
 
 ### Build-arg `ENABLE_CUDA`
 
 A single `ARG ENABLE_CUDA=` controls everything:
-- Adds `nv-codec-headers` (header-only, no runtime CUDA toolkit needed)
-- Adds `--enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec` to ffmpeg
-- Switches link mode from `static-pie` to musl `dynamic-PIE`
-- Sets `NVIDIA_VISIBLE_DEVICES=all` and `NVIDIA_DRIVER_CAPABILITIES=compute,utility,video` env
-- Writes `/etc/ld-musl-x86_64.path` so musl's loader can find toolkit-injected libs
-- Switches `checkelf` to `--cuda` mode (allows libc as the only NEEDED entry)
-
-The CI builds two images per release: default (no arg) and `final-cuda` target with `ENABLE_CUDA=1`.
-
----
-
-## 3. Why CUDA cannot be `static-pie` on musl
-
-| Constraint | Implication |
-|---|---|
-| `static-pie` binaries have no dynamic loader | `dlopen()` impossible |
-| `nvenc` calls `dlopen("libcuda.so.1", RTLD_LAZY)` via `ffnvcodec/dynlink_loader.h` | Must be a dynamic binary |
-| `libcuda.so.1` is provided by the host driver, version-matched to the host | Must NOT be bundled in image |
-| NVIDIA Container Toolkit injects driver libs at container start | Image just needs to be loadable |
 
-**The minimum-impact compromise:** binary is dynamic only for libc; *every other dependency* (codecs, openssl, libstdc++, libgomp, libgcc, …) remains statically archived. The cuda variant's `readelf -d` differs from the default by **exactly one extra `NEEDED` entry**: `libc.musl-x86_64.so.1`.
+- Adds `nv-codec-headers` (header-only, no CUDA toolkit at build time).
+- Adds `--enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec`.
+- Switches link mode from `static-pie` to musl dynamic-PIE.
+- Sets `NVIDIA_VISIBLE_DEVICES=all` and `NVIDIA_DRIVER_CAPABILITIES=compute,utility,video`.
+- Writes `/etc/ld-musl-x86_64.path` so musl's loader can find toolkit-injected libs.
+- Switches `checkelf` to `--cuda` mode (allows libc as the only NEEDED entry).
 
----
+CI builds two images per release: default (no arg) and `final-cuda` target with `ENABLE_CUDA=1`.
 
-## 4. Limitations explicitly NOT supported
+### Explicitly NOT supported
 
 | Feature | Reason |
 |---|---|
 | `--enable-cuda-nvcc` | Requires the full ~3 GB glibc-based CUDA toolkit at build time |
-| `--enable-libnpp`    | Same — glibc-based, defeats the static/musl design |
-| `scale_npp` filter   | Comes with libnpp; use `scale_cuda` instead |
-| `arm64` builds       | NVIDIA Container Toolkit on arm64 is server-class only (Jetson uses a different stack); released as **amd64-only** for now |
-| `FROM scratch` / distroless target images | No musl loader available; copy-out won't work |
+| `--enable-libnpp` / `scale_npp` | Same — glibc-only; use `scale_cuda` instead |
+| `arm64` | NVIDIA Container Toolkit on arm64 is server-class only (Jetson uses a different stack) |
+| `FROM scratch` / distroless target images | No musl loader available |
 
 ---
 
-## 5. Files changed
-
-### `Dockerfile`
-1. New `ARG ENABLE_CUDA=` early in the builder stage.
-2. New `nv-codec-headers` install step (skipped when `ENABLE_CUDA` is unset).
-3. `ffmpeg` configure step extended:
-   - `--enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec` when `ENABLE_CUDA`
-   - Replaces `add_ldexeflags -fPIE -static-pie` with `-fPIE -pie` (dynamic-PIE) when `ENABLE_CUDA`
-   - Custom `CUDA_LDFLAGS` / `CUDA_EXTRA_LIBS` to keep all non-libc deps static (see §6)
-4. `checkelf` invocation gains `--cuda` flag when `ENABLE_CUDA`.
-5. New `final-cuda` stage: `FROM alpine:3.X` + copy of `/usr/local/bin/{ffmpeg,ffprobe}` + ld-musl path config + `ENV NVIDIA_*`.
-
-### `checkelf`
-- Accepts `--cuda` flag.
-- In `--cuda` mode allows the musl loader/libc entry from `ldd` output (everything else still rejected).
-- All other hardening checks (RELRO, BIND_NOW, PIE, NX stack) preserved.
+## 2. Problem → Root cause → Fix
 
-### `README.md`
-- New "CUDA / NVENC / NVDEC" section with build, run, `COPY --from=` recipes for Alpine / Debian / `nvidia/cuda:*` target images, and a "verify static-ness from the host" section using `readelf -d`.
-- New tag entry: `<tag>-cuda` / `latest-cuda` (amd64-only).
+Each subsection records one failure mode encountered during development.
 
 ---
 
-## 6. The dlopen / static-musl trap (gotcha worth documenting)
-
-This was the single most painful issue and is **not obvious** from the build logs.
-
-### Symptom
-
-The `:8.1-cuda` binary builds successfully, `checkelf --cuda` passes, but at runtime:
-
-```
-[h264_nvenc @ 0x...] Cannot load libcuda.so.1
-```
-
-`strace -e openat` shows that ffmpeg **never even attempts** to open any libcuda file — `dlopen()` returns NULL immediately without touching the filesystem.
-
-### Root cause
-
-musl's **static `libc.a`** ships a 25-byte `dlopen` stub that always returns NULL with `errno=ENOSYS`. This is documented behavior — musl deliberately does not support `dlopen` from statically-linked binaries.
-
-The original CUDA build flags were:
-
-```sh
---extra-ldflags='-static-libstdc++ -static-libgcc -Wl,-Bstatic'
---extra-libs=' -lgomp -Wl,-Bdynamic -lc '
-```
-
-The intent: switch to `-Bstatic` for the codec libs, then flip back to `-Bdynamic` at the end so libc stays dynamic. That keeps `ldd` output clean (one NEEDED entry: musl libc).
-
-The bug: ffmpeg's `nvenc.c` references `dlopen`. While processing the codec `.a` files in `-Bstatic` mode, the linker resolves `dlopen` from the static `libc.a` (which gcc pulls in implicitly). Result:
-
-```
-readelf -s --dyn-syms /ffmpeg | grep dlopen
-# 21987: 000000000338c50e   25 FUNC WEAK DEFAULT 14 dlopen
-#                           ^^                  ^^^^
-#                       25 bytes              .text section
-```
-
-`dlopen` is a **25-byte function defined inside the binary itself** in section 14 (`.text`) — the static stub. It's not `UND`, so it never goes through the PLT to dynamic libc.
-
-### Fix (final, robust)
-
-Link the musl loader/libc by **absolute path** in the `--extra-ldflags`, so the
-linker resolution is immune to subsequent `-Bstatic`/`-Bdynamic` toggles:
-
+### P1. `[h264_nvenc] Cannot load libcuda.so.1` — `dlopen()` silently returns NULL
+
+**Symptom.** Binary builds, `checkelf --cuda` passes, but at runtime
+`dlopen("libcuda.so.1")` returns NULL. `strace -e openat` shows ffmpeg never
+even attempts to open any libcuda file — no syscall fires at all.
+
+**Root cause.** Two independent musl traps stacked together:
+
+1. **`-static-pie` has no dynamic loader.** A static-pie musl binary cannot
+   `dlopen()` anything by definition.
+2. **musl's static `libc.a` ships a 25-byte `dlopen` stub** that always returns
+   `NULL` with `errno=ENOSYS`. Even after switching to dynamic-PIE, gcc's
+   `--toolchain=hardened` spec file kept emitting late references that pulled
+   `libc.a` back in, restoring the stub inside the binary. The bug was
+   invisible to standard checks: `BIND_NOW`, `RELRO`, `PIE`, NX stack all
+   passed; `ldd` still showed only one extra NEEDED entry. Only
+   `readelf -s --dyn-syms /ffmpeg | grep dlopen` revealed:
+   ```
+   21987: 000000000338c50e   25 FUNC WEAK DEFAULT 14 dlopen
+   ```
+   — `dlopen` defined inside `.text` at 25 bytes, not `UND`.
+
+   Variants tried that did NOT fix it:
+   - `--extra-libs=' -lgomp -Wl,-Bdynamic -lc '` reorder — gcc spec file re-pulled `libc.a`.
+   - Hiding `/usr/lib/libc.a` during link — broke libgme configure-time symbol checks.
+
+**Fix (Layers 1 + 2).**
+
+1. Link mode: replace `add_ldexeflags -fPIE -static-pie` with `-fPIE -pie`.
+2. Link the musl combined loader/libc by **absolute path** in
+   `--extra-ldflags`, so the linker resolution is immune to `-Bstatic` /
+   `-Bdynamic` toggles and gcc spec-file re-emissions:
+   ```sh
+   --extra-ldflags='-fopenmp -Wl,--allow-multiple-definition \
+       -Wl,-z,stack-size=2097152 \
+       -Wl,--no-as-needed,/lib/ld-musl-x86_64.so.1,--as-needed \
+       -Wl,--as-needed -Wl,-Bstatic \
+       -static-libstdc++ -static-libgcc'
+   --extra-libs='-lgomp -Wl,-Bdynamic -lc'
+   ```
+
+   On Alpine, `/lib/ld-musl-x86_64.so.1` is *both* the dynamic loader and libc;
+   one absolute filename covers everything we needed `-lc` for. An absolute
+   filename is opened literally regardless of `-Bstatic` mode and cannot be
+   re-resolved against `libc.a`.
+
+**Verification.**
 ```sh
---extra-ldflags='-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152 \
-    -Wl,--no-as-needed,/lib/ld-musl-x86_64.so.1,--as-needed \
-    -Wl,--as-needed -Wl,-Bstatic \
-    -static-libstdc++ -static-libgcc'
---extra-libs='-lgomp -Wl,-Bdynamic -lc'
+readelf -s --dyn-syms /ffmpeg | grep -E 'dlopen|dlsym|dlerror|dlclose'
+# Each must be 0-size UND, OR not exported (resolved internally against
+# the absolute-path libc — both work). The functional NVENC encode is
+# the ground truth; readelf is the cheap pre-flight.
 ```
 
-Why the absolute path works where `-Wl,--no-as-needed,-Bdynamic,-lc` did not:
-
-- A `-l<name>` argument is searched per the current `-Bstatic`/`-Bdynamic` mode and
-  per the linker's library search path. It is also fed through gcc's spec file,
-  which (especially under `--toolchain=hardened`) re-emits late-stage references
-  that can pull `libc.a` back in even after a careful `-Bdynamic … -Bstatic`
-  reorder, restoring the broken stub.
-- An **absolute filename** in the linker command line is not treated as a `-l`
-  search at all; it is opened literally as a DSO regardless of the `-Bstatic`
-  mode in effect. Its dynamic symbols (including `dlopen`, `dlsym`, `dlerror`,
-  `dlclose`) are then available to satisfy references from later `.a` archives,
-  and those references resolve as `UND` (PLT) instead of pulling the static stub.
-- On Alpine, `/lib/ld-musl-x86_64.so.1` is *both* the dynamic loader and libc —
-  one file serves both roles — so this single absolute path covers everything
-  we needed `-lc` for.
-
-### Verification (the bug is invisible to most checks)
+**Lesson.** Never link musl `libc.a` into a binary that calls `dlopen` — it
+will silently use the stub. The `-Bdynamic -lc -Bstatic` reorder is fragile
+under `--toolchain=hardened`; prefer the absolute-path form.
 
-```sh
-readelf -s --dyn-syms /ffmpeg | grep -E 'dlopen|dlsym|dlerror|dlclose'
-# Each must show:
-#       0:               0   FUNC ... UND dl<name>
-# If any shows a non-zero size with a section number (e.g. " 25 FUNC ... 14 dlopen"),
-# the static stub is back and dlopen will silently return NULL with ENOSYS.
-```
+---
 
-> Note: in some link configurations the linker may resolve `dlopen` purely
-> *internally* against the absolute-path libc and not export an explicit `UND`
-> entry for it. The functional test (h264_nvenc actually encoding frames)
-> remains the ultimate ground truth; readelf is just the cheapest pre-flight
-> check that catches the stub-bug regression.
+### P2. `checkelf` rejects the dynamic-PIE binary
 
-### Lessons for any future change to this build
+**Symptom.** The CUDA build's hardening check rejects the binary because it
+has a `NEEDED` entry (libc), whereas the default build has zero.
 
-- **Never link musl `libc.a` into a binary that calls `dlopen`.** It will silently use the stub.
-- The `-Bdynamic -lc -Bstatic` reorder is fragile under gcc's `--toolchain=hardened`
-  spec file. Prefer the absolute-path form `/lib/ld-musl-x86_64.so.1`.
-- The bug is invisible to standard hardening checks: the binary still has
-  `BIND_NOW`, `RELRO`, `PIE`, NX stack. `ldd` still shows only one extra
-  NEEDED entry.
-- The only reliable signal is a real NVENC encode actually emitting frames.
+**Fix.** Add `--cuda` flag to `checkelf`. In `--cuda` mode it allows the
+musl loader/libc entry from `ldd` output (everything else still rejected).
+All other hardening checks (RELRO, BIND_NOW, PIE, NX stack) preserved.
 
 ---
 
-## 7. Runtime requirements
+### P3. `dlopen("libcuda.so.1")` reports "Library not found"
 
-### Host
-- NVIDIA driver installed
-- [NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) installed and configured for Docker
-- Run with `--gpus all` (or `--runtime=nvidia` + `NVIDIA_VISIBLE_DEVICES`)
-
-### Image-side env (set by Dockerfile)
-- `NVIDIA_VISIBLE_DEVICES=all`
-- `NVIDIA_DRIVER_CAPABILITIES=compute,utility,video`
-  - `compute` → `libcuda.so.1`
-  - `video` → `libnvcuvid.so`, `libnvidia-encode.so`
-  - Dropping `video` makes `nvidia-smi` work but breaks `h264_nvenc` with `Cannot load libcuda.so.1`.
+**Symptom.** With driver libs actually mounted by the toolkit,
+`dlopen("libcuda.so.1")` still fails with "Library not found".
 
-### `/etc/ld-musl-x86_64.path`
-musl does **not** read `/etc/ld.so.cache`, so the toolkit's `ldconfig` post-start hook is silently ignored. We ship a static path file:
+**Root cause.** musl's default loader search path is
+`/lib:/usr/local/lib:/usr/lib`. The NVIDIA Container Toolkit injects driver
+libs to `/usr/lib64` (RHEL/Fedora/WSL convention) or
+`/usr/lib/x86_64-linux-gnu` (Debian/Ubuntu). musl also doesn't read
+`/etc/ld.so.cache`, so the toolkit's `ldconfig` post-start hook is silently
+ignored.
 
+**Fix (Layer 3).** Ship a static `/etc/ld-musl-x86_64.path`:
 ```
 /usr/lib/x86_64-linux-gnu
 /usr/lib64
@@ -216,424 +164,190 @@ musl does **not** read `/etc/ld.so.cache`, so the toolkit's `ldconfig` post-star
 /usr/local/lib
 /lib
 ```
-
-Covers the three common toolkit injection layouts:
-- Debian/Ubuntu hosts → `/usr/lib/x86_64-linux-gnu`
-- RHEL/Fedora hosts   → `/usr/lib64`
-- WSL2                → `/usr/lib/wsl/lib`
-
 Listing all is safe — musl silently skips paths that don't exist.
 
 ---
 
-## 8. Verifying the image
-
-### From any Linux host (no musl needed)
-
-```sh
-docker create --name sf      mwader/static-ffmpeg:8.1
-docker cp sf:/ffmpeg         /tmp/ffmpeg-static && docker rm sf
-
-docker create --name sfcuda  mwader/static-ffmpeg:8.1-cuda
-docker cp sfcuda:/ffmpeg     /tmp/ffmpeg-cuda && docker rm sfcuda
-
-readelf -d /tmp/ffmpeg-static | grep -E 'NEEDED|BIND_NOW'
-# (no NEEDED entries — fully static)
-# 0x000000000000001e (FLAGS) BIND_NOW
-
-readelf -d /tmp/ffmpeg-cuda  | grep -E 'NEEDED|BIND_NOW'
-# 0x0000000000000001 (NEEDED) Shared library: [libc.musl-x86_64.so.1]
-# 0x000000000000001e (FLAGS) BIND_NOW
-```
-
-### dlopen sanity check (the painful one)
-
-```sh
-docker run --gpus all --rm --entrypoint sh mwader/static-ffmpeg:8.1-cuda -c '
-apk add --no-cache binutils >/dev/null 2>&1
-readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror"
-'
-# MUST end with "UND dlopen", "UND dlsym", "UND dlerror"
-# If any has a non-zero size in .text → static stub bug is back.
-```
-
-### Functional encode
-
-```sh
-docker run --gpus all --rm mwader/static-ffmpeg:8.1-cuda \
-    -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \
-    -c:v h264_nvenc -f null -
-# expect: frame=  60 ... finished
-```
-
----
-
-## 9. Comparison with other static ffmpeg + nvenc projects
-
-| Project | Static? | NVENC? | Approach |
-|---|---|---|---|
-| `mwader/static-ffmpeg:8.1` | ✅ static-pie musl | ❌ | Pure static, no dlopen |
-| `mwader/static-ffmpeg:8.1-cuda` | ⚠️ musl dynamic-PIE (libc only) | ✅ | Hybrid — only libc dynamic; `dlopen()` works |
-| BtbN/FFmpeg-Builds (LGPL/GPL) | ⚠️ glibc dynamic, plus runtime ldconfig | ✅ | Tarball, glibc-linked |
-| HiWay-Media/ffmpeg-nvenc-static | ⚠️ glibc dynamic | ✅ | Bundled libs |
-| markus-perl/ffmpeg-build-script | ⚠️ glibc dynamic | optional | Script, not container |
-
-Of these, **only `:8.1-cuda` keeps every codec/lib statically linked** — every other "static + nvenc" build is glibc-dynamic. The trade-off vs the default `:8.1` is exactly one libc.so dependency.
-
----
-
-## 10. CI / multi-arch publishing notes
-
-- Default tag: built for `linux/amd64,linux/arm64` as before.
-- CUDA tag: built for `linux/amd64` only.
-  - Pushed as `<tag>-cuda` (and re-tagged manifest-style as `<tag>-cuda-amd64` for clarity).
-  - `latest-cuda` follows latest stable.
-- Use `--target final-cuda` and `--build-arg ENABLE_CUDA=1` in the CI matrix entry.
-
----
-
-## 11. Issues encountered during implementation (chronological)
-
-1. **`nv-codec-headers` checksum mismatch** — initial SHA256 was wrong; fixed by recomputing against the actual GitHub release tarball.
-2. **`checkelf` rejected the dynamic-PIE binary** — added `--cuda` mode that allows musl libc + loader as the only `ldd` entries.
-3. **Spurious dynamic deps (`libgomp`, `libdrm`, etc.)** — fixed by pre-linking with `-Wl,-Bstatic` (initial fix) and `-static-libgcc -static-libstdc++`.
-4. **`Cannot load libcuda.so.1` at runtime, despite `--gpus all`** (the big one) — root caused to musl's static `libc.a` `dlopen` stub. Fixed in §6.
-5. **WSL2 + nvidia-container-toolkit 1.19 SIGSEGV during prestart hook** — host-side regression unrelated to image; resolved by `wsl --shutdown` + restart. Not an image issue.
-6. **NVIDIA driver libs reference glibc-internal symbols missing from musl/gcompat** — added `gcompat` package + a tiny `libnvshim.so` `LD_PRELOAD` library exporting the missing symbols. See §14.
-7. **musl loader doesn't search `/usr/lib64` / `/usr/lib/wsl/lib` where the toolkit injects driver libs** — added `/etc/ld-musl-x86_64.path` listing all known injection layouts.
-8. **`NVIDIA_DRIVER_CAPABILITIES` defaults to `utility` only** — without `compute,video` the toolkit doesn't mount `libnvcuvid.so`/`libnvidia-encode.so`. Baked the full set into the image's `ENV`.
-9. **`-Bdynamic -lc` reorder still produced the static dlopen stub** under gcc `--toolchain=hardened` — switched to absolute-path link of `/lib/ld-musl-x86_64.so.1` (see §6, "Fix (final, robust)").
-10. **NVENC encode succeeds but exits 139 (SIGSEGV) at process teardown** — libcuda's destructors crash under musl + gcompat during `cuCtxDestroy`. The crash happens in `main()` before any atexit handler fires, so it can't be caught from inside the binary. Fixed with a tiny entrypoint wrapper that downgrades exit 139 → 0 when stderr contains no recognised error keywords. See §14.
-11. **All ffmpeg errors silently exit 0 (bad codec, bad input, bad filter)** — root caused to a `_exit` interposer in `libnvshim.so` that always called `syscall(SYS_exit_group, 0)` regardless of the status it received (or had a bug that lost the argument). Verified via an `LD_PRELOAD` `dladdr` tracer: every `_exit` call resolved to `dso=/usr/local/lib/libnvshim.so`. **Fix**: removed the `_exit`/`exit` interposers from `libnvshim.so` entirely — they were never needed for the glibc→musl ABI shim, only the original (mistaken) attempt to suppress the teardown SEGV from inside the process. Real ffmpeg exit codes (`8` for bad codec, `254` for bad input, `8` for bad filter) now propagate identically to the non-CUDA `:8.1` image. See §5c.
-
----
-
-## 12. Open follow-ups
-
-- [ ] Document required `nvidia-container-toolkit` minimum version once we know which versions reliably handle the prestart hook on WSL2.
-- [ ] Consider exposing `NVIDIA_DRIVER_CAPABILITIES` as a build-arg for power users who want to drop `video`.
-- [ ] Add a CI smoke test that runs the encode on a self-hosted GPU runner (currently only readelf-level checks possible in vanilla GitHub Actions).
-- [ ] Investigate whether `arm64` Jetson support is feasible later (would need a separate `nv-codec-headers` build path and likely a different base image).
+### P4. NVIDIA driver libs reference glibc-internal symbols missing from musl
+
+**Symptom.** Even with libs found, `dlopen("libcuda.so.1.1")` (the WSL2
+backend) fails with `Error relocating: <sym>: symbol not found`. Iteratively
+discovered missing symbols: `gnu_get_libc_version`, `__register_atfork`,
+`dlmopen`, `dlvsym`, etc.
+
+**Root cause.** NVIDIA driver libs are built against glibc.
+`gcompat` provides `libc.so.6` / `libm.so.6` / `libpthread.so.0` /
+`librt.so.1` as musl wrappers, but is missing `libdl.so.2` (musl folds
+`dlopen` into libc) and a number of glibc-internal helpers used by recent
+drivers.
+
+**Fix (Layers 4 + 5).**
+
+- Install `gcompat` package.
+- Symlink `libdl.so.2 → libgcompat.so.0` (driver's `DT_NEEDED libdl.so.2`).
+- Build a small `libnvshim.so` exporting the missing glibc-internal symbols
+  and `LD_PRELOAD` it. Final shim payload:
+
+  | Symbol | Implementation |
+  |---|---|
+  | `gnu_get_libc_version` | return `"2.35"` |
+  | `gnu_get_libc_release` | return `"stable"` |
+  | `__libc_current_sigrtmin` / `__libc_current_sigrtmax` | musl macros exposed as functions |
+  | `__register_atfork` | redirect to `pthread_atfork` |
+  | `__cxa_thread_atexit_impl` | no-op |
+  | `__libc_single_threaded` | data symbol, value 0 |
+  | `secure_getenv` | redirect to `getenv` |
+  | `dlmopen` | redirect to `dlopen` (ignore Lmid_t) |
+  | `dlvsym` | redirect to `dlsym` (ignore version) |
+  | `__libc_dlopen_mode` / `__libc_dlsym` / `__libc_dlclose` | wrappers |
+
+  > **Critical: `libnvshim.so` must NOT export `exit` / `_exit` / `_Exit`.**
+  > See P6 — interposing those swallows ffmpeg's real exit status.
+
+**Maintenance note.** Each new NVIDIA driver release may reference one more
+glibc-internal symbol. Diagnostic recipe in §3 finds it in <5 minutes; fix
+is a one-line addition to `libnvshim.so`.
 
 ---
 
-## 13. Resuming work on another machine
-
-If you need to continue from a fresh checkout / device, here is the full
-sequence to rebuild and validate the CUDA image end-to-end.
-
-### Build
-
-> ⚠️ Use `--no-cache` if you previously built `:8.1-cuda` with the broken
-> link flags — Docker will otherwise reuse the cached ffmpeg layer that
-> contains the static `dlopen` stub. Full rebuild on a typical machine
-> takes ~45–75 min (most of it is libaom, libvmaf, x265, svt-av1, vvenc).
-
-```sh
-cd /path/to/static-ffmpeg
-
-docker build --no-cache \
-    --build-arg ENABLE_CUDA=1 \
-    --target final-cuda \
-    -t mwader/static-ffmpeg:8.1-cuda-v3 .
-```
-
-If you only changed something *after* the ffmpeg compile step (e.g. the
-`final-cuda` stage, env vars, ld-musl path), you can skip `--no-cache`:
-
-```sh
-docker build \
-    --build-arg ENABLE_CUDA=1 \
-    --target final-cuda \
-    -t mwader/static-ffmpeg:8.1-cuda-v3 .
-```
-
----
+### P5. NVENC encode succeeds but exits 139 (SIGSEGV) at process teardown
 
-## Investigation log: April 28 – May 2, 2026 (Alpine/musl + WSL2 NVIDIA stack)
+**Symptom.** Encode completes successfully (`frame= 60 ... muxing overhead`,
+output bytes fully written), then ffmpeg exits with 139.
 
-This section records every layer that had to be peeled back to get NVENC working
-on Alpine/musl with the NVIDIA Container Toolkit on a Windows + WSL2 host
-(host driver 596.21, CUDA 13.2, RTX 3060 Ti, ffnvcodec 13.0.19.0, ffmpeg 8.1).
+**Root cause.** libcuda's `__cxa_finalize` / `DT_FINI` destructors run during
+`avcodec_close → nvenc_free → cuCtxDestroy` while still inside `main()`.
+Those destructors call into glibc-internal state (TLS-destructor unwinding,
+pthread_atfork handlers) that musl + gcompat don't fully provide, and crash.
 
-### Environment
+Because the crash is inside `main()` (not after `exit()` is called), no
+in-process hook — atexit, `LD_PRELOAD` signal handlers, etc. — can suppress
+it cleanly. Attempts at in-process suppression all failed:
 
-- Host: Windows 11 + WSL2 (Ubuntu 22.04), Docker Desktop / engine.
-- GPU: NVIDIA RTX 3060 Ti, driver 596.21, CUDA 13.2 (per `nvidia-smi`).
-- Container base for `final-cuda`: `alpine:3.20.3` (musl 1.2.x).
-- Driver injection paths used by the toolkit on this host:
-  - `/usr/lib64/libcuda.so.1`         (179 KB WSL "loader stub")
-  - `/usr/lib64/libnvcuvid.so.1`      (23.8 MB, real)
-  - `/usr/lib64/libnvidia-encode.so.1`(266 KB stub)
-  - `/usr/lib64/libnvidia-ml.so.1`    (278 KB)
-  - `/usr/lib/wsl/drivers/nv_dispi.inf_amd64_<HASH>/libcuda.so.1.1` (24.1 MB, real backend)
-
-### Layer-by-layer findings
-
-#### 1. ffmpeg link conflict (fixed)
-
-Symptom: ffmpeg link in builder failed with all `--enable-*` flags on.
-Cause: `export LDFLAGS="-Wl,--no-as-needed -Wl,-Bdynamic -lc"` was set
-**unconditionally**, conflicting with the `-static-pie` configure patch used in
-the non-CUDA branch.
-Fix: gate the `LDFLAGS` export on `ENABLE_CUDA` only. Non-CUDA build returns to
-upstream static-pie behaviour.
-
-#### 2. NVIDIA Container Toolkit capabilities (fixed)
-
-Symptom: only 180 KB stub `libcuda.so.1` mounted; `libnvcuvid` / `libnvidia-encode`
-absent.
-Cause: `--gpus all` only exposes the *device*; library set is governed by
-`NVIDIA_DRIVER_CAPABILITIES`. Default is just `utility` → no compute/video libs.
-Fix: bake `ENV NVIDIA_DRIVER_CAPABILITIES=compute,video,utility` and
-`NVIDIA_VISIBLE_DEVICES=all` into the `final-cuda` stage image config.
-
-#### 3. musl dynamic-loader search path (fixed)
-
-Symptom: even with libs mounted, `dlopen("libcuda.so.1")` reported "Library not found".
-Cause: musl's default search path is `/lib:/usr/local/lib:/usr/lib`; toolkit
-mounts driver libs to `/usr/lib64` (RHEL/Fedora/WSL convention) which musl does
-not search.
-Fix: write `/etc/ld-musl-x86_64.path` listing `/lib`, `/usr/local/lib`, `/usr/lib`,
-`/usr/lib64`, `/usr/lib/x86_64-linux-gnu`, `/usr/lib/wsl/lib`.
-
-#### 4. glibc → musl ABI gap (fixed via gcompat + nvshim)
-
-Symptom: NVIDIA driver libs (compiled against glibc) reference glibc-internal
-symbols not present in musl/gcompat.
-Cause: gcompat provides `libc.so.6` / `libm.so.6` / `libpthread.so.0` /
-`librt.so.1` as musl wrappers, but is missing `libdl.so.2` (musl folds dlopen
-into libc) and a number of glibc-internal helpers used by recent NVIDIA drivers.
-
-Iterative discovery of missing symbols (each found by `dlopen` of the WSL
-backend library reporting "Error relocating: <sym>: symbol not found"):
-
-| Iteration | Newly-needed symbol | Shim strategy |
-|---|---|---|
-| 1 | `gnu_get_libc_version`           | return `"2.35"` |
-| 2 | `__register_atfork`              | redirect to `pthread_atfork` |
-| 3 | `dlmopen`                        | wrapper around `dlopen` (ignore Lmid_t) |
-| 4 | `dlvsym`                         | wrapper around `dlsym` (ignore version) |
-
-Final shim payload (`libnvshim.so`, `LD_PRELOAD`'d):
-
-- `gnu_get_libc_version` → `"2.35"`
-- `gnu_get_libc_release` → `"stable"`
-- `__libc_current_sigrtmin` / `__libc_current_sigrtmax` (musl macros exposed as functions)
-- `__register_atfork` → `pthread_atfork`
-- `__cxa_thread_atexit_impl` → no-op
-- `__libc_single_threaded` (data symbol, value 0)
-- `secure_getenv` → `getenv`
-- `dlmopen` → `dlopen` (ignore namespace)
-- `dlvsym` → `dlsym` (ignore version)
-- `__libc_dlopen_mode` / `__libc_dlsym` / `__libc_dlclose`
-
-After this set, the **standalone** dlopen test passes on every layer:
-
-- `dlopen("libcuda.so.1", RTLD_LAZY)` → OK (loads /usr/lib64 stub).
-- `dlopen("/usr/lib/wsl/drivers/.../libcuda.so.1.1", RTLD_NOW)` → OK (real backend).
-- `dlopen("libnvcuvid.so.1", RTLD_NOW)` → OK.
-- `dlopen("libnvidia-encode.so.1", RTLD_NOW)` → OK.
-- `dlopen("libnvidia-ml.so.1", RTLD_NOW)` → OK.
-- `dlsym(cuInit / cuDriverGetVersion / cuDeviceGet / cuCtxCreate_v2 / cuCtxDestroy_v2 / cuMemAlloc_v2)` → all non-NULL.
-- `cuInit(0)` → returns `CUDA_SUCCESS` (0).
-- `cuDriverGetVersion(&v)` → returns 0 with v = 13020 (CUDA 13.2).
-
-`nvidia-smi` inside the container prints full GPU info.
-
-### 5. Resolved: ffmpeg's `nvenc_load_libraries` reporting "Cannot load libcuda.so.1"
-
-**Root cause** (the same musl static `libc.a` `dlopen` stub described in §6,
-but a worse variant of it): even with the `-Wl,--no-as-needed,-Bdynamic,-lc`
-reorder, gcc's `--toolchain=hardened` spec file emitted late references that
-re-pulled `libc.a`, restoring the 25-byte `dlopen` stub inside the binary.
-`readelf -s --dyn-syms /ffmpeg | grep dlopen` then showed:
-
-```
-21987: 000000000338c50e   25 FUNC WEAK DEFAULT 14 dlopen
-```
-
-— `dlopen` defined inside `.text` of the binary itself, returning NULL with
-`ENOSYS` without ever issuing an `openat` syscall. Hence `strace` showed no
-filesystem activity for `libcuda*`.
-
-**Fix**: link the musl combined loader/libc by **absolute path** rather than
-via `-lc`. Absolute filenames bypass `-Bstatic`/`-Bdynamic` mode altogether and
-cannot be re-resolved against `libc.a`:
-
-```sh
-# in --extra-ldflags:
--Wl,--no-as-needed,/lib/ld-musl-x86_64.so.1,--as-needed
-```
+| Attempt | Result |
+|---|---|
+| `nvshim` `exit()` interpose + atexit `_exit()` | SIGSEGV happens *before* `main()` returns; atexit never runs |
+| In-process signal handler | Same — crash is in destructor before signal can dispatch |
 
-After this change, `dlopen`/`dlsym`/`dlerror`/`dlclose` resolve as `UND`
-(or are bound internally to the absolute-path libc — both outcomes work at
-runtime) and h264_nvenc encodes successfully.
-
-### 5b. Resolved: SIGSEGV at process teardown (exit 139)
-
-**Symptom**: encode completes successfully (`frame=  60 ... muxing overhead`
-visible, output bytes fully written), then ffmpeg exits with 139 (SIGSEGV).
-Reproduced with and without `LD_PRELOAD=libnvshim.so`, so nvshim is not the
-trigger.
-
-**Root cause**: libcuda's `__cxa_finalize` / DT_FINI destructors run during
-ffmpeg's `avcodec_close → nvenc_free → cuCtxDestroy` while still inside
-`main()`. Those destructors call into glibc-internal state that musl + gcompat
-don't fully provide (notably TLS-destructor unwinding, and pthread_atfork
-handlers registered by the driver), and crash. Because the crash is *inside*
-`main()` (not after `exit()` is called), there is no in-process hook — atexit
-handlers, signal handlers installed by `LD_PRELOAD`, etc. — that can suppress
-it cleanly without risk of papering over real bugs.
-
-**Fix**: a 12-line bash entrypoint wrapper that runs `/ffmpeg`, captures its
-exit code via `${PIPESTATUS[0]}`, tees stderr to a temp file for inspection,
-preserves stdout byte-exact via fd-3 trick, and converts exit 139 → 0 *only*
-when stderr contains no recognised ffmpeg error keyword (`error`, `cannot
-load`, `not found`, `invalid`, `failed`, `conversion failed`, `no such`).
-Real failures (mid-encode CUDA OOM, init failures, bad codec, etc.) propagate
-unchanged because they always print an identifiable error first.
+**Fix (Layer 6).** Out-of-process bash entrypoint wrapper that captures the
+real exit code via `${PIPESTATUS[0]}` and downgrades **only** `139 → 0`,
+gated on stderr containing no recognized error keyword. Real failures
+(mid-encode CUDA OOM, init failures, etc.) propagate unchanged because they
+always print an identifiable error first.
 
 ```bash
 #!/bin/bash
 errfile=$(mktemp)
-trap "rm -f \"$errfile\"" EXIT
+shellerr=$(mktemp)
+trap "rm -f \"$errfile\" \"$shellerr\"" EXIT
 exec 3>&1
-{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&2
+exec 4>&2
+exec 2>"$shellerr"
+{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&4
 rc=${PIPESTATUS[0]}
 exec 3>&-
+exec 2>&4 4>&-
+# Filter the bash job-control "Segmentation fault (core dumped)" line.
+grep -vE "Segmentation fault.*core dumped.*/ffmpeg" "$shellerr" >&2 || true
+# Suppress *only* the known-benign teardown SIGSEGV.
 if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then
     exit 0
 fi
 exit "$rc"
 ```
 
-ffprobe doesn't need a wrapper: it doesn't invoke encoders and rarely auto-loads
-CUDA, so it doesn't reach the crashing destructor path.
+ffprobe doesn't need the wrapper — it doesn't open NVENC encoders, so the
+crashing destructor path isn't reached.
 
-### 5c. Resolved: ffmpeg silently exits 0 on every error path
+---
 
-**Symptom**: every fatal-error invocation of the CUDA build returned exit code
-`0` to the shell, despite ffmpeg printing the correct error messages on stderr.
+### P6. ffmpeg silently exits 0 on every error path
+
+**Symptom.** Every fatal-error invocation of the CUDA build returned exit
+code `0` to the shell, despite ffmpeg printing the correct error messages.
 Verified against the non-CUDA `:8.1` baseline:
 
-| Scenario                               | non-CUDA `:8.1` | CUDA (broken) | CUDA (fixed) |
-|----------------------------------------|-----------------|---------------|--------------|
-| `-c:v this_codec_does_not_exist`       | `8`             | `0` ❌        | `8` ✅       |
-| `-i /no/such/file.mp4`                 | `254`           | `0` ❌        | `254` ✅     |
-| `-vf this_filter_does_not_exist`       | `8`             | `0` ❌        | `8` ✅       |
-| Successful encode                      | `0`             | `0` ✅        | `0` ✅       |
-| Successful encode (post-teardown SEGV) | n/a             | `139` (raw)   | `0` (wrapped) |
-
-This was masked at first because the wrapper grew an "upgrade exit 0 → 1 when
-stderr matches a fatal-error keyword" branch. That made T3 pass with a
-plausible-looking exit `1`, but it was a workaround, not a fix — and the wrong
+| Scenario | non-CUDA `:8.1` | CUDA (broken) | CUDA (fixed) |
+|---|---|---|---|
+| `-c:v this_codec_does_not_exist` | `8` | `0` ❌ | `8` ✅ |
+| `-i /no/such/file.mp4` | `254` | `0` ❌ | `254` ✅ |
+| `-vf this_filter_does_not_exist` | `8` | `0` ❌ | `8` ✅ |
+| Successful encode | `0` | `0` ✅ | `0` ✅ |
+| Successful encode (post-teardown SEGV) | n/a | `139` (raw) | `0` (wrapped) |
+
+This was masked at first by an "upgrade exit 0 → 1 when stderr matches a
+fatal-error keyword" branch in the wrapper. That made tests pass with a
+plausible-looking exit `1`, but it was a workaround, not a fix — the wrong
 exit code (`1` instead of `8`/`254`) broke any caller that switched on the
 specific code.
 
-**Root-cause discovery**: an `LD_PRELOAD` `dladdr` tracer interposing `_exit`
+**Root-cause discovery.** An `LD_PRELOAD` `dladdr` tracer interposing `_exit`
 revealed that on every code path — bad-codec, bad-input, even successful
 `-version` — the call to `_exit` came from `libnvshim.so`:
-
 ```
 [exittrace] _exit(0) ra=0x...  dso=/usr/local/lib/libnvshim.so
 ```
 
 `libnvshim.so` had been given an `_exit` interposer (and at one point an
-`exit` interposer too) as part of the earlier-but-abandoned attempt to suppress
-the teardown SIGSEGV from inside the process. The interposer always invoked
-`syscall(SYS_exit_group, 0)` — i.e. it dropped ffmpeg's real exit status on
-the floor, hard-coding `0`. None of the standard ELF / readelf / `nm` checks
-flag this: the interposer is in a separately-loaded DSO, not in `/ffmpeg`, and
-musl's PLT happily binds `_exit` to whichever DSO comes first in symbol search
-order — `LD_PRELOAD` always wins.
-
-**Fix**: drop the `_exit` (and `exit`) overrides from `libnvshim.so` entirely.
-They were never needed for any glibc→musl ABI gap (those are all the symbol
-list documented in §4 — `gnu_get_libc_version`, `__register_atfork`,
-`dlmopen`, `dlvsym`, etc.). Process-lifecycle suppression belongs in the
-out-of-process bash wrapper (§5b), where it can read the real exit status via
-`${PIPESTATUS[0]}` and pattern-match on the actual error keywords.
+`exit` interposer too) as part of the abandoned in-process attempt to
+suppress the teardown SIGSEGV (P5). The interposer always invoked
+`syscall(SYS_exit_group, 0)` — i.e. it dropped ffmpeg's real exit status
+and hard-coded `0`. None of the standard ELF / readelf / `nm` checks flag
+this: the interposer is in a separately-loaded DSO, not in `/ffmpeg`, and
+musl's PLT happily binds `_exit` to whichever DSO comes first in symbol
+search order — `LD_PRELOAD` always wins.
+
+**Fix.** Drop the `_exit` (and `exit`) overrides from `libnvshim.so`
+entirely. They were never needed for any glibc→musl ABI gap (those are all
+the symbols in P4). Process-lifecycle suppression belongs in the
+out-of-process bash wrapper (P5), where it can read the real exit status via
+`${PIPESTATUS[0]}` and pattern-match on actual error keywords.
 
 After removing the interposers, all standard ffmpeg exit codes match the
-non-CUDA build byte-for-byte, and the wrapper script collapses back to its
-minimal form:
+non-CUDA build byte-for-byte.
 
-```bash
-#!/bin/bash
-errfile=$(mktemp)
-shellerr=$(mktemp)
-trap "rm -f \"$errfile\" \"$shellerr\"" EXIT
-exec 3>&1
-exec 4>&2
-exec 2>"$shellerr"
-{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&4
-rc=${PIPESTATUS[0]}
-exec 3>&-
-exec 2>&4 4>&-
-grep -vE "Segmentation fault.*core dumped.*/ffmpeg" "$shellerr" >&2 || true
-# Suppress *only* the known-benign teardown SIGSEGV from libcuda dtors.
-# Real failure exit codes (1, 8, 254, ...) propagate unchanged.
-if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then
-    exit 0
-fi
-exit "$rc"
-```
+**Lesson (now baked into Layer 5).** `LD_PRELOAD` shims should be the
+*minimum* symbol set that closes the glibc→musl ABI gap. Any
+process-lifecycle hook (exit, signal, atexit) added to such a shim will
+silently apply to *every* call from the host program, not just the one
+CUDA-driver call you were trying to fix. **Keep lifecycle policy
+out-of-process.**
 
-**Lesson**: `LD_PRELOAD` shims should be the *minimum* symbol set that closes
-the glibc→musl ABI gap. Any process-lifecycle hook (exit, signal, atexit) added
-to such a shim will silently apply to *every* call from the host program, not
-just the one CUDA-driver call you were trying to fix. Keep lifecycle policy
-out-of-process.
+---
 
-**Diagnostic recipe** (reuse this for any future "wrong exit code" regression):
+### P7. Other small issues encountered (one-line each)
 
-```sh
-docker run --rm --gpus all --entrypoint sh "$IMG" -c '
-  apk add --no-cache gcc musl-dev binutils >/dev/null
-  cat > /tmp/t.c <<EOF
-#define _GNU_SOURCE
-#include <dlfcn.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/syscall.h>
-__attribute__((noreturn)) void _exit(int s){
-  void *ra=__builtin_return_address(0); Dl_info i={0}; dladdr(ra,&i);
-  dprintf(2,"[trace] _exit(%d) dso=%s\n",s,i.dli_fname?i.dli_fname:"?");
-  syscall(SYS_exit_group,s); __builtin_unreachable();
-}
-EOF
-  gcc -O0 -fPIC -shared -o /tmp/t.so /tmp/t.c -ldl
-  LD_PRELOAD="/tmp/t.so:${LD_PRELOAD}" /ffmpeg -hide_banner -loglevel error \
-    -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \
-    -c:v this_codec_does_not_exist -f null -
-'
-# The traced _exit must show dso=/lib/ld-musl-x86_64.so.1 (i.e. real libc),
-# NOT dso=/usr/local/lib/libnvshim.so. If it shows nvshim, the interposer
-# regression is back.
-```
+| # | Issue | Fix |
+|---|---|---|
+| 1 | `nv-codec-headers` checksum mismatch | Recompute SHA256 against actual GitHub release tarball |
+| 2 | ffmpeg link failed because `LDFLAGS` was set unconditionally and conflicted with `-static-pie` in non-CUDA branch | Gate the `LDFLAGS` export on `ENABLE_CUDA` only |
+| 3 | Spurious dynamic deps (`libgomp`, `libdrm`, …) | Pre-link with `-Wl,-Bstatic` + `-static-libgcc -static-libstdc++` |
+| 4 | Toolkit only mounted 180 KB stub `libcuda.so.1` (no `libnvcuvid` / `libnvidia-encode`) | Bake `ENV NVIDIA_DRIVER_CAPABILITIES=compute,video,utility` into image |
+| 5 | WSL2 + nvidia-container-toolkit 1.19 SIGSEGV during prestart hook | Host-side regression unrelated to image; `wsl --shutdown` + restart |
+
+---
 
-### Diagnostic playbook (for future re-entry)
+## 3. Diagnostics
 
-Quick all-in-one container probe used during this investigation:
+### 3a. Quick image probe (link state, env, driver libs, dlopen, encode)
 
 ```sh
-IMG=mwader/static-ffmpeg:8.1-cuda-debian-v43
+IMG=mwader/static-ffmpeg:8.1-cuda
 docker run --rm --gpus all --entrypoint sh "$IMG" -c '
   apk add --no-cache gcc musl-dev binutils strace >/dev/null
 
-  # 1. Confirm env + linkage
-  echo "LD_PRELOAD=$LD_PRELOAD"
+  echo "=== 1. Linkage ==="
   ldd /ffmpeg
+  readelf -d /ffmpeg | grep -E "NEEDED|BIND_NOW"
 
-  # 2. Confirm path file
+  echo "=== 2. musl loader path ==="
   cat /etc/ld-musl-x86_64.path
 
-  # 3. Confirm driver libs are mounted
+  echo "=== 3. Driver libs mounted ==="
   ls -lh /usr/lib64/libcuda.so.1 /usr/lib64/libnv*.so.1 \
          /usr/lib/wsl/drivers/nv_dispi.inf_amd64_*/libcuda.so.1.1 2>/dev/null
 
-  # 4. Standalone dlopen + cuInit smoke test
+  echo "=== 4. Standalone dlopen + cuInit ==="
   cat > /t.c <<EOF
 #include <dlfcn.h>
 #include <stdio.h>
@@ -647,167 +361,93 @@ int main(void){
 EOF
   gcc /t.c -o /t && /t
 
-  # 5. Trace what ffmpeg actually does when invoking h264_nvenc
-  strace -e trace=openat,access -f -o /tmp/ff.strace /ffmpeg -hide_banner -loglevel error \
-    -f lavfi -i testsrc=size=320x240:rate=30 -t 1 -c:v h264_nvenc -f null - 2>&1 | tail -3
-  echo "--- cuda/nvidia syscalls in strace ---"
+  echo "=== 5. ffmpeg openat trace for h264_nvenc ==="
+  strace -e trace=openat,access -f -o /tmp/ff.strace /ffmpeg \
+      -hide_banner -loglevel error \
+      -f lavfi -i testsrc=size=320x240:rate=30 -t 1 \
+      -c:v h264_nvenc -f null - 2>&1 | tail -3
   grep -E "cuda|nvidia|nvcuvid|libnv|/dev/dxg|/dev/nvidia" /tmp/ff.strace | head -40
 '
 ```
 
-### What works today (final state — May 3, 2026)
-
-- ✅ Build succeeds with all 51 `--enable-lib*` codecs + `--enable-ffnvcodec
-  --enable-cuvid --enable-nvenc --enable-nvdec` on Alpine + musl.
-- ✅ Image runs `ffmpeg -version`, `-buildconf`, hwaccels/encoders/decoders
-  enumeration showing cuda, nvenc, cuvid.
-- ✅ All non-CUDA codec tests pass (libsvtav1, libvvenc, libx265, libass,
-  librsvg, TLS, DNS).
-- ✅ All NVIDIA driver libs `dlopen` cleanly inside the container.
-- ✅ Standalone musl program in same container completes `cuInit(0)`
-  successfully and reads driver version 13020.
-- ✅ **`h264_nvenc` encode produces frames** (`frame= 60 ... speed=2.8x` etc.)
-  and the wrapped entrypoint exits 0.
-- ✅ MP4-to-stdout (`-f mp4 -movflags frag_keyframe+empty_moov -`) emits
-  byte-exact output (verified vs raw `--entrypoint /ffmpeg` invocation).
-- ✅ Real ffmpeg errors (bad codec, bad input, etc.) propagate unchanged
-  through the wrapper.
-- ✅ ffprobe runs unwrapped and stable for all standard probe operations.
-
-### Things tried that did NOT (alone) resolve the issue (kept for posterity)
-
-| Attempt | Result |
-|---|---|
-| `--gpus all` only (no caps) | Only stub libcuda mounted, no NVENC libs |
-| `LD_LIBRARY_PATH=/usr/lib64` only | `dlopen` finds file but glibc symbols missing |
-| Symlink `libdl.so.2 → libgcompat.so.0` only | dlopen of stub OK, real backend FAIL on `gnu_get_libc_version` |
-| nvshim with `gnu_get_libc_version` only | Next missing: `__register_atfork` |
-| Add `__register_atfork` + `secure_getenv` + `__cxa_thread_atexit_impl` | Next missing: `dlmopen` |
-| Add `dlmopen` + `__libc_dlopen_mode/dlsym/dlclose` | Next missing: `dlvsym` |
-| Add `dlvsym` | All driver libs dlopen cleanly + standalone `cuInit` succeeds |
-| `-Wl,--no-as-needed,-Bdynamic,-lc,--as-needed,-Bstatic` in extra-ldflags | Still pulled `libc.a` `dlopen` stub via gcc-hardened spec file |
-| Hide `/usr/lib/libc.a` during link | libgme.a configure-time symbol checks failed (gz*/inflate*) |
-| Absolute-path `-Wl,/lib/ld-musl-x86_64.so.1` in extra-ldflags | ✅ NVENC encode finally succeeds |
-| nvshim `exit()` interpose + atexit `_exit()` | SIGSEGV happens *before* main() returns, so atexit never runs — ineffective. **Worse**: leaving the `_exit` interposer in the shim silently swallowed *every* ffmpeg exit code (always returned 0). See §5c. |
-| Entrypoint wrapper translating exit 139 → 0 with error-keyword guard | ✅ Final fix; clean exit 0 with stdout/stderr passthrough preserved, real exit codes (8/254/…) propagate unchanged |
-
-### Decision branch (resolved — stayed on Alpine)
-
-The escape hatch of switching `final-cuda` to `debian:bookworm-slim` was
-**not needed**. The Alpine + musl + gcompat + nvshim stack works end-to-end
-once the link-time absolute-path fix and the entrypoint wrapper are in place.
-
-The Alpine variant remains preferable because:
-
-1. The image is ~4x smaller than the Debian equivalent would be.
-2. Existing CI/build infrastructure for `mwader/static-ffmpeg` is Alpine-based;
-   no parallel `builder-glibc` stage needs to be maintained.
-3. The static archive produced for non-libc deps is identical between the
-   default and CUDA variants — only the link step differs.
-
-The only ongoing maintenance cost is **nvshim symbol drift**: each new NVIDIA
-driver release may reference an additional glibc-internal symbol that
-gcompat doesn't ship, requiring a one-line addition to `libnvshim.so`. The
-diagnostic playbook (next section) documents how to detect and fix this in
-under five minutes.
+### 3b. "Wrong exit code" regression check (guards against P6)
 
----
-
-## 14. Final architecture (the six-layer stack)
-
-The working CUDA variant is the composition of six independently-essential layers.
-Removing any one breaks NVENC end-to-end. They are listed in the order they take effect:
-
-| # | Layer | Where | Purpose |
-|---|---|---|---|
-| 1 | **Absolute-path libc link** | builder, ffmpeg `--extra-ldflags` | Forces `dlopen`/`dlsym`/`dlerror`/`dlclose` to resolve dynamically against the real musl libc instead of `libc.a`'s NULL-returning stub. Without this the binary appears to build fine but `dlopen()` of `libcuda.so.1` returns NULL with no syscall. |
-| 2 | **Dynamic-PIE link mode** | builder, ffmpeg link | Replaces `-fPIE -static-pie` with `-fPIE -pie`. A static-pie binary has no dynamic loader, making `dlopen` impossible by definition. |
-| 3 | **`/etc/ld-musl-x86_64.path`** | final-cuda stage | Adds `/usr/lib64`, `/usr/lib/x86_64-linux-gnu`, `/usr/lib/wsl/lib` to musl's loader search path. The NVIDIA Container Toolkit injects driver libs into one of these depending on host distro; musl's default `/lib:/usr/local/lib:/usr/lib` finds none of them. |
-| 4 | **`gcompat` package + `libdl.so.2` symlink** | final-cuda stage | Provides `libc.so.6` / `libm.so.6` / `libpthread.so.0` / `librt.so.1` as musl wrappers (the driver's `DT_NEEDED` entries). The symlink points the driver's `libdl.so.2` reference at `libgcompat.so.0` since musl folds dlopen into libc and ships no separate `libdl`. |
-| 5 | **`libnvshim.so` LD_PRELOAD** | final-cuda stage | Exports glibc-internal symbols the driver references but gcompat doesn't ship: `gnu_get_libc_version`, `__register_atfork`, `__cxa_thread_atexit_impl`, `secure_getenv`, `dlmopen`, `dlvsym`, `__libc_dlopen_mode/dlsym/dlclose`, `__libc_current_sigrtmin/max`, `__libc_single_threaded`, `gnu_get_libc_release`. Without the shim, dlopen of the WSL2 backend `libcuda.so.1.1` fails with `symbol not found` errors. **Must NOT export `exit`/`_exit`/`_Exit`** — see §5c; interposing those swallows ffmpeg's real exit status. |
-| 6 | **Entrypoint wrapper** | final-cuda stage | Bash script that exec's `/ffmpeg`, captures exit code via `${PIPESTATUS[0]}`, preserves stdout byte-exact via fd-3 trick, tees stderr to a temp file, and downgrades exit 139 → 0 *only* when stderr contains no recognised error keyword. Suppresses the cosmetic libcuda-destructor SIGSEGV that fires after the encode is fully complete. |
-
-Layers 1–2 belong to the **builder stage** (link-time concerns).
-Layers 3–6 belong to the **`final-cuda` runtime stage** (loader, ABI, lifecycle concerns).
+```sh
+docker run --rm --gpus all --entrypoint sh "$IMG" -c '
+  apk add --no-cache gcc musl-dev >/dev/null
+  cat > /tmp/t.c <<EOF
+#define _GNU_SOURCE
+#include <dlfcn.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+__attribute__((noreturn)) void _exit(int s){
+  void *ra=__builtin_return_address(0); Dl_info i={0}; dladdr(ra,&i);
+  dprintf(2,"[trace] _exit(%d) dso=%s\n",s,i.dli_fname?i.dli_fname:"?");
+  syscall(SYS_exit_group,s); __builtin_unreachable();
+}
+EOF
+  gcc -O0 -fPIC -shared -o /tmp/t.so /tmp/t.c -ldl
+  LD_PRELOAD="/tmp/t.so:${LD_PRELOAD}" /ffmpeg -hide_banner -loglevel error \
+    -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \
+    -c:v this_codec_does_not_exist -f null -
+'
+# The traced _exit MUST show dso=/lib/ld-musl-x86_64.so.1 (i.e. real libc).
+# If it shows dso=/usr/local/lib/libnvshim.so → P6 regression is back.
+```
 
-### Diagram of the runtime call chain
+### 3c. dlopen-stub regression check (guards against P1)
 
-```
-docker run --gpus all  ⇒  toolkit injects libcuda.so.1 → /usr/lib64
-                          + sets NVIDIA_DRIVER_CAPABILITIES from image ENV
-       │
-       ▼
-ffmpeg-cuda-entrypoint (bash)               ← layer 6
-       │ exec
-       ▼
-/ffmpeg  (musl dynamic-PIE, libc-only NEEDED)
-       │ ld.so loads libc.musl-x86_64.so.1
-       │   (search path includes /usr/lib64 from /etc/ld-musl-x86_64.path)   ← layer 3
-       │ LD_PRELOAD → /usr/local/lib/libnvshim.so                            ← layer 5
-       ▼
-ffnvcodec dynlink_loader.h:
-       dlopen("libcuda.so.1", RTLD_LAZY)    ← needs layer 1 (real PLT entry)
-       │
-       ▼ ld.so loads libcuda.so.1 (WSL stub)
-       │   resolves DT_NEEDED libdl.so.2 → libgcompat.so.0                   ← layer 4
-       │
-       ▼ libcuda dlopens its WSL backend libcuda.so.1.1
-       │   resolves glibc-internals via libnvshim.so                         ← layer 5
-       │
-       ▼ encode runs successfully, frames produced, output flushed
-       │
-       ▼ ffmpeg main() → avcodec_close → cuCtxDestroy
-       │   libcuda __cxa_finalize crashes during teardown          ☠ SIGSEGV
-       │
-       ▼ wrapper sees exit=139, no error keyword in stderr → exit 0         ← layer 6
+```sh
+docker run --gpus all --rm --entrypoint sh "$IMG" -c '
+  apk add --no-cache binutils >/dev/null 2>&1
+  readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror|dlclose"
+'
+# Each must be 0-size UND (or not exported at all). A non-zero size in .text
+# (e.g. " 25 FUNC ... 14 dlopen") means the static stub bug is back.
 ```
 
 ---
 
-## 15. ffprobe note
-
-`ffprobe` shares the same link-time and runtime-loader configuration as `ffmpeg`
-(layers 1–5 above), but does **not** need the entrypoint wrapper because:
+## 4. Build & verify
 
-- It doesn't open NVENC encoders, so `nvenc_free → cuCtxDestroy` is never invoked.
-- Its `-hwaccel` option is silently ignored (it's an `ffmpeg`-only flag).
-- It doesn't auto-initialize CUDA for normal probe/show operations.
-
-Tested invocations that all return exit 0 cleanly without the wrapper:
+### Build
 
 ```sh
-docker run --rm --gpus all --entrypoint /ffprobe IMG -version
-docker run --rm --gpus all --entrypoint /ffprobe IMG \
-    -f lavfi -i testsrc=duration=1:size=320x240:rate=30 -show_streams -of json
-docker run --rm --gpus all --entrypoint /ffprobe IMG -i some_h264.mp4
-```
+cd /path/to/static-ffmpeg
 
-If a future ffmpeg/driver combination ever makes `ffprobe` reach the crashing
-destructor path, the same wrapper script can be installed with the binary path
-parametrised. Not worth the extra layer today.
+docker build --no-cache \
+    --build-arg ENABLE_CUDA=1 \
+    --target final-cuda \
+    -t mwader/static-ffmpeg:8.1-cuda .
+```
 
----
+> Use `--no-cache` if you previously built `:8.1-cuda` with broken link
+> flags — Docker will otherwise reuse the cached ffmpeg layer that contains
+> the static `dlopen` stub. Full rebuild ~45–75 min (libaom, libvmaf, x265,
+> svt-av1, vvenc dominate).
 
-## 16. Final verification recipe (May 3, 2026)
+If you only changed the `final-cuda` stage (env, ld-musl path, wrapper),
+`--no-cache` is unnecessary.
 
-Replace `IMG` with your actual tag.
+### Final verification recipe (all five must pass)
 
 ```sh
-IMG=mwader/static-ffmpeg:8.1-cuda-debian-v47   # or :8.1-cuda after retag
+IMG=mwader/static-ffmpeg:8.1-cuda
 
-# 1. Static-ness check (binary should have exactly one NEEDED entry: musl libc)
+# 1. Static-ness check (exactly one NEEDED entry: musl libc)
 docker run --rm --entrypoint sh "$IMG" -c '
   apk add --no-cache binutils >/dev/null 2>&1
   readelf -d /ffmpeg | grep -E "NEEDED|BIND_NOW"
 '
 
-# 2. NVENC encode end-to-end (the real test)
+# 2. NVENC encode end-to-end
 docker run --rm --gpus all "$IMG" \
     -hide_banner -loglevel error \
     -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \
     -c:v h264_nvenc -f null - ; echo "exit=$? (must be 0)"
 
-# 3. MP4-to-stdout byte-exactness (wrapper passthrough check)
+# 3. MP4-to-stdout byte-exactness (wrapper passthrough)
 docker run --rm --gpus all "$IMG" \
     -hide_banner -loglevel error \
     -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \
@@ -818,7 +458,7 @@ docker run --rm --gpus all "$IMG" \
 docker run --rm --gpus all --entrypoint /ffprobe "$IMG" -version >/dev/null
 echo "exit=$? (must be 0)"
 
-# 5. Exit-code parity vs non-CUDA :8.1 (regression guard for §5c)
+# 5. Exit-code parity vs non-CUDA :8.1 (regression guard for P6)
 docker run --rm --gpus all "$IMG" -hide_banner -loglevel error \
     -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \
     -c:v this_codec_does_not_exist -f null - ; echo "exit=$? (must be 8)"
@@ -826,4 +466,84 @@ docker run --rm --gpus all "$IMG" -hide_banner -loglevel error \
     -i /no/such/file.mp4 -f null - ; echo "exit=$? (must be 254)"
 ```
 
-All four must succeed for the image to be considered shippable.
+---
+
+## 5. Runtime requirements
+
+### Host
+- NVIDIA driver installed.
+- [NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) installed and configured for Docker.
+- Run with `--gpus all` (or `--runtime=nvidia` + `NVIDIA_VISIBLE_DEVICES`).
+
+### Image-side env (set by Dockerfile)
+- `NVIDIA_VISIBLE_DEVICES=all`
+- `NVIDIA_DRIVER_CAPABILITIES=compute,utility,video`
+  - `compute` → `libcuda.so.1`
+  - `video` → `libnvcuvid.so`, `libnvidia-encode.so`
+  - Dropping `video` makes `nvidia-smi` work but breaks `h264_nvenc` with `Cannot load libcuda.so.1`.
+
+### Toolkit driver-injection layouts covered by `/etc/ld-musl-x86_64.path`
+- Debian/Ubuntu hosts → `/usr/lib/x86_64-linux-gnu`
+- RHEL/Fedora hosts   → `/usr/lib64`
+- WSL2                → `/usr/lib/wsl/lib`
+
+---
+
+## 6. Runtime call chain (six layers in action)
+
+```
+docker run --gpus all  ⇒  toolkit injects libcuda.so.1 → /usr/lib64
+                          + sets NVIDIA_DRIVER_CAPABILITIES from image ENV
+       │
+       ▼
+ffmpeg-cuda-entrypoint (bash)               ← Layer 6 (P5)
+       │ exec
+       ▼
+/ffmpeg  (musl dynamic-PIE, libc-only NEEDED)               ← Layer 2 (P1)
+       │ ld.so loads libc.musl-x86_64.so.1
+       │   (search path includes /usr/lib64 from /etc/ld-musl-x86_64.path)   ← Layer 3 (P3)
+       │ LD_PRELOAD → /usr/local/lib/libnvshim.so                            ← Layer 5 (P4)
+       ▼
+ffnvcodec dynlink_loader.h:
+       dlopen("libcuda.so.1", RTLD_LAZY)    ← needs Layer 1 (real PLT entry, P1)
+       │
+       ▼ ld.so loads libcuda.so.1 (WSL stub)
+       │   resolves DT_NEEDED libdl.so.2 → libgcompat.so.0                   ← Layer 4 (P4)
+       │
+       ▼ libcuda dlopens its WSL backend libcuda.so.1.1
+       │   resolves glibc-internals via libnvshim.so                         ← Layer 5 (P4)
+       │
+       ▼ encode runs successfully, frames produced, output flushed
+       │
+       ▼ ffmpeg main() → avcodec_close → cuCtxDestroy
+       │   libcuda __cxa_finalize crashes during teardown          ☠ SIGSEGV (P5)
+       │
+       ▼ wrapper sees exit=139, no error keyword in stderr → exit 0         ← Layer 6 (P5)
+```
+
+---
+
+## 7. Comparison with other static ffmpeg + nvenc projects
+
+| Project | Static? | NVENC? | Approach |
+|---|---|---|---|
+| `mwader/static-ffmpeg:8.1` | ✅ static-pie musl | ❌ | Pure static, no dlopen |
+| `mwader/static-ffmpeg:8.1-cuda` | ⚠️ musl dynamic-PIE (libc only) | ✅ | Hybrid — only libc dynamic; `dlopen()` works |
+| BtbN/FFmpeg-Builds (LGPL/GPL) | ⚠️ glibc dynamic + runtime ldconfig | ✅ | Tarball, glibc-linked |
+| HiWay-Media/ffmpeg-nvenc-static | ⚠️ glibc dynamic | ✅ | Bundled libs |
+| markus-perl/ffmpeg-build-script | ⚠️ glibc dynamic | optional | Script, not container |
+
+Of these, only `:8.1-cuda` keeps every codec/lib statically linked — every
+other "static + nvenc" build is glibc-dynamic. The trade-off vs the default
+`:8.1` is exactly one libc.so dependency.
+
+---
+
+## 8. CI / publishing notes
+
+- Default tag: built for `linux/amd64,linux/arm64` as before.
+- CUDA tag: built for `linux/amd64` only.
+  - Pushed as `<tag>-cuda` (and re-tagged manifest-style as `<tag>-cuda-amd64` for clarity).
+  - `latest-cuda` follows latest stable.
+- Use `--target final-cuda` and `--build-arg ENABLE_CUDA=1` in the CI matrix.
+

From e3f8bdb60171be53e0a43a1998c365389b76ad36 Mon Sep 17 00:00:00 2001
From: ToshY <31921460+ToshY@users.noreply.github.com>
Date: Sun, 3 May 2026 21:21:45 +0200
Subject: [PATCH 7/8] update ci to run cuda after amd64 build to use cache

---
 .github/workflows/multiarch.yml | 114 ++++++++++++++++++++------------
 1 file changed, 73 insertions(+), 41 deletions(-)

diff --git a/.github/workflows/multiarch.yml b/.github/workflows/multiarch.yml
index 02e435f..26f15a7 100644
--- a/.github/workflows/multiarch.yml
+++ b/.github/workflows/multiarch.yml
@@ -12,51 +12,82 @@ env:
   REGISTRY_IMAGE: mwader/static-ffmpeg
 
 jobs:
-  build:
-    name: Build image (${{ matrix.variant }} / ${{ matrix.tag }})
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          # default fully-static build, multi-arch
-          - runs_on: ubicloud-standard-8-arm
-            tag: arm64
-            variant: default
-            target: ""
-            build_args: ""
-          - runs_on: ubuntu-latest
-            tag: amd64
-            variant: default
-            target: ""
-            build_args: ""
-          # CUDA variant (NVENC/NVDEC/CUVID), amd64 only.
-          # If/when ffnvcodec is regularly tested on Jetson/arm64, add an arm64 entry.
-          - runs_on: ubuntu-latest
-            tag: amd64
-            variant: cuda
-            target: final-cuda
-            build_args: ENABLE_CUDA=1
+  # arm64 default — independent, runs in parallel with amd64.
+  build-default-arm64:
+    name: Build image (default / arm64)
+    runs-on: ubicloud-standard-8-arm
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Docker build
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          tags: image:default-arm64
+          load: true
+          cache-from: type=gha,scope=builder-arm64
+          cache-to: type=gha,scope=builder-arm64,mode=max
+      - name: Docker save
+        run: docker image save --output image-default-arm64.tar image:default-arm64
+      - uses: actions/upload-artifact@v4
+        with:
+          name: image-default-arm64
+          path: image-default-arm64.tar
+          retention-days: 1
 
-    runs-on: ${{ matrix.runs_on }}
+  # amd64 default — populates the shared builder-amd64 cache scope.
+  build-default-amd64:
+    name: Build image (default / amd64)
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
       - name: Docker build
-        run: |
-          docker build \
-            ${{ matrix.target && format('--target {0}', matrix.target) || '' }} \
-            ${{ matrix.build_args && format('--build-arg {0}', matrix.build_args) || '' }} \
-            --tag image:${{ matrix.variant }}-${{ matrix.tag }} \
-            .
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          tags: image:default-amd64
+          load: true
+          cache-from: type=gha,scope=builder-amd64
+          cache-to: type=gha,scope=builder-amd64,mode=max
       - name: Docker save
-        run: |
-          docker image save \
-            --output image-${{ matrix.variant }}-${{ matrix.tag }}.tar \
-            image:${{ matrix.variant }}-${{ matrix.tag }}
-      - name: Upload Docker image-${{ matrix.variant }}-${{ matrix.tag }}
-        uses: actions/upload-artifact@v4
+        run: docker image save --output image-default-amd64.tar image:default-amd64
+      - uses: actions/upload-artifact@v4
+        with:
+          name: image-default-amd64
+          path: image-default-amd64.tar
+          retention-days: 1
+
+  # CUDA variant (NVENC/NVDEC/CUVID), amd64 only.
+  # Runs *after* default-amd64 so it reuses the populated builder-amd64
+  # cache scope: every builder layer up to the final stage is a cache hit,
+  # and only the final-cuda stage has to materialize.
+  build-cuda-amd64:
+    name: Build image (cuda / amd64)
+    runs-on: ubuntu-latest
+    needs: build-default-amd64
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Docker build
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          target: final-cuda
+          build-args: ENABLE_CUDA=1
+          tags: image:cuda-amd64
+          load: true
+          cache-from: type=gha,scope=builder-amd64
+          cache-to: type=gha,scope=builder-amd64,mode=max
+      - name: Docker save
+        run: docker image save --output image-cuda-amd64.tar image:cuda-amd64
+      - uses: actions/upload-artifact@v4
         with:
-          name: image-${{ matrix.variant }}-${{ matrix.tag }}
-          path: image-${{ matrix.variant }}-${{ matrix.tag }}.tar
+          name: image-cuda-amd64
+          path: image-cuda-amd64.tar
           retention-days: 1
 
   tag:
@@ -79,7 +110,8 @@ jobs:
     name: Merge and push default images
     runs-on: ubuntu-latest
     needs:
-      - build
+      - build-default-arm64
+      - build-default-amd64
       - tag
     steps:
       - name: Download digests
@@ -119,7 +151,7 @@ jobs:
     name: Push CUDA image (amd64 only)
     runs-on: ubuntu-latest
     needs:
-      - build
+      - build-cuda-amd64
       - tag
     steps:
       - name: Download digests

From 7c4a81966be0985cb6fead9c8e96794c11bdb6e5 Mon Sep 17 00:00:00 2001
From: ToshY <31921460+ToshY@users.noreply.github.com>
Date: Sun, 3 May 2026 22:58:58 +0200
Subject: [PATCH 8/8] let wrapper point directly to ffmpeg instead

---
 Dockerfile | 29 ++++++++++-------
 README.md  | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 113 insertions(+), 12 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 746e9aa..4602cf6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1179,7 +1179,6 @@ RUN \
     --enable-version3 \
     $FDKAAC_FLAGS \
     $CUDA_FLAGS \
-    --enable-openssl \
     --enable-fontconfig \
     --enable-gray \
     --enable-iconv \
@@ -1236,6 +1235,7 @@ RUN \
     --enable-libxvid \
     --enable-libzimg \
     --enable-libzmq \
+    --enable-openssl \
   || (cat ffbuild/config.log ; false) && \
   make -j$(nproc) install
 
@@ -1383,7 +1383,11 @@ ENTRYPOINT ["/ffmpeg"]
 # --enable-libnpp / --enable-cuda-nvcc are NOT included (require glibc CUDA toolkit).
 # Use scale_cuda instead of scale_npp.
 FROM alpine:3.20.3 AS final-cuda1
-COPY --from=builder /usr/local/bin/ffmpeg /
+# Real ffmpeg ELF lives at /ffmpeg.bin; /ffmpeg is the bash wrapper (added below)
+# that execs it. This way `COPY --from=...:cuda /ffmpeg /ffmpeg.bin <dst>/` from
+# a downstream image gives a drop-in /ffmpeg that already includes the
+# teardown-SIGSEGV workaround — no custom ENTRYPOINT needed.
+COPY --from=builder /usr/local/bin/ffmpeg /ffmpeg.bin
 COPY --from=builder /usr/local/bin/ffprobe /
 COPY --from=builder /versions.json /
 COPY --from=builder /usr/local/share/doc/ffmpeg/* /doc/
@@ -1458,11 +1462,11 @@ RUN printf '/lib\n/usr/local/lib\n/usr/lib\n/usr/lib64\n/usr/lib/x86_64-linux-gn
     > /etc/ld-musl-x86_64.path
 
 
-# Entrypoint wrapper: convert the benign teardown SIGSEGV (139 -> 0) that
-# libcuda's __cxa_finalize triggers under musl + gcompat. The crash happens
-# inside main() after the encode is complete and all output is flushed, so
-# no in-process hook can suppress it. Heuristic: only downgrade 139 when
-# stderr contains no recognisable error keyword. Real failure exit codes
+# Entrypoint wrapper installed AS /ffmpeg itself: convert the benign teardown
+# SIGSEGV (139 -> 0) that libcuda's __cxa_finalize triggers under musl + gcompat.
+# The crash happens inside main() after the encode is complete and all output is
+# flushed, so no in-process hook can suppress it. Heuristic: only downgrade 139
+# when stderr contains no recognisable error keyword. Real failure exit codes
 # (1, 8, 254, ...) propagate unchanged. See docs/ffmpeg-with-cuda.md (P5).
 RUN apk add --no-cache bash && \
     printf '%s\n' \
@@ -1473,7 +1477,7 @@ RUN apk add --no-cache bash && \
     'exec 3>&1' \
     'exec 4>&2' \
     'exec 2>"$shellerr"' \
-    '{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&4' \
+    '{ /ffmpeg.bin "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&4' \
     'rc=${PIPESTATUS[0]}' \
     'exec 3>&-' \
     'exec 2>&4 4>&-' \
@@ -1482,12 +1486,13 @@ RUN apk add --no-cache bash && \
     '    exit 0' \
     'fi' \
     'exit "$rc"' \
-    > /usr/local/bin/ffmpeg-cuda-entrypoint && \
-    chmod +x /usr/local/bin/ffmpeg-cuda-entrypoint
+    > /ffmpeg && \
+    chmod +x /ffmpeg
 
 # sanity tests (cannot exercise actual GPU encode without a GPU at build time).
-# LD_PRELOAD set inline since the env is only declared in the final stage below.
+# /ffmpeg goes through the wrapper -> /ffmpeg.bin; both must work.
 RUN ["/ffmpeg", "-version"]
+RUN ["/ffmpeg.bin", "-version"]
 RUN ["/ffprobe", "-version"]
 RUN ["/ffmpeg", "-hide_banner", "-buildconf"]
 RUN /ffmpeg -hide_banner -hwaccels 2>&1 | grep -q cuda
@@ -1506,4 +1511,4 @@ LABEL maintainer="Mattias Wadman mattias.wadman@gmail.com"
 ENV NVIDIA_VISIBLE_DEVICES=all \
     NVIDIA_DRIVER_CAPABILITIES=compute,video,utility \
     LD_PRELOAD=/usr/local/lib/libnvshim.so
-ENTRYPOINT ["/usr/local/bin/ffmpeg-cuda-entrypoint"]
+ENTRYPOINT ["/ffmpeg"]
diff --git a/README.md b/README.md
index dbbfbad..c7ef754 100644
--- a/README.md
+++ b/README.md
@@ -177,6 +177,101 @@ docker run --gpus all --rm --entrypoint=/ffmpeg my-ffmpeg-static:cuda -hide_bann
 Supported encoders: `h264_nvenc`, `hevc_nvenc`, `av1_nvenc` (GPU dependent).
 Supported decoders / hwaccel: `cuda`, `cuvid` (`h264_cuvid`, `hevc_cuvid`, …).
 
+#### Use in another image with `COPY --from`
+
+Unlike the default static binary, the CUDA variant has runtime dependencies
+beyond the binary itself. To get a working NVENC/NVDEC build in your own
+image you need to copy **all** of the following from `:8.1-cuda`:
+
+```Dockerfile
+FROM alpine:3.20
+
+# 1. The binaries. /ffmpeg in the cuda image is a bash wrapper that execs
+#    /ffmpeg.bin (the real ELF) — it downgrades the benign teardown SIGSEGV
+#    (exit 139 → 0) while preserving real ffmpeg exit codes. Both files must
+#    be copied; the wrapper expects to find /ffmpeg.bin at the same root.
+COPY --from=mwader/static-ffmpeg:8.1-cuda /ffmpeg     /ffmpeg
+COPY --from=mwader/static-ffmpeg:8.1-cuda /ffmpeg.bin /ffmpeg.bin
+COPY --from=mwader/static-ffmpeg:8.1-cuda /ffprobe    /usr/local/bin/
+
+# 2. musl loader path file — adds /usr/lib64, /usr/lib/wsl/lib, etc. so musl
+#    can find the toolkit-injected NVIDIA driver libs.
+COPY --from=mwader/static-ffmpeg:8.1-cuda /etc/ld-musl-x86_64.path /etc/ld-musl-x86_64.path
+
+# 3. The glibc → musl ABI shim (LD_PRELOAD'd into ffmpeg).
+COPY --from=mwader/static-ffmpeg:8.1-cuda /usr/local/lib/libnvshim.so /usr/local/lib/
+
+# 4. gcompat + bash + the libdl.so.2 → libgcompat.so.0 symlink the NVIDIA
+#    driver libs need at DT_NEEDED resolution time. bash is required by the
+#    /ffmpeg wrapper script.
+RUN apk add --no-cache gcompat libstdc++ bash && \
+    ln -sf /usr/lib/libgcompat.so.0 /usr/lib/libdl.so.2
+
+# 5. Toolkit env (compute → libcuda.so.1, video → libnvcuvid/libnvidia-encode).
+ENV NVIDIA_VISIBLE_DEVICES=all \
+    NVIDIA_DRIVER_CAPABILITIES=compute,utility,video \
+    LD_PRELOAD=/usr/local/lib/libnvshim.so
+
+ENTRYPOINT ["/ffmpeg"]
+```
+
+Notes:
+
+- The base image **must be Alpine** (or otherwise musl-based with a compatible
+  musl major version). Glibc-based images — including `debian:*-slim`
+  (e.g. `bookworm-slim`), `ubuntu:*`, `python:*-slim`, `nvidia/cuda:*`,
+  `redhat/ubi*`, etc. — are **not** supported destinations: the binary's
+  `PT_INTERP` is `/lib/ld-musl-x86_64.so.1`, which doesn't exist on those
+  distros, and the `gcompat` shim in step 4 is Alpine-only. If you need a
+  Debian/Ubuntu runtime, run the published `mwader/static-ffmpeg:<tag>-cuda`
+  image directly (it's already Alpine-based) instead of `COPY --from`'ing
+  into a glibc base.
+- Skipping any of items 2–5 will produce a binary that builds and runs
+  `-version` fine but fails at the first NVENC/NVDEC call.
+- Run with `--gpus all` (and the NVIDIA Container Toolkit installed on the
+  host) for GPU access — same as running `mwader/static-ffmpeg:8.1-cuda`
+  directly.
+
+##### Multi-process images (Python / Node / app + ffmpeg)
+
+The example above sets `LD_PRELOAD=/usr/local/lib/libnvshim.so` as image-wide
+`ENV`. That's safe in an **ffmpeg-only** image (the published `:*-cuda` image
+runs only `/ffmpeg`, which was built and tested with the shim preloaded), but
+it is **not** safe in an image that also runs other musl binaries — `pip`,
+`python`, `node`, your app, etc. `libnvshim.so` exports glibc-only symbols and
+transitively pulls in `gcompat` (via `DT_NEEDED libdl.so.2`). Forcing that
+into every process tends to crash CPython and other musl interpreters with
+`SIGSEGV` (exit code 139) at startup.
+
+For multi-process images, scope the preload to ffmpeg only with a small
+wrapper instead of `ENV LD_PRELOAD`:
+
+```Dockerfile
+# Replace step 5's `LD_PRELOAD=...` ENV line with a wrapper that sets
+# LD_PRELOAD only for the ffmpeg process. Other processes (pip, python,
+# sh, ...) run with a clean environment. The wrapper at /usr/local/bin/ffmpeg
+# also exposes ffmpeg on PATH for your app to call as `ffmpeg`.
+RUN printf '%s\n' \
+    '#!/bin/sh' \
+    'exec env LD_PRELOAD=/usr/local/lib/libnvshim.so /ffmpeg "$@"' \
+    > /usr/local/bin/ffmpeg \
+    && chmod +x /usr/local/bin/ffmpeg
+
+ENV NVIDIA_VISIBLE_DEVICES=all \
+    NVIDIA_DRIVER_CAPABILITIES=compute,utility,video
+# (no ENV LD_PRELOAD here)
+```
+
+`/usr/local/bin/ffmpeg` (the wrapper) execs `/ffmpeg` (the static-ffmpeg bash
+wrapper that downgrades the benign teardown SIGSEGV) which execs
+`/ffmpeg.bin` (the real ELF). Exit codes propagate unchanged via `exec`. Your
+app continues to call `ffmpeg` from `PATH` as normal.
+
+If you also invoke `ffprobe` against CUDA-accelerated decoders and see it
+crash, wrap it the same way (rename the copied binary to `ffprobe.bin` first
+and put the wrapper at `/usr/local/bin/ffprobe`). For most ffprobe use cases
+this isn't needed.
+
 #### Limitations
 
 - `--enable-cuda-nvcc` and `--enable-libnpp` are **not** included — they require
@@ -188,6 +283,7 @@ Supported decoders / hwaccel: `cuda`, `cuvid` (`h264_cuvid`, `hevc_cuvid`, …).
   musl libc (i.e. an Alpine-based image of the matching `musl` major version).
 - Without `--gpus all` (or without the NVIDIA Container Toolkit) the binary
   still runs but `nvenc`/`nvdec`/`cuda` initialization will fail at runtime.
+- amd64 only.
 
 ### Fonts usage with SVG or draw text filters etc