diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4ff3ec5..2fc32d5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -8,10 +8,30 @@ on:
 jobs:
   ci:
     strategy:
+      fail-fast: false
       matrix:
         include:
+          # default static build, both arches (built from the `final` stage)
           - runs_on: ubicloud-standard-30
+            variant: default
+            target: final  # explicit: the Dockerfile's last stage is now final-cuda
+            build_args: |
+              ENABLE_FDKAAC=1
           - runs_on: ubicloud-standard-30-arm
+            variant: default
+            target: final  # explicit: the Dockerfile's last stage is now final-cuda
+            build_args: |
+              ENABLE_FDKAAC=1
+          # CUDA variant (NVENC/NVDEC/CUVID), amd64 only for now.
+          # No GPU on the runner — the build only verifies that the binary
+          # links and that nvenc/cuvid/cuda show up in -encoders/-hwaccels.
+          - runs_on: ubicloud-standard-30
+            variant: cuda
+            target: final-cuda
+            build_args: |
+              ENABLE_FDKAAC=1
+              ENABLE_CUDA=1
+    name: ci (${{ matrix.variant }} / ${{ matrix.runs_on }})
     runs-on: ${{ matrix.runs_on }}
     steps:
       - uses: actions/checkout@v3
@@ -21,7 +41,7 @@ jobs:
         with:
           context: .
           push: false
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          build-args: |
-            ENABLE_FDKAAC=1
+          cache-from: type=gha,scope=${{ matrix.variant }}-${{ matrix.runs_on }}
+          cache-to: type=gha,mode=max,scope=${{ matrix.variant }}-${{ matrix.runs_on }}
+          target: ${{ matrix.target }}
+          build-args: ${{ matrix.build_args }}
diff --git a/.github/workflows/multiarch.yml b/.github/workflows/multiarch.yml
index 2037ccc..26f15a7 100644
--- a/.github/workflows/multiarch.yml
+++ b/.github/workflows/multiarch.yml
@@ -12,28 +12,86 @@ env:
   REGISTRY_IMAGE: mwader/static-ffmpeg
 
 jobs:
-  build:
-    name: Build image
-    strategy:
-      matrix:
-        include:
-          - runs_on: ubicloud-standard-8-arm
-            tag: arm64
-          - runs_on: ubuntu-latest
-            tag: amd64
+  # arm64 default — independent, runs in parallel with amd64.
+  build-default-arm64:
+    name: Build image (default / arm64)
+    runs-on: ubicloud-standard-8-arm
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Docker build
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          target: final  # explicit: the Dockerfile's last stage is final-cuda
+          tags: image:default-arm64
+          load: true
+          cache-from: type=gha,scope=builder-arm64
+          cache-to: type=gha,scope=builder-arm64,mode=max
+      - name: Docker save
+        run: docker image save --output image-default-arm64.tar image:default-arm64
+      - uses: actions/upload-artifact@v4
+        with:
+          name: image-default-arm64
+          path: image-default-arm64.tar
+          retention-days: 1
+
+  # amd64 default — populates the builder-amd64 cache scope.
+  build-default-amd64:
+    name: Build image (default / amd64)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Docker build
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          target: final  # explicit: the Dockerfile's last stage is final-cuda
+          tags: image:default-amd64
+          load: true
+          cache-from: type=gha,scope=builder-amd64
+          cache-to: type=gha,scope=builder-amd64,mode=max
+      - name: Docker save
+        run: docker image save --output image-default-amd64.tar image:default-amd64
+      - uses: actions/upload-artifact@v4
+        with:
+          name: image-default-amd64
+          path: image-default-amd64.tar
+          retention-days: 1
 
-    runs-on: ${{ matrix.runs_on }}
+  # CUDA variant (NVENC/NVDEC/CUVID), amd64 only.
+  # Runs *after* default-amd64 so it can import the populated builder-amd64
+  # scope: builder layers before the ENABLE_CUDA-dependent steps are cache hits.
+  # It exports to its own scope so it never overwrites the default build's cache.
+  build-cuda-amd64:
+    name: Build image (cuda / amd64)
+    runs-on: ubuntu-latest
+    needs: build-default-amd64
     steps:
       - uses: actions/checkout@v4
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
       - name: Docker build
-        run: docker build --tag image:${{ matrix.tag }} .
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          target: final-cuda
+          build-args: ENABLE_CUDA=1
+          tags: image:cuda-amd64
+          load: true
+          cache-from: |
+            type=gha,scope=builder-amd64
+            type=gha,scope=builder-cuda-amd64
+          cache-to: type=gha,scope=builder-cuda-amd64,mode=max
       - name: Docker save
-        run: docker image save --output image-${{ matrix.tag }}.tar image:${{ matrix.tag }}
-      - name: Upload Docker image-${{ matrix.tag }}
-        uses: actions/upload-artifact@v4
+        run: docker image save --output image-cuda-amd64.tar image:cuda-amd64
+      - uses: actions/upload-artifact@v4
         with:
-          name: image-${{ matrix.tag }}
-          path: image-${{ matrix.tag }}.tar
+          name: image-cuda-amd64
+          path: image-cuda-amd64.tar
           retention-days: 1
 
   tag:
@@ -53,22 +107,23 @@ jobs:
           ' >> "$GITHUB_OUTPUT"
 
   merge:
-    name: Merge and push images
+    name: Merge and push default images
     runs-on: ubuntu-latest
     needs:
-      - build
+      - build-default-arm64
+      - build-default-amd64
       - tag
     steps:
       - name: Download digests
         uses: actions/download-artifact@v4
         with:
           path: /tmp
-          pattern: image-*
+          pattern: image-default-*
           merge-multiple: true
       - name: Load Docker images
         run: |
-          docker image load --input /tmp/image-arm64.tar
-          docker image load --input /tmp/image-amd64.tar
+          docker image load --input /tmp/image-default-arm64.tar
+          docker image load --input /tmp/image-default-amd64.tar
       - name: Docker meta
         id: meta
         uses: docker/metadata-action@v5
@@ -81,8 +136,8 @@ jobs:
           password: ${{ secrets.DOCKERHUB_TOKEN }}
       - name: Create manifest list and push
         run: |
-          docker tag image:arm64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-arm64
-          docker tag image:amd64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-amd64
+          docker tag image:default-arm64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-arm64
+          docker tag image:default-amd64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-amd64
           docker push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-arm64
           docker push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-amd64
           docker manifest create \
@@ -91,3 +146,32 @@ jobs:
             --amend ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-amd64
           docker manifest inspect ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}
           docker manifest push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}
+
+  merge-cuda:
+    name: Push CUDA image (amd64 only)
+    runs-on: ubuntu-latest
+    needs:
+      - build-cuda-amd64
+      - tag
+    steps:
+      - name: Download digests
+        uses: actions/download-artifact@v4
+        with:
+          path: /tmp
+          pattern: image-cuda-*
+          merge-multiple: true
+      - name: Load Docker image
+        run: docker image load --input /tmp/image-cuda-amd64.tar
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+      - name: Tag and push CUDA image
+        run: |
+          # CUDA variant is amd64-only for now; published as a single-arch tag.
+          docker tag image:cuda-amd64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-cuda-amd64
+          docker tag image:cuda-amd64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-cuda
+          docker push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-cuda-amd64
+          docker push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-cuda
+
diff --git a/Dockerfile b/Dockerfile
index 1551539..4602cf6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1037,6 +1037,25 @@ RUN \
     --enable-static && \
   make -j$(nproc) install
 
+# NVIDIA codec headers (header-only; no CUDA toolkit needed). ffmpeg dlopen()s the
+# real driver libs (libcuda / libnvcuvid / libnvidia-encode) at runtime, injected
+# by the NVIDIA Container Toolkit. Only built when ENABLE_CUDA is set.
+# See docs/ffmpeg-with-cuda.md.
+# bump: ffnvcodec /FFNVCODEC_VERSION=([\d.]+)/ https://github.com/FFmpeg/nv-codec-headers.git|^13
+# bump: ffnvcodec after ./hashupdate Dockerfile FFNVCODEC $LATEST
+# bump: ffnvcodec link "Releases" https://github.com/FFmpeg/nv-codec-headers/releases
+ARG FFNVCODEC_VERSION=13.0.19.0
+ARG FFNVCODEC_URL="https://github.com/FFmpeg/nv-codec-headers/archive/refs/tags/n${FFNVCODEC_VERSION}.tar.gz"
+ARG FFNVCODEC_SHA256=86d15d1a7c0ac73a0eafdfc57bebfeba7da8264595bf531cf4d8db1c22940116
+ARG ENABLE_CUDA=
+RUN \
+  if [ -n "$ENABLE_CUDA" ]; then \
+    wget $WGET_OPTS -O ffnvcodec.tar.gz "$FFNVCODEC_URL" && \
+    echo "$FFNVCODEC_SHA256  ffnvcodec.tar.gz" | sha256sum -c - && \
+    tar $TAR_OPTS ffnvcodec.tar.gz && cd nv-codec-headers-* && \
+    make PREFIX=/usr/local install ; \
+  fi
+
 # requires libdrm
 # bump: libva /LIBVA_VERSION=([\d.]+)/ https://github.com/intel/libva.git|^2
 # bump: libva after ./hashupdate Dockerfile LIBVA $LATEST
@@ -1112,91 +1131,113 @@ ARG FFMPEG_VERSION=8.1
 ARG FFMPEG_URL="https://ffmpeg.org/releases/ffmpeg-$FFMPEG_VERSION.tar.bz2"
 ARG FFMPEG_SHA256=c07039598df7d64d3c8b42c4e25b1959fc908621c6f6c2946881133f3b27eda2
 ARG ENABLE_FDKAAC=
-# sed changes --toolchain=hardened -pie to -static-pie
+# sed changes --toolchain=hardened -pie to -static-pie (default build only).
 #
-# ldflags stack-size=2097152 is to increase default stack size from 128KB (musl default) to something
-# more similar to glibc (2MB). This fixing segfault with libaom-av1 and libsvtav1 as they seems to pass
-# large things on the stack.
+# CUDA variant: keep -pie (musl dynamic-PIE) so ffnvcodec can dlopen() the
+# NVIDIA driver libs. All other deps stay statically archived; only the musl
+# loader/libc is dynamic. See docs/ffmpeg-with-cuda.md.
 #
-# ldfalgs -Wl,--allow-multiple-definition is a workaround for linking with multiple rust staticlib to
-# not cause collision in toolchain symbols, see comment in checkdupsym script for details.
+# ldflags stack-size=2097152 raises musl's 128KB default to ~glibc 2MB
+# (libaom/libsvtav1 pass large objects on the stack).
+# ldflags --allow-multiple-definition works around rust staticlib toolchain
+# symbol collisions (see checkdupsym).
 RUN \
   wget $WGET_OPTS -O ffmpeg.tar.bz2 "$FFMPEG_URL" && \
   echo "$FFMPEG_SHA256  ffmpeg.tar.bz2" | sha256sum -c - && \
   tar $TAR_OPTS ffmpeg.tar.bz2 && cd ffmpeg* && \
   FDKAAC_FLAGS=$(if [[ -n "$ENABLE_FDKAAC" ]] ;then echo " --enable-libfdk-aac --enable-nonfree " ;else echo ""; fi) && \
-  sed -i 's/add_ldexeflags -fPIE -pie/add_ldexeflags -fPIE -static-pie/' configure && \
+  CUDA_FLAGS=$(if [[ -n "$ENABLE_CUDA" ]] ;then echo " --enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec " ;else echo ""; fi) && \
+  if [[ -z "$ENABLE_CUDA" ]]; then \
+    # Default: fully static-pie musl binary, no loader, no dlopen.
+    sed -i 's/add_ldexeflags -fPIE -pie/add_ldexeflags -fPIE -static-pie/' configure ; \
+    EXTRA_LDFLAGS="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152" ; \
+    EXTRA_LIBS="" ; \
+  else \
+    # CUDA: musl dynamic-PIE. Link the dynamic libc by ABSOLUTE PATH (not -lc)
+    # to avoid musl's libc.a 25-byte dlopen() stub that always returns NULL —
+    # gcc's hardened toolchain can otherwise resolve dlopen/dlsym/dlerror from
+    # the static archive even when -Bdynamic is requested, breaking nvenc with
+    # a silent "Cannot load libcuda.so.1" (no openat syscall fires).
+    # See docs/ffmpeg-with-cuda.md (P1).
+    EXTRA_LDFLAGS="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152 \
+        -Wl,--no-as-needed,/lib/ld-musl-x86_64.so.1,--as-needed \
+        -Wl,--as-needed -Wl,-Bstatic \
+        -static-libstdc++ -static-libgcc" ; \
+    EXTRA_LIBS="-lgomp -Wl,-Bdynamic -lc" ; \
+  fi && \
   ./configure \
-  --pkg-config-flags="--static" \
-  --extra-cflags="-fopenmp" \
-  --extra-ldflags="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152" \
-  --toolchain=hardened \
-  --disable-debug \
-  --disable-shared \
-  --disable-ffplay \
-  --enable-static \
-  --enable-gpl \
-  --enable-version3 \
-  $FDKAAC_FLAGS \
-  --enable-fontconfig \
-  --enable-gray \
-  --enable-iconv \
-  --enable-lcms2 \
-  --enable-libaom \
-  --enable-libaribb24 \
-  --enable-libass \
-  --enable-libbluray \
-  --enable-libdav1d \
-  --enable-libdavs2 \
-  --enable-libfreetype \
-  --enable-libfribidi \
-  --enable-libgme \
-  --enable-libgsm \
-  --enable-libharfbuzz \
-  --enable-libjxl \
-  --enable-libkvazaar \
-  --enable-libmodplug \
-  --enable-libmp3lame \
-  --enable-libmysofa \
-  --enable-libopencore-amrnb \
-  --enable-libopencore-amrwb \
-  --enable-libopenjpeg \
-  --enable-libopus \
-  --enable-librabbitmq \
-  --enable-librav1e \
-  --enable-librsvg \
-  --enable-librtmp \
-  --enable-librubberband \
-  --enable-libshine \
-  --enable-libsnappy \
-  --enable-libsoxr \
-  --enable-libspeex \
-  --enable-libsrt \
-  --enable-libssh \
-  --enable-libsvtav1 \
-  --enable-libtheora \
-  --enable-libtwolame \
-  --enable-libuavs3d \
-  --enable-libvidstab \
-  --enable-libvmaf \
-  --enable-libvo-amrwbenc \
-  --enable-libvorbis \
-  --enable-libvpl \
-  --enable-libvpx \
-  --enable-libvvenc \
-  --enable-libwebp \
-  --enable-libx264 \
-  --enable-libx265 \
-  --enable-libxavs2 \
-  --enable-libxevd \
-  --enable-libxeve \
-  --enable-libxml2 \
-  --enable-libxvid \
-  --enable-libzimg \
-  --enable-libzmq \
-  --enable-openssl \
-  || (cat ffbuild/config.log ; false) \
-  && make -j$(nproc) install
+    --pkg-config-flags="--static" \
+    --extra-cflags="-fopenmp" \
+    --extra-ldflags="$EXTRA_LDFLAGS" \
+    --extra-libs="$EXTRA_LIBS" \
+    --toolchain=hardened \
+    --disable-debug \
+    --disable-shared \
+    --disable-ffplay \
+    --enable-static \
+    --enable-gpl \
+    --enable-version3 \
+    $FDKAAC_FLAGS \
+    $CUDA_FLAGS \
+    --enable-fontconfig \
+    --enable-gray \
+    --enable-iconv \
+    --enable-lcms2 \
+    --enable-libaom \
+    --enable-libaribb24 \
+    --enable-libass \
+    --enable-libbluray \
+    --enable-libdav1d \
+    --enable-libdavs2 \
+    --enable-libfreetype \
+    --enable-libfribidi \
+    --enable-libgme \
+    --enable-libgsm \
+    --enable-libharfbuzz \
+    --enable-libjxl \
+    --enable-libkvazaar \
+    --enable-libmodplug \
+    --enable-libmp3lame \
+    --enable-libmysofa \
+    --enable-libopencore-amrnb \
+    --enable-libopencore-amrwb \
+    --enable-libopenjpeg \
+    --enable-libopus \
+    --enable-librabbitmq \
+    --enable-librav1e \
+    --enable-librsvg \
+    --enable-librtmp \
+    --enable-librubberband \
+    --enable-libshine \
+    --enable-libsnappy \
+    --enable-libsoxr \
+    --enable-libspeex \
+    --enable-libsrt \
+    --enable-libssh \
+    --enable-libsvtav1 \
+    --enable-libtheora \
+    --enable-libtwolame \
+    --enable-libuavs3d \
+    --enable-libvidstab \
+    --enable-libvmaf \
+    --enable-libvo-amrwbenc \
+    --enable-libvorbis \
+    --enable-libvpl \
+    --enable-libvpx \
+    --enable-libvvenc \
+    --enable-libwebp \
+    --enable-libx264 \
+    --enable-libx265 \
+    --enable-libxavs2 \
+    --enable-libxevd \
+    --enable-libxeve \
+    --enable-libxml2 \
+    --enable-libxvid \
+    --enable-libzimg \
+    --enable-libzmq \
+    --enable-openssl \
+  || (cat ffbuild/config.log ; false) && \
+  make -j$(nproc) install
 
 RUN \
   EXPAT_VERSION=$(pkg-config --modversion expat) \
@@ -1273,13 +1314,17 @@ RUN \
   libzimg: env.ZIMG_VERSION, \
   libzmq: env.LIBZMQ_VERSION, \
   openssl: env.OPENSSL_VERSION, \
+  ffnvcodec: env.FFNVCODEC_VERSION, \
   }' > /versions.json
 
-# make sure binaries has no dependencies, is relro, pie and stack nx
+# make sure binaries has no dependencies, is relro, pie and stack nx.
+# CUDA build is musl dynamic-PIE; --cuda allows the musl loader/libc as the
+# only NEEDED entry.
 COPY checkelf /
 RUN \
-  /checkelf /usr/local/bin/ffmpeg && \
-  /checkelf /usr/local/bin/ffprobe
+  CHECKELF_FLAGS=$(if [ -n "$ENABLE_CUDA" ]; then echo "--cuda"; fi) && \
+  /checkelf $CHECKELF_FLAGS /usr/local/bin/ffmpeg && \
+  /checkelf $CHECKELF_FLAGS /usr/local/bin/ffprobe
 # workaround for using -Wl,--allow-multiple-definition
 # see comment in checkdupsym for details
 COPY checkdupsym /
@@ -1320,6 +1365,150 @@ RUN ["/ffmpeg", "-f", "lavfi", "-i", "testsrc", "-c:v", "libx265", "-t", "100ms"
 FROM scratch AS final2
 COPY --from=final1 / /
 
-FROM final2
+FROM final2 AS final
+LABEL maintainer="Mattias Wadman mattias.wadman@gmail.com"
+ENTRYPOINT ["/ffmpeg"]
+
+# CUDA / NVENC / NVDEC variant. See docs/ffmpeg-with-cuda.md for full design.
+#
+# Build:  docker build --build-arg ENABLE_CUDA=1 --target final-cuda -t mwader/static-ffmpeg:<ver>-cuda .
+# Run:    docker run --gpus all --rm mwader/static-ffmpeg:<ver>-cuda \
+#             -hwaccel cuda -hwaccel_output_format cuda -i in.mp4 -c:v h264_nvenc out.mp4
+#
+# Requires NVIDIA driver on host + nvidia-container-toolkit. The binary is a musl
+# dynamic-PIE so the loader is present and the NVIDIA driver libs (libcuda.so.1,
+# libnvcuvid.so, libnvidia-encode.so) injected by the toolkit can be dlopen()'d.
+# No CUDA toolkit needed at build or run time.
+#
+# --enable-libnpp / --enable-cuda-nvcc are NOT included (require glibc CUDA toolkit).
+# Use scale_cuda instead of scale_npp.
+FROM alpine:3.20.3 AS final-cuda1
+# Real ffmpeg ELF lives at /ffmpeg.bin; /ffmpeg is the bash wrapper (added below)
+# that execs it. This way `COPY --from=...:cuda /ffmpeg /ffmpeg.bin <dst>/` from
+# a downstream image gives a drop-in /ffmpeg that already includes the
+# teardown-SIGSEGV workaround — no custom ENTRYPOINT needed.
+COPY --from=builder /usr/local/bin/ffmpeg /ffmpeg.bin
+COPY --from=builder /usr/local/bin/ffprobe /
+COPY --from=builder /versions.json /
+COPY --from=builder /usr/local/share/doc/ffmpeg/* /doc/
+COPY --from=builder /etc/ssl/cert.pem /etc/ssl/cert.pem
+COPY --from=builder /etc/fonts/ /etc/fonts/
+COPY --from=builder /usr/share/fonts/ /usr/share/fonts/
+COPY --from=builder /usr/share/consolefonts/ /usr/share/consolefonts/
+COPY --from=builder /var/cache/fontconfig/ /var/cache/fontconfig/
+
+# gcompat: glibc->musl shim. NVIDIA driver libs are built against glibc and have
+# DT_NEEDED entries for libc.so.6 / libpthread.so.0 / libdl.so.2 / libm.so.6 /
+# librt.so.1 — gcompat provides those SONAMEs as musl wrappers. libstdc++ is
+# pulled in for NVIDIA helper libs (e.g. libnvidia-ngx). gcompat omits libdl.so.2
+# (musl folds dlopen into libc) so symlink it manually.
+RUN apk add --no-cache gcompat libstdc++ && \
+    ln -sf libgcompat.so.0 /lib/libdl.so.2
+
+# nvshim: tiny LD_PRELOAD library exporting glibc-internal symbols that gcompat
+# does NOT provide but the real NVIDIA driver backend calls during cuInit().
+# Without these, the stub libcuda dlopens fine but its backend fails with
+# "Error relocating: <sym>: symbol not found", which ffmpeg surfaces as the
+# misleading "Cannot load libcuda.so.1".
+#
+# IMPORTANT: this shim must NOT interpose exit / _exit / _Exit. Doing so
+# silently swallows ffmpeg's real exit codes (every error returns 0).
+# Process-lifecycle policy belongs in the bash entrypoint wrapper below.
+# See docs/ffmpeg-with-cuda.md (P6).
+RUN apk add --no-cache --virtual .nvshim-build gcc musl-dev && \
+    mkdir -p /usr/local/lib && \
+    printf '%s\n' \
+      '#define _GNU_SOURCE' \
+      '#include <signal.h>' \
+      '#include <pthread.h>' \
+      '#include <stdlib.h>' \
+      '#include <string.h>' \
+      '#include <dlfcn.h>' \
+      '#include <unistd.h>' \
+      'const char *gnu_get_libc_version(void) { return "2.35"; }' \
+      'const char *gnu_get_libc_release(void) { return "stable"; }' \
+      '/* __libc_current_sigrtmin/__libc_current_sigrtmax are intentionally absent: musl */' \
+      '/* exports both and defines SIGRTMIN/SIGRTMAX as calls to them (would self-recurse). */' \
+      'int __register_atfork(void (*p)(void), void (*pa)(void), void (*c)(void), void *dso) {' \
+      '    (void)dso; return pthread_atfork(p, pa, c);' \
+      '}' \
+      'char __libc_single_threaded = 0; /* glibc declares this as char; 0 = multi-threaded */' \
+      'int __cxa_thread_atexit_impl(void (*f)(void*), void *o, void *dso) {' \
+      '    (void)f; (void)o; (void)dso; return 0; /* no-op: the destructor is never registered */' \
+      '}' \
+      'char *secure_getenv(const char *name) { return getenv(name); }' \
+      '/* dlmopen: glibc-only namespaced dlopen; musl has no link namespaces. */' \
+      'typedef long Lmid_t;' \
+      'void *dlmopen(Lmid_t lmid, const char *file, int mode) {' \
+      '    (void)lmid; return dlopen(file, mode);' \
+      '}' \
+      '/* glibc-internal dl* variants used by nss / driver init. */' \
+      'void *__libc_dlopen_mode(const char *name, int mode) { return dlopen(name, mode); }' \
+      'void *__libc_dlsym(void *handle, const char *name) { return dlsym(handle, name); }' \
+      'int   __libc_dlclose(void *handle) { return dlclose(handle); }' \
+      '/* dlvsym: glibc versioned dlsym; musl has no symbol versioning. */' \
+      'void *dlvsym(void *handle, const char *name, const char *version) {' \
+      '    (void)version; return dlsym(handle, name);' \
+      '}' \
+      > /tmp/nvshim.c && \
+    gcc -shared -fPIC -nostartfiles -o /usr/local/lib/libnvshim.so /tmp/nvshim.c -lpthread -ldl && \
+    rm /tmp/nvshim.c && \
+    apk del .nvshim-build
+
+# musl loader fallback search path. The NVIDIA Container Toolkit injects driver
+# libs into one of these depending on host distro; musl's defaults
+# (/lib:/usr/local/lib:/usr/lib) miss all three.
+RUN printf '/lib\n/usr/local/lib\n/usr/lib\n/usr/lib64\n/usr/lib/x86_64-linux-gnu\n/usr/lib/wsl/lib\n' \
+    > /etc/ld-musl-x86_64.path
+
+
+# Entrypoint wrapper installed AS /ffmpeg itself: convert the benign teardown
+# SIGSEGV (139 -> 0) that libcuda's __cxa_finalize triggers under musl + gcompat.
+# The crash happens inside main() after the encode is complete and all output is
+# flushed, so no in-process hook can suppress it. Heuristic: only downgrade 139
+# when stderr contains no recognisable error keyword. Real failure exit codes
+# (1, 8, 254, ...) propagate unchanged. See docs/ffmpeg-with-cuda.md (P5).
+RUN apk add --no-cache bash && \
+    printf '%s\n' \
+    '#!/bin/bash' \
+    'errfile=$(mktemp)' \
+    'shellerr=$(mktemp)' \
+    'trap "rm -f \"$errfile\" \"$shellerr\"" EXIT' \
+    'exec 3>&1' \
+    'exec 4>&2' \
+    'exec 2>"$shellerr"' \
+    '{ /ffmpeg.bin "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&4' \
+    'rc=${PIPESTATUS[0]}' \
+    'exec 3>&-' \
+    'exec 2>&4 4>&-' \
+    'grep -vE "Segmentation fault.*core dumped.*/ffmpeg" "$shellerr" >&2 || true' \
+    'if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then' \
+    '    exit 0' \
+    'fi' \
+    'exit "$rc"' \
+    > /ffmpeg && \
+    chmod +x /ffmpeg
+
+# sanity tests (cannot exercise actual GPU encode without a GPU at build time).
+# /ffmpeg goes through the wrapper -> /ffmpeg.bin; both must work.
+RUN ["/ffmpeg", "-version"]
+RUN ["/ffmpeg.bin", "-version"]
+RUN ["/ffprobe", "-version"]
+RUN ["/ffmpeg", "-hide_banner", "-buildconf"]
+RUN /ffmpeg -hide_banner -hwaccels 2>&1 | grep -q cuda
+RUN /ffmpeg -hide_banner -encoders 2>&1 | grep -q nvenc
+RUN /ffmpeg -hide_banner -decoders 2>&1 | grep -q cuvid
+
+# clamp all files into one layer
+FROM scratch AS final-cuda2
+COPY --from=final-cuda1 / /
+
+FROM final-cuda2 AS final-cuda
 LABEL maintainer="Mattias Wadman mattias.wadman@gmail.com"
+# Default toolkit env so callers only need `--gpus all`.
+#   compute -> libcuda.so.1 ; video -> libnvcuvid + libnvidia-encode (NVENC/NVDEC) ;
+#   utility -> libnvidia-ml + nvidia-smi.
+ENV NVIDIA_VISIBLE_DEVICES=all \
+    NVIDIA_DRIVER_CAPABILITIES=compute,video,utility \
+    LD_PRELOAD=/usr/local/lib/libnvshim.so
 ENTRYPOINT ["/ffmpeg"]
diff --git a/README.md b/README.md
index 59715de..c7ef754 100644
--- a/README.md
+++ b/README.md
@@ -92,6 +92,7 @@ alias ffprobe='docker run -i --rm -u $UID:$GROUPS -v "$PWD:$PWD" -w "$PWD" --ent
 - [libzimg](https://github.com/sekrit-twc/zimg)
 - [libzmq](https://github.com/zeromq/libzmq)
 - [openssl](https://openssl.org)
+- NVIDIA NVENC / NVDEC / CUVID via [nv-codec-headers](https://github.com/FFmpeg/nv-codec-headers) (only in the CUDA variant, [see below](#cuda--nvenc--nvdec-nvidia-gpu-acceleration))
 - and all native ffmpeg codecs, formats, filters etc.
 
 ### Files in the image
@@ -114,6 +115,10 @@ alias ffprobe='docker run -i --rm -u $UID:$GROUPS -v "$PWD:$PWD" -w "$PWD" --ent
 `MAJOR.MINOR.PATCH[-BUILD]` Specific version of FFmpeg with the features that was in master at the time of tagging.
 `-BUILD` means that was an additional build with that version to add of fix something.
 
+`<tag>-cuda` (and `latest-cuda`) — same FFmpeg version compiled with NVIDIA
+NVENC / NVDEC / CUVID support, see [CUDA / NVENC / NVDEC](#cuda--nvenc--nvdec-nvidia-gpu-acceleration)
+below. Currently amd64 only (published as `<tag>-cuda` → `<tag>-cuda-amd64`).
+
 ### Security
 
 Binaries are built with various hardening features but it's *still a good idea to run them
@@ -126,6 +131,160 @@ Due to license issues the docker image does not include libfdk-aac by default. A
-docker build --build-arg ENABLE_FDKAAC=1 . -t my-ffmpeg-static:latest
+docker build --build-arg ENABLE_FDKAAC=1 --target final . -t my-ffmpeg-static:latest
 ```
 
+### CUDA / NVENC / NVDEC (NVIDIA GPU acceleration)
+
+The default image is fully static and does **not** support NVIDIA GPU acceleration
+(a fully static-pie musl binary has no dynamic loader, so it cannot `dlopen()` the
+NVIDIA driver libraries at runtime).
+
+A separate **CUDA variant** can be built that includes `ffnvcodec`, `nvenc`,
+`nvdec` and `cuvid` support. In this variant the binary is a *musl dynamic-PIE*
+(all FFmpeg dependencies remain statically archived; only the musl loader / libc
+stays dynamic) so that FFmpeg can `dlopen()` the NVIDIA driver libs
+(`libcuda.so.1`, `libnvcuvid.so`, `libnvidia-encode.so`) which the
+[NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-container-toolkit)
+injects into the container at runtime via `--gpus all`.
+
+No CUDA toolkit is needed to build or to run — only header-only
+[`nv-codec-headers`](https://github.com/FFmpeg/nv-codec-headers) at build time
+and the host's NVIDIA driver at run time.
+
+#### Build
+
+```sh
+docker build --build-arg ENABLE_CUDA=1 --target final-cuda \
+    -t my-ffmpeg-static:cuda .
+```
+
+#### Run
+
+Requires the NVIDIA driver on the host and `nvidia-container-toolkit` installed
+and configured in Docker.
+
+```sh
+docker run --gpus all -i --rm -v "$PWD:$PWD" -w "$PWD" my-ffmpeg-static:cuda \
+    -hwaccel cuda -hwaccel_output_format cuda -i input.mp4 \
+    -c:a copy -c:v h264_nvenc -b:v 5M output.mp4
+```
+
+Verify GPU support inside the container:
+
+```sh
+docker run --gpus all --rm --entrypoint=/ffmpeg my-ffmpeg-static:cuda -hide_banner -hwaccels
+docker run --gpus all --rm --entrypoint=/ffmpeg my-ffmpeg-static:cuda -hide_banner -encoders | grep nvenc
+```
+
+Supported encoders: `h264_nvenc`, `hevc_nvenc`, `av1_nvenc` (GPU dependent).
+Supported decoders / hwaccel: `cuda`, `cuvid` (`h264_cuvid`, `hevc_cuvid`, …).
+
+#### Use in another image with `COPY --from`
+
+Unlike the default static binary, the CUDA variant has runtime dependencies
+beyond the binary itself. To get a working NVENC/NVDEC build in your own
+image you need to copy **all** of the following from `:8.1-cuda`:
+
+```Dockerfile
+FROM alpine:3.20
+
+# 1. The binaries. /ffmpeg in the cuda image is a bash wrapper that execs
+#    /ffmpeg.bin (the real ELF) — it downgrades the benign teardown SIGSEGV
+#    (exit 139 → 0) while preserving real ffmpeg exit codes. Both files must
+#    be copied; the wrapper expects to find /ffmpeg.bin at the same root.
+COPY --from=mwader/static-ffmpeg:8.1-cuda /ffmpeg     /ffmpeg
+COPY --from=mwader/static-ffmpeg:8.1-cuda /ffmpeg.bin /ffmpeg.bin
+COPY --from=mwader/static-ffmpeg:8.1-cuda /ffprobe    /usr/local/bin/
+
+# 2. musl loader path file — adds /usr/lib64, /usr/lib/wsl/lib, etc. so musl
+#    can find the toolkit-injected NVIDIA driver libs.
+COPY --from=mwader/static-ffmpeg:8.1-cuda /etc/ld-musl-x86_64.path /etc/ld-musl-x86_64.path
+
+# 3. The glibc → musl ABI shim (LD_PRELOAD'd into ffmpeg).
+COPY --from=mwader/static-ffmpeg:8.1-cuda /usr/local/lib/libnvshim.so /usr/local/lib/
+
+# 4. gcompat + bash + the libdl.so.2 → libgcompat.so.0 symlink the NVIDIA
+#    driver libs need at DT_NEEDED resolution time. bash is required by the
+#    /ffmpeg wrapper script.
+RUN apk add --no-cache gcompat libstdc++ bash && \
+    ln -sf libgcompat.so.0 /lib/libdl.so.2
+
+# 5. Toolkit env (compute → libcuda.so.1, video → libnvcuvid/libnvidia-encode).
+ENV NVIDIA_VISIBLE_DEVICES=all \
+    NVIDIA_DRIVER_CAPABILITIES=compute,utility,video \
+    LD_PRELOAD=/usr/local/lib/libnvshim.so
+
+ENTRYPOINT ["/ffmpeg"]
+```
+
+Notes:
+
+- The base image **must be Alpine** (or otherwise musl-based with a compatible
+  musl major version). Glibc-based images — including `debian:*-slim`
+  (e.g. `bookworm-slim`), `ubuntu:*`, `python:*-slim`, `nvidia/cuda:*`,
+  `redhat/ubi*`, etc. — are **not** supported destinations: the binary's
+  `PT_INTERP` is `/lib/ld-musl-x86_64.so.1`, which doesn't exist on those
+  distros, and the `gcompat` shim in step 4 is Alpine-only. If you need a
+  Debian/Ubuntu runtime, run the published `mwader/static-ffmpeg:<tag>-cuda`
+  image directly (it's already Alpine-based) instead of `COPY --from`'ing
+  into a glibc base.
+- Skipping any of items 2–5 will produce a binary that builds and runs
+  `-version` fine but fails at the first NVENC/NVDEC call.
+- Run with `--gpus all` (and the NVIDIA Container Toolkit installed on the
+  host) for GPU access — same as running `mwader/static-ffmpeg:8.1-cuda`
+  directly.
+
+##### Multi-process images (Python / Node / app + ffmpeg)
+
+The example above sets `LD_PRELOAD=/usr/local/lib/libnvshim.so` as image-wide
+`ENV`. That's safe in an **ffmpeg-only** image (the published `:*-cuda` image
+runs only `/ffmpeg`, which was built and tested with the shim preloaded), but
+it is **not** safe in an image that also runs other musl binaries — `pip`,
+`python`, `node`, your app, etc. `libnvshim.so` exports glibc-only symbols and
+transitively pulls in `gcompat` (via `DT_NEEDED libdl.so.2`). Forcing that
+into every process tends to crash CPython and other musl interpreters with
+`SIGSEGV` (exit code 139) at startup.
+
+For multi-process images, scope the preload to ffmpeg only with a small
+wrapper instead of `ENV LD_PRELOAD`:
+
+```Dockerfile
+# Replace step 5's `LD_PRELOAD=...` ENV line with a wrapper that sets
+# LD_PRELOAD only for the ffmpeg process. Other processes (pip, python,
+# sh, ...) run with a clean environment. The wrapper at /usr/local/bin/ffmpeg
+# also exposes ffmpeg on PATH for your app to call as `ffmpeg`.
+RUN printf '%s\n' \
+    '#!/bin/sh' \
+    'exec env LD_PRELOAD=/usr/local/lib/libnvshim.so /ffmpeg "$@"' \
+    > /usr/local/bin/ffmpeg \
+    && chmod +x /usr/local/bin/ffmpeg
+
+ENV NVIDIA_VISIBLE_DEVICES=all \
+    NVIDIA_DRIVER_CAPABILITIES=compute,utility,video
+# (no ENV LD_PRELOAD here)
+```
+
+`/usr/local/bin/ffmpeg` (the wrapper) execs `/ffmpeg` (the static-ffmpeg bash
+wrapper that downgrades the benign teardown SIGSEGV) which execs
+`/ffmpeg.bin` (the real ELF). Exit codes propagate unchanged via `exec`. Your
+app continues to call `ffmpeg` from `PATH` as normal.
+
+If you also invoke `ffprobe` against CUDA-accelerated decoders and see it
+crash, wrap it the same way (rename the copied binary to `ffprobe.bin` first
+and put the wrapper at `/usr/local/bin/ffprobe`). For most ffprobe use cases
+this isn't needed.
+
+#### Limitations
+
+- `--enable-cuda-nvcc` and `--enable-libnpp` are **not** included — they require
+  the full glibc-based CUDA toolkit and would defeat the static/musl design.
+  Use `scale_cuda` instead of `scale_npp` for GPU resizing.
+- The CUDA variant is **not fully static**. The binary depends on the musl
+  loader/libc that ship in the `alpine` base of the `final-cuda` stage. If you
+  copy the binary into another image, that image must provide a compatible
+  musl libc (i.e. an Alpine-based image of the matching `musl` major version).
+- Without `--gpus all` (or without the NVIDIA Container Toolkit) the binary
+  still runs but `nvenc`/`nvdec`/`cuda` initialization will fail at runtime.
+- amd64 only.
+
 ### Fonts usage with SVG or draw text filters etc
 
 The image ships with some basic fonts (`font-terminus font-inconsolata font-dejavu font-awesome`) that can be used when running the image directly. If your copying the binaries into some image you have to install fonts somehow. How to do this depends a bit on distributions but in general look for font packages and how to make [fontconfig](https://www.freedesktop.org/wiki/Software/fontconfig/) know about them.
@@ -288,6 +447,5 @@ usage and potential distribution of such.
 
 - Add libopenapv
 - Add libplacebo, chromaprint, etc. ...
-- Add acceleration support (GPU, CUDA, ...)
 - Add *.a *.so libraries, headers and pkg-config somehow
 
diff --git a/checkelf b/checkelf
index b4233b4..d500d42 100755
--- a/checkelf
+++ b/checkelf
@@ -1,14 +1,35 @@
 #!/bin/sh
 set -eu
 
-NOEXTLIBS=$(test "$(ldd "$1" | wc -l)" -eq 1 && echo yes || echo no)
+# Usage: checkelf [--cuda] <binary>
+#
+# In default mode the binary must have NO external library deps (fully static-pie).
+# In --cuda mode the binary is a musl dynamic-PIE: only the musl loader and libc
+# (which are the same .so) are allowed as NEEDED entries, so that ffmpeg can
+# dlopen() the NVIDIA driver libs (libcuda.so.1, libnvcuvid.so, libnvidia-encode.so)
+# at runtime when the container is started with `--gpus all`.
+
+MODE=default
+if [ "${1:-}" = "--cuda" ]; then
+    MODE=cuda
+    shift
+fi
+
+if [ "$MODE" = "cuda" ]; then
+    # Allow only the musl loader / libc lines from `ldd`. Anything else is unexpected.
+    EXTRA=$(ldd "$1" 2>/dev/null | grep -E -v 'ld-musl|libc\.musl|statically linked' || true)
+    NOEXTLIBS=$(test -z "$EXTRA" && echo yes || echo no)
+else
+    NOEXTLIBS=$(test "$(ldd "$1" | wc -l)" -eq 1 && echo yes || echo no)
+fi
 RELRO=$(readelf -l "$1" | grep -q GNU_RELRO && echo yes || echo no)
 BIND_NOW=$(readelf -d "$1" | grep -q BIND_NOW && echo yes || echo no)
 PIE=$(readelf -h "$1" | grep -q DYN && echo yes || echo no)
 STACKNX=$(readelf -W -l "$1" | grep GNU_STACK | grep -q -v RWE && echo yes || echo no)
 
 file "$1"
-echo "No external libs: $NOEXTLIBS"
+echo "Mode: $MODE"
+echo "No unexpected external libs: $NOEXTLIBS"
 echo "Relocate read-only: $RELRO"
 echo "Resolve at startup: $BIND_NOW"
 echo "Position independent code: $PIE"
diff --git a/docs/ffmpeg-with-cuda.md b/docs/ffmpeg-with-cuda.md
new file mode 100644
index 0000000..e1286dc
--- /dev/null
+++ b/docs/ffmpeg-with-cuda.md
@@ -0,0 +1,549 @@
+# Adding NVIDIA CUDA / NVENC / NVDEC support to `static-ffmpeg`
+
+**Date:** 2026-04-24 → 2026-05-03
+**Tracking issue:** [#480 — Support for CUDA](https://github.com/wader/static-ffmpeg/issues/480)
+**Outcome:** Separate `:<tag>-cuda` image variant; default `:<tag>` remains a fully static-pie binary.
+
+---
+
+## TL;DR
+
+| | Default `:8.1` | CUDA `:8.1-cuda` |
+|---|---|---|
+| Linkage | static-pie musl | musl **dynamic-PIE** (libc only) |
+| `readelf -d` NEEDED | (none) | exactly one: `libc.musl-x86_64.so.1` |
+| GPU | ❌ | ✅ NVENC / NVDEC / CUVID |
+| Arch | amd64 + arm64 | amd64 only |
+| Base image | scratch | alpine |
+| ffmpeg exit codes | upstream | identical to upstream |
+
+The CUDA variant works on Alpine + musl by combining six independently-essential
+layers (link-time + runtime). Each layer fixes one specific failure mode that
+appeared during development. The layers are summarized below; full
+problem → cause → fix sections follow.
+
+| # | Layer | Stage | Fixes |
+|---|---|---|---|
+| 1 | Absolute-path link of `/lib/ld-musl-x86_64.so.1` | builder | dlopen returning NULL silently (P1) |
+| 2 | Dynamic-PIE link mode (`-fPIE -pie`, not `-static-pie`) | builder | dlopen impossible on static-pie (P1) |
+| 3 | `/etc/ld-musl-x86_64.path` listing toolkit injection dirs | runtime | musl can't find `/usr/lib64`, `/usr/lib/wsl/lib` (P3) |
+| 4 | `gcompat` package + `libdl.so.2 → libgcompat.so.0` symlink | runtime | NVIDIA driver libs need `libc.so.6` / `libdl.so.2` (P4) |
+| 5 | `libnvshim.so` LD_PRELOAD (ABI-shim symbols only) | runtime | glibc-internal symbols missing from gcompat (P4) |
+| 6 | Bash entrypoint wrapper (139 → 0 only) | runtime | benign teardown SIGSEGV from libcuda dtors (P5) |
+
+---
+
+## 1. Architecture decision
+
+### Two separate variants, not one
+
+- The default `mwader/static-ffmpeg` is a fully static-pie musl binary that drops into `FROM scratch`. We must not silently break that for existing users.
+- CUDA requires `dlopen()` of host driver libraries → fundamentally incompatible with `static-pie` on musl (no dynamic loader).
+- CUDA users need the NVIDIA Container Toolkit and a GPU host — different deployment.
+- → Different tag = explicit user opt-in + clear support boundary.
+
+### Build-arg `ENABLE_CUDA`
+
+A single `ARG ENABLE_CUDA=` controls everything:
+
+- Adds `nv-codec-headers` (header-only, no CUDA toolkit at build time).
+- Adds `--enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec`.
+- Switches link mode from `static-pie` to musl dynamic-PIE.
+- Sets `NVIDIA_VISIBLE_DEVICES=all` and `NVIDIA_DRIVER_CAPABILITIES=compute,utility,video`.
+- Writes `/etc/ld-musl-x86_64.path` so musl's loader can find toolkit-injected libs.
+- Switches `checkelf` to `--cuda` mode (allows libc as the only NEEDED entry).
+
+CI builds two images per release: default (no arg) and `final-cuda` target with `ENABLE_CUDA=1`.
+
+### Explicitly NOT supported
+
+| Feature | Reason |
+|---|---|
+| `--enable-cuda-nvcc` | Requires the full ~3 GB glibc-based CUDA toolkit at build time |
+| `--enable-libnpp` / `scale_npp` | Same — glibc-only; use `scale_cuda` instead |
+| `arm64` | NVIDIA Container Toolkit on arm64 is server-class only (Jetson uses a different stack) |
+| `FROM scratch` / distroless target images | No musl loader available |
+
+---
+
+## 2. Problem → Root cause → Fix
+
+Each subsection records one failure mode encountered during development.
+
+---
+
+### P1. `[h264_nvenc] Cannot load libcuda.so.1` — `dlopen()` silently returns NULL
+
+**Symptom.** Binary builds, `checkelf --cuda` passes, but at runtime
+`dlopen("libcuda.so.1")` returns NULL. `strace -e openat` shows ffmpeg never
+even attempts to open any libcuda file — no syscall fires at all.
+
+**Root cause.** Two independent musl traps stacked together:
+
+1. **`-static-pie` has no dynamic loader.** A static-pie musl binary cannot
+   `dlopen()` anything by definition.
+2. **musl's static `libc.a` ships a 25-byte `dlopen` stub** that always returns
+   `NULL` (it only records a `dlerror()` message). Even after switching to dynamic-PIE, gcc's
+   `--toolchain=hardened` spec file kept emitting late references that pulled
+   `libc.a` back in, restoring the stub inside the binary. The bug was
+   invisible to standard checks: `BIND_NOW`, `RELRO`, `PIE`, NX stack all
+   passed; `ldd` still showed only one extra NEEDED entry. Only
+   `readelf -s --dyn-syms /ffmpeg | grep dlopen` revealed:
+   ```
+   21987: 000000000338c50e   25 FUNC WEAK DEFAULT 14 dlopen
+   ```
+   — `dlopen` defined inside `.text` at 25 bytes, not `UND`.
+
+   Variants tried that did NOT fix it:
+   - `--extra-libs=' -lgomp -Wl,-Bdynamic -lc '` reorder — gcc spec file re-pulled `libc.a`.
+   - Hiding `/usr/lib/libc.a` during link — broke libgme configure-time symbol checks.
+
+**Fix (Layers 1 + 2).**
+
+1. Link mode: replace `add_ldexeflags -fPIE -static-pie` with `-fPIE -pie`.
+2. Link the musl combined loader/libc by **absolute path** in
+   `--extra-ldflags`, so the linker resolution is immune to `-Bstatic` /
+   `-Bdynamic` toggles and gcc spec-file re-emissions:
+   ```sh
+   --extra-ldflags='-fopenmp -Wl,--allow-multiple-definition \
+       -Wl,-z,stack-size=2097152 \
+       -Wl,--no-as-needed,/lib/ld-musl-x86_64.so.1,--as-needed \
+       -Wl,--as-needed -Wl,-Bstatic \
+       -static-libstdc++ -static-libgcc'
+   --extra-libs='-lgomp -Wl,-Bdynamic -lc'
+   ```
+
+   On Alpine, `/lib/ld-musl-x86_64.so.1` is *both* the dynamic loader and libc;
+   one absolute filename covers everything we needed `-lc` for. An absolute
+   filename is opened literally regardless of `-Bstatic` mode and cannot be
+   re-resolved against `libc.a`.
+
+**Verification.**
+```sh
+readelf -s --dyn-syms /ffmpeg | grep -E 'dlopen|dlsym|dlerror|dlclose'
+# Each must be 0-size UND, OR not exported (resolved internally against
+# the absolute-path libc — both work). The functional NVENC encode is
+# the ground truth; readelf is the cheap pre-flight.
+```
+
+**Lesson.** Never link musl `libc.a` into a binary that calls `dlopen` — it
+will silently use the stub. The `-Bdynamic -lc -Bstatic` reorder is fragile
+under `--toolchain=hardened`; prefer the absolute-path form.
+
+---
+
+### P2. `checkelf` rejects the dynamic-PIE binary
+
+**Symptom.** The CUDA build's hardening check rejects the binary because it
+has a `NEEDED` entry (libc), whereas the default build has zero.
+
+**Fix.** Add a `--cuda` flag to `checkelf`. In `--cuda` mode it allows the
+musl loader/libc entry in the `ldd` output (everything else is still rejected).
+All other hardening checks (RELRO, BIND_NOW, PIE, NX stack) are preserved.
+
+---
+
+### P3. `dlopen("libcuda.so.1")` reports "Library not found"
+
+**Symptom.** With driver libs actually mounted by the toolkit,
+`dlopen("libcuda.so.1")` still fails with "Library not found".
+
+**Root cause.** musl's default loader search path is
+`/lib:/usr/local/lib:/usr/lib`. The NVIDIA Container Toolkit injects driver
+libs into `/usr/lib64` (RHEL/Fedora/WSL convention),
+`/usr/lib/x86_64-linux-gnu` (Debian/Ubuntu) or `/usr/lib/wsl/lib` (WSL2). musl also doesn't read
+`/etc/ld.so.cache`, so the toolkit's `ldconfig` post-start hook is silently
+ignored.
+
+**Fix (Layer 3).** Ship a static `/etc/ld-musl-x86_64.path`:
+```
+/usr/lib/x86_64-linux-gnu
+/usr/lib64
+/usr/lib/wsl/lib
+/usr/lib
+/usr/local/lib
+/lib
+```
+Listing all of them is safe — musl silently skips paths that don't exist.
+
+---
+
+### P4. NVIDIA driver libs reference glibc-internal symbols missing from musl
+
+**Symptom.** Even with libs found, `dlopen("libcuda.so.1.1")` (the WSL2
+backend) fails with `Error relocating: <sym>: symbol not found`. Iteratively
+discovered missing symbols: `gnu_get_libc_version`, `__register_atfork`,
+`dlmopen`, `dlvsym`, etc.
+
+**Root cause.** NVIDIA driver libs are built against glibc.
+`gcompat` provides `libc.so.6` / `libm.so.6` / `libpthread.so.0` /
+`librt.so.1` as musl wrappers, but is missing `libdl.so.2` (musl folds
+`dlopen` into libc) and a number of glibc-internal helpers used by recent
+drivers.
+
+**Fix (Layers 4 + 5).**
+
+- Install `gcompat` package.
+- Symlink `libdl.so.2 → libgcompat.so.0` (driver's `DT_NEEDED libdl.so.2`).
+- Build a small `libnvshim.so` exporting the missing glibc-internal symbols
+  and `LD_PRELOAD` it. Final shim payload:
+
+  | Symbol | Implementation |
+  |---|---|
+  | `gnu_get_libc_version` | return `"2.35"` |
+  | `gnu_get_libc_release` | return `"stable"` |
+  | `__libc_current_sigrtmin` / `__libc_current_sigrtmax` | musl macros exposed as functions |
+  | `__register_atfork` | redirect to `pthread_atfork` |
+  | `__cxa_thread_atexit_impl` | no-op |
+  | `__libc_single_threaded` | data symbol, value 0 |
+  | `secure_getenv` | redirect to `getenv` |
+  | `dlmopen` | redirect to `dlopen` (ignore Lmid_t) |
+  | `dlvsym` | redirect to `dlsym` (ignore version) |
+  | `__libc_dlopen_mode` / `__libc_dlsym` / `__libc_dlclose` | wrappers |
+
+  > **Critical: `libnvshim.so` must NOT export `exit` / `_exit` / `_Exit`.**
+  > See P6 — interposing those swallows ffmpeg's real exit status.
+
+**Maintenance note.** Each new NVIDIA driver release may reference one more
+glibc-internal symbol. The diagnostic recipe in §3 finds it in under 5 minutes;
+the fix is typically a one-line addition to `libnvshim.so`.
+
+---
+
+### P5. NVENC encode succeeds but exits 139 (SIGSEGV) at process teardown
+
+**Symptom.** Encode completes successfully (`frame= 60 ... muxing overhead`,
+output bytes fully written), then ffmpeg exits with 139.
+
+**Root cause.** libcuda's `__cxa_finalize` / `DT_FINI` destructors run during
+`avcodec_close → nvenc_free → cuCtxDestroy` while still inside `main()`.
+Those destructors call into glibc-internal state (TLS-destructor unwinding,
+pthread_atfork handlers) that musl + gcompat don't fully provide, and crash.
+
+Because the crash is inside `main()` (not after `exit()` is called), no
+in-process hook — atexit, `LD_PRELOAD` signal handlers, etc. — can suppress
+it cleanly. Attempts at in-process suppression all failed:
+
+| Attempt | Result |
+|---|---|
+| `nvshim` `exit()` interpose + atexit `_exit()` | SIGSEGV happens *before* `main()` returns; atexit never runs |
+| In-process signal handler | Same — crash is in destructor before signal can dispatch |
+
+**Fix (Layer 6).** Out-of-process bash entrypoint wrapper that captures the
+real exit code via `${PIPESTATUS[0]}` and downgrades **only** `139 → 0`,
+gated on stderr containing no recognized error keyword. Real failures
+(mid-encode CUDA OOM, init failures, etc.) propagate unchanged because they
+always print an identifiable error first.
+
+```bash
+#!/bin/bash
+errfile=$(mktemp)
+shellerr=$(mktemp)
+trap "rm -f \"$errfile\" \"$shellerr\"" EXIT
+exec 3>&1
+exec 4>&2
+exec 2>"$shellerr"
+{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&4
+rc=${PIPESTATUS[0]}
+exec 3>&-
+exec 2>&4 4>&-
+# Filter the bash job-control "Segmentation fault (core dumped)" line.
+grep -vE "Segmentation fault.*core dumped.*/ffmpeg" "$shellerr" >&2 || true
+# Suppress *only* the known-benign teardown SIGSEGV.
+if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then
+    exit 0
+fi
+exit "$rc"
+```
+
+ffprobe doesn't need the wrapper — it doesn't open NVENC encoders, so the
+crashing destructor path isn't reached.
+
+---
+
+### P6. ffmpeg silently exits 0 on every error path
+
+**Symptom.** Every fatal-error invocation of the CUDA build returned exit
+code `0` to the shell, despite ffmpeg printing the correct error messages.
+Verified against the non-CUDA `:8.1` baseline:
+
+| Scenario | non-CUDA `:8.1` | CUDA (broken) | CUDA (fixed) |
+|---|---|---|---|
+| `-c:v this_codec_does_not_exist` | `8` | `0` ❌ | `8` ✅ |
+| `-i /no/such/file.mp4` | `254` | `0` ❌ | `254` ✅ |
+| `-vf this_filter_does_not_exist` | `8` | `0` ❌ | `8` ✅ |
+| Successful encode | `0` | `0` ✅ | `0` ✅ |
+| Successful encode (post-teardown SEGV) | n/a | `139` (raw) | `0` (wrapped) |
+
+This was masked at first by an "upgrade exit 0 → 1 when stderr matches a
+fatal-error keyword" branch in the wrapper. That made tests pass with a
+plausible-looking exit `1`, but it was a workaround, not a fix — the wrong
+exit code (`1` instead of `8`/`254`) broke any caller that switched on the
+specific code.
+
+**Root-cause discovery.** An `LD_PRELOAD` `dladdr` tracer interposing `_exit`
+revealed that on every code path — bad-codec, bad-input, even successful
+`-version` — the call to `_exit` came from `libnvshim.so`:
+```
+[exittrace] _exit(0) ra=0x...  dso=/usr/local/lib/libnvshim.so
+```
+
+`libnvshim.so` had been given an `_exit` interposer (and at one point an
+`exit` interposer too) as part of the abandoned in-process attempt to
+suppress the teardown SIGSEGV (P5). The interposer always invoked
+`syscall(SYS_exit_group, 0)` — i.e. it dropped ffmpeg's real exit status
+and hard-coded `0`. None of the standard ELF / readelf / `nm` checks flag
+this: the interposer is in a separately-loaded DSO, not in `/ffmpeg`, and
+musl's PLT happily binds `_exit` to whichever DSO comes first in symbol
+search order — `LD_PRELOAD` always wins.
+
+**Fix.** Drop the `_exit` (and `exit`) overrides from `libnvshim.so`
+entirely. They were never needed for any glibc→musl ABI gap (those are all
+the symbols in P4). Process-lifecycle suppression belongs in the
+out-of-process bash wrapper (P5), where it can read the real exit status via
+`${PIPESTATUS[0]}` and pattern-match on actual error keywords.
+
+After removing the interposers, all standard ffmpeg exit codes match the
+non-CUDA build exactly.
+
+**Lesson (now baked into Layer 5).** `LD_PRELOAD` shims should be the
+*minimum* symbol set that closes the glibc→musl ABI gap. Any
+process-lifecycle hook (exit, signal, atexit) added to such a shim will
+silently apply to *every* call from the host program, not just the one
+CUDA-driver call you were trying to fix. **Keep lifecycle policy
+out-of-process.**
+
+---
+
+### P7. Other small issues encountered (one-line each)
+
+| # | Issue | Fix |
+|---|---|---|
+| 1 | `nv-codec-headers` checksum mismatch | Recompute SHA256 against actual GitHub release tarball |
+| 2 | ffmpeg link failed because `LDFLAGS` was set unconditionally and conflicted with `-static-pie` in non-CUDA branch | Gate the `LDFLAGS` export on `ENABLE_CUDA` only |
+| 3 | Spurious dynamic deps (`libgomp`, `libdrm`, …) | Pre-link with `-Wl,-Bstatic` + `-static-libgcc -static-libstdc++` |
+| 4 | Toolkit only mounted 180 KB stub `libcuda.so.1` (no `libnvcuvid` / `libnvidia-encode`) | Bake `ENV NVIDIA_DRIVER_CAPABILITIES=compute,video,utility` into image |
+| 5 | WSL2 + nvidia-container-toolkit 1.19 SIGSEGV during prestart hook | Host-side regression unrelated to image; `wsl --shutdown` + restart |
+
+---
+
+## 3. Diagnostics
+
+### 3a. Quick image probe (link state, env, driver libs, dlopen, encode)
+
+```sh
+IMG=mwader/static-ffmpeg:8.1-cuda
+docker run --rm --gpus all --entrypoint sh "$IMG" -c '
+  apk add --no-cache gcc musl-dev binutils strace >/dev/null
+
+  echo "=== 1. Linkage ==="
+  ldd /ffmpeg
+  readelf -d /ffmpeg | grep -E "NEEDED|BIND_NOW"
+
+  echo "=== 2. musl loader path ==="
+  cat /etc/ld-musl-x86_64.path
+
+  echo "=== 3. Driver libs mounted ==="
+  ls -lh /usr/lib64/libcuda.so.1 /usr/lib64/libnv*.so.1 \
+         /usr/lib/wsl/drivers/nv_dispi.inf_amd64_*/libcuda.so.1.1 2>/dev/null
+
+  echo "=== 4. Standalone dlopen + cuInit ==="
+  cat > /t.c <<EOF
+#include <dlfcn.h>
+#include <stdio.h>
+int main(void){
+  void *h = dlopen("libcuda.so.1", RTLD_LAZY);
+  if(!h){fprintf(stderr,"FAIL: %s\n",dlerror());return 1;}
+  int (*ci)(unsigned)=(int(*)(unsigned))dlsym(h,"cuInit");
+  fprintf(stderr,"cuInit=%d\n", ci?ci(0):-99);
+  return 0;
+}
+EOF
+  gcc /t.c -o /t && /t
+
+  echo "=== 5. ffmpeg openat trace for h264_nvenc ==="
+  strace -e trace=openat,access -f -o /tmp/ff.strace /ffmpeg \
+      -hide_banner -loglevel error \
+      -f lavfi -i testsrc=size=320x240:rate=30 -t 1 \
+      -c:v h264_nvenc -f null - 2>&1 | tail -3
+  grep -E "cuda|nvidia|nvcuvid|libnv|/dev/dxg|/dev/nvidia" /tmp/ff.strace | head -40
+'
+```
+
+### 3b. "Wrong exit code" regression check (guards against P6)
+
+```sh
+docker run --rm --gpus all --entrypoint sh "$IMG" -c '
+  apk add --no-cache gcc musl-dev >/dev/null
+  cat > /tmp/t.c <<EOF
+#define _GNU_SOURCE
+#include <dlfcn.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+__attribute__((noreturn)) void _exit(int s){
+  void *ra=__builtin_return_address(0); Dl_info i={0}; dladdr(ra,&i);
+  dprintf(2,"[trace] _exit(%d) dso=%s\n",s,i.dli_fname?i.dli_fname:"?");
+  syscall(SYS_exit_group,s); __builtin_unreachable();
+}
+EOF
+  gcc -O0 -fPIC -shared -o /tmp/t.so /tmp/t.c -ldl
+  LD_PRELOAD="/tmp/t.so:${LD_PRELOAD}" /ffmpeg -hide_banner -loglevel error \
+    -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \
+    -c:v this_codec_does_not_exist -f null -
+'
+# The traced _exit MUST show dso=/lib/ld-musl-x86_64.so.1 (i.e. real libc).
+# If it shows dso=/usr/local/lib/libnvshim.so → P6 regression is back.
+```
+
+### 3c. dlopen-stub regression check (guards against P1)
+
+```sh
+docker run --gpus all --rm --entrypoint sh "$IMG" -c '
+  apk add --no-cache binutils >/dev/null 2>&1
+  readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror|dlclose"
+'
+# Each must be 0-size UND (or not exported at all). A non-zero size in .text
+# (e.g. " 25 FUNC ... 14 dlopen") means the static stub bug is back.
+```
+
+---
+
+## 4. Build & verify
+
+### Build
+
+```sh
+cd /path/to/static-ffmpeg
+
+docker build --no-cache \
+    --build-arg ENABLE_CUDA=1 \
+    --target final-cuda \
+    -t mwader/static-ffmpeg:8.1-cuda .
+```
+
+> Use `--no-cache` if you previously built `:8.1-cuda` with broken link
+> flags — Docker will otherwise reuse the cached ffmpeg layer that contains
+> the static `dlopen` stub. Full rebuild ~45–75 min (libaom, libvmaf, x265,
+> svt-av1, vvenc dominate).
+
+If you only changed the `final-cuda` stage (env, ld-musl path, wrapper),
+`--no-cache` is unnecessary.
+
+### Final verification recipe (all five must pass)
+
+```sh
+IMG=mwader/static-ffmpeg:8.1-cuda
+
+# 1. Static-ness check (exactly one NEEDED entry: musl libc)
+docker run --rm --entrypoint sh "$IMG" -c '
+  apk add --no-cache binutils >/dev/null 2>&1
+  readelf -d /ffmpeg | grep -E "NEEDED|BIND_NOW"
+'
+
+# 2. NVENC encode end-to-end
+docker run --rm --gpus all "$IMG" \
+    -hide_banner -loglevel error \
+    -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \
+    -c:v h264_nvenc -f null - ; echo "exit=$? (must be 0)"
+
+# 3. MP4-to-stdout passthrough (wrapper must not swallow stdout; size check only)
+docker run --rm --gpus all "$IMG" \
+    -hide_banner -loglevel error \
+    -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \
+    -c:v h264_nvenc -f mp4 -movflags frag_keyframe+empty_moov - 2>/dev/null \
+    | wc -c   # must print > 0
+
+# 4. ffprobe sanity (no wrapper)
+docker run --rm --gpus all --entrypoint /ffprobe "$IMG" -version >/dev/null
+echo "exit=$? (must be 0)"
+
+# 5. Exit-code parity vs non-CUDA :8.1 (regression guard for P6)
+docker run --rm --gpus all "$IMG" -hide_banner -loglevel error \
+    -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \
+    -c:v this_codec_does_not_exist -f null - ; echo "exit=$? (must be 8)"
+docker run --rm --gpus all "$IMG" -hide_banner -loglevel error \
+    -i /no/such/file.mp4 -f null - ; echo "exit=$? (must be 254)"
+```
+
+---
+
+## 5. Runtime requirements
+
+### Host
+- NVIDIA driver installed.
+- [NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) installed and configured for Docker.
+- Run with `--gpus all` (or `--runtime=nvidia` + `NVIDIA_VISIBLE_DEVICES`).
+
+### Image-side env (set by Dockerfile)
+- `NVIDIA_VISIBLE_DEVICES=all`
+- `NVIDIA_DRIVER_CAPABILITIES=compute,utility,video`
+  - `compute` → `libcuda.so.1`
+  - `video` → `libnvcuvid.so`, `libnvidia-encode.so`
+  - Dropping `video` still lets `nvidia-smi` work, but `h264_nvenc` then fails with `Cannot load libcuda.so.1`.
+
+### Toolkit driver-injection layouts covered by `/etc/ld-musl-x86_64.path`
+- Debian/Ubuntu hosts → `/usr/lib/x86_64-linux-gnu`
+- RHEL/Fedora hosts   → `/usr/lib64`
+- WSL2                → `/usr/lib/wsl/lib`
+
+---
+
+## 6. Runtime call chain (six layers in action)
+
+```
+docker run --gpus all  ⇒  toolkit injects libcuda.so.1 → /usr/lib64
+                          + sets NVIDIA_DRIVER_CAPABILITIES from image ENV
+       │
+       ▼
+ffmpeg-cuda-entrypoint (bash)               ← Layer 6 (P5)
+       │ runs as a child (pipeline) and waits for its exit status
+       ▼
+/ffmpeg  (musl dynamic-PIE, libc-only NEEDED)               ← Layer 2 (P1)
+       │ ld.so loads libc.musl-x86_64.so.1
+       │   (search path includes /usr/lib64 from /etc/ld-musl-x86_64.path)   ← Layer 3 (P3)
+       │ LD_PRELOAD → /usr/local/lib/libnvshim.so                            ← Layer 5 (P4)
+       ▼
+ffnvcodec dynlink_loader.h:
+       dlopen("libcuda.so.1", RTLD_LAZY)    ← needs Layer 1 (real PLT entry, P1)
+       │
+       ▼ ld.so loads libcuda.so.1 (WSL stub)
+       │   resolves DT_NEEDED libdl.so.2 → libgcompat.so.0                   ← Layer 4 (P4)
+       │
+       ▼ libcuda dlopens its WSL backend libcuda.so.1.1
+       │   resolves glibc-internals via libnvshim.so                         ← Layer 5 (P4)
+       │
+       ▼ encode runs successfully, frames produced, output flushed
+       │
+       ▼ ffmpeg main() → avcodec_close → cuCtxDestroy
+       │   libcuda __cxa_finalize crashes during teardown          ☠ SIGSEGV (P5)
+       │
+       ▼ wrapper sees exit=139, no error keyword in stderr → exit 0         ← Layer 6 (P5)
+```
+
+---
+
+## 7. Comparison with other static ffmpeg + nvenc projects
+
+| Project | Static? | NVENC? | Approach |
+|---|---|---|---|
+| `mwader/static-ffmpeg:8.1` | ✅ static-pie musl | ❌ | Pure static, no dlopen |
+| `mwader/static-ffmpeg:8.1-cuda` | ⚠️ musl dynamic-PIE (libc only) | ✅ | Hybrid — only libc dynamic; `dlopen()` works |
+| BtbN/FFmpeg-Builds (LGPL/GPL) | ⚠️ glibc dynamic + runtime ldconfig | ✅ | Tarball, glibc-linked |
+| HiWay-Media/ffmpeg-nvenc-static | ⚠️ glibc dynamic | ✅ | Bundled libs |
+| markus-perl/ffmpeg-build-script | ⚠️ glibc dynamic | optional | Script, not container |
+
+Of these, only `:8.1-cuda` keeps every codec/lib statically linked — every
+other "static + nvenc" build is glibc-dynamic. The trade-off vs the default
+`:8.1` is exactly one libc.so dependency.
+
+---
+
+## 8. CI / publishing notes
+
+- Default tag: built for `linux/amd64,linux/arm64` as before.
+- CUDA tag: built for `linux/amd64` only.
+  - Pushed as `<tag>-cuda` (and re-tagged manifest-style as `<tag>-cuda-amd64` for clarity).
+  - `latest-cuda` follows latest stable.
+- Use `--target final-cuda` and `--build-arg ENABLE_CUDA=1` in the CI matrix.
+