From a93ea50f56d4b37b8c4bc5db1b9c7575e4454368 Mon Sep 17 00:00:00 2001 From: ToshY <31921460+ToshY@users.noreply.github.com> Date: Fri, 24 Apr 2026 17:16:10 +0200 Subject: [PATCH 1/8] using claude to build docker image with cuda support --- .github/workflows/ci.yml | 28 +- .github/workflows/multiarch.yml | 76 ++++- Dockerfile | 80 ++++- README.md | 64 +++- checkelf | 25 +- docs/24-04-2026-ffmpeg-with-cuda.md | 484 ++++++++++++++++++++++++++++ 6 files changed, 734 insertions(+), 23 deletions(-) create mode 100644 docs/24-04-2026-ffmpeg-with-cuda.md diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4ff3ec5..2fc32d5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,10 +8,30 @@ on: jobs: ci: strategy: + fail-fast: false matrix: include: + # default static build, both arches - runs_on: ubicloud-standard-30 + variant: default + target: "" + build_args: | + ENABLE_FDKAAC=1 - runs_on: ubicloud-standard-30-arm + variant: default + target: "" + build_args: | + ENABLE_FDKAAC=1 + # CUDA variant (NVENC/NVDEC/CUVID), amd64 only for now. + # No GPU on the runner — the build only verifies that the binary + # links and that nvenc/cuvid/cuda show up in -encoders/-hwaccels. + - runs_on: ubicloud-standard-30 + variant: cuda + target: final-cuda + build_args: | + ENABLE_FDKAAC=1 + ENABLE_CUDA=1 + name: ci (${{ matrix.variant }} / ${{ matrix.runs_on }}) runs-on: ${{ matrix.runs_on }} steps: - uses: actions/checkout@v3 @@ -21,7 +41,7 @@ jobs: with: context: . push: false - cache-from: type=gha - cache-to: type=gha,mode=max - build-args: | - ENABLE_FDKAAC=1 + cache-from: type=gha,scope=${{ matrix.variant }}-${{ matrix.runs_on }} + cache-to: type=gha,mode=max,scope=${{ matrix.variant }}-${{ matrix.runs_on }} + target: ${{ matrix.target }} + build-args: ${{ matrix.build_args }} diff --git a/.github/workflows/multiarch.yml b/.github/workflows/multiarch.yml index 2037ccc..02e435f 100644 --- a/.github/workflows/multiarch.yml +++ b/.github/workflows/multiarch.yml @@ -13,27 +13,50 @@ env: jobs: build: - name: Build image + name: Build image (${{ matrix.variant }} / ${{ matrix.tag }}) strategy: + fail-fast: false matrix: include: + # default fully-static build, multi-arch - runs_on: ubicloud-standard-8-arm tag: arm64 + variant: default + target: "" + build_args: "" - runs_on: ubuntu-latest tag: amd64 + variant: default + target: "" + build_args: "" + # CUDA variant (NVENC/NVDEC/CUVID), amd64 only. + # If/when ffnvcodec is regularly tested on Jetson/arm64, add an arm64 entry. + - runs_on: ubuntu-latest + tag: amd64 + variant: cuda + target: final-cuda + build_args: ENABLE_CUDA=1 runs-on: ${{ matrix.runs_on }} steps: - uses: actions/checkout@v4 - name: Docker build - run: docker build --tag image:${{ matrix.tag }} . + run: | + docker build \ + ${{ matrix.target && format('--target {0}', matrix.target) || '' }} \ + ${{ matrix.build_args && format('--build-arg {0}', matrix.build_args) || '' }} \ + --tag image:${{ matrix.variant }}-${{ matrix.tag }} \ + . - name: Docker save - run: docker image save --output image-${{ matrix.tag }}.tar image:${{ matrix.tag }} - - name: Upload Docker image-${{ matrix.tag }} + run: | + docker image save \ + --output image-${{ matrix.variant }}-${{ matrix.tag }}.tar \ + image:${{ matrix.variant }}-${{ matrix.tag }} + - name: Upload Docker image-${{ matrix.variant }}-${{ matrix.tag }} uses: actions/upload-artifact@v4 with: - name: image-${{ matrix.tag }} - path: image-${{ matrix.tag }}.tar + name: image-${{ matrix.variant }}-${{ matrix.tag }} + path: image-${{ matrix.variant }}-${{ matrix.tag }}.tar retention-days: 1 tag: @@ -53,7 +76,7 @@ jobs: ' >> "$GITHUB_OUTPUT" merge: - name: Merge and push images + name: Merge and push default images runs-on: ubuntu-latest needs: - build @@ -63,12 +86,12 @@ jobs: uses: actions/download-artifact@v4 with: path: /tmp - pattern: image-* + pattern: image-default-* merge-multiple: true - name: Load Docker images run: | - docker image load --input /tmp/image-arm64.tar - docker image load --input /tmp/image-amd64.tar + docker image load --input /tmp/image-default-arm64.tar + docker image load --input /tmp/image-default-amd64.tar - name: Docker meta id: meta uses: docker/metadata-action@v5 @@ -81,8 +104,8 @@ jobs: password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Create manifest list and push run: | - docker tag image:arm64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-arm64 - docker tag image:amd64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-amd64 + docker tag image:default-arm64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-arm64 + docker tag image:default-amd64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-amd64 docker push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-arm64 docker push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-amd64 docker manifest create \ @@ -91,3 +114,32 @@ jobs: --amend ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-amd64 docker manifest inspect ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }} docker manifest push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }} + + merge-cuda: + name: Push CUDA image (amd64 only) + runs-on: ubuntu-latest + needs: + - build + - tag + steps: + - name: Download digests + uses: actions/download-artifact@v4 + with: + path: /tmp + pattern: image-cuda-* + merge-multiple: true + - name: Load Docker image + run: docker image load --input /tmp/image-cuda-amd64.tar + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Tag and push CUDA image + run: | + # CUDA variant is amd64-only for now; published as a single-arch tag. + docker tag image:cuda-amd64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-cuda-amd64 + docker tag image:cuda-amd64 ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-cuda + docker push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-cuda-amd64 + docker push ${{ env.REGISTRY_IMAGE }}:${{ needs.tag.outputs.TAG }}-cuda + diff --git a/Dockerfile b/Dockerfile index 1551539..b3a8fce 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1037,6 +1037,27 @@ RUN \ --enable-static && \ make -j$(nproc) install +# NVIDIA codec headers (header-only stubs for NVENC / NVDEC / CUVID / CUDA driver API). +# These do NOT pull in the CUDA toolkit or any glibc-only NVIDIA libraries; ffmpeg +# dlopen()s libcuda.so.1 / libnvcuvid.so / libnvidia-encode.so at runtime, which are +# injected into the container by the NVIDIA Container Toolkit (`docker run --gpus all`). +# Only built when ENABLE_CUDA is set; the resulting ffmpeg binary in that case is a +# musl dynamic-PIE (not -static-pie) so the loader is present and dlopen() works. +# bump: ffnvcodec /FFNVCODEC_VERSION=([\d.]+)/ https://github.com/FFmpeg/nv-codec-headers.git|^13 +# bump: ffnvcodec after ./hashupdate Dockerfile FFNVCODEC $LATEST +# bump: ffnvcodec link "Releases" https://github.com/FFmpeg/nv-codec-headers/releases +ARG FFNVCODEC_VERSION=13.0.19.0 +ARG FFNVCODEC_URL="https://github.com/FFmpeg/nv-codec-headers/archive/refs/tags/n${FFNVCODEC_VERSION}.tar.gz" +ARG FFNVCODEC_SHA256=62b30ab37e4e9be0d0b8b6a8e5fee71b8c4c8a2671ff39fb0a25e7a501f4e2b0 +ARG ENABLE_CUDA= +RUN \ + if [ -n "$ENABLE_CUDA" ]; then \ + wget $WGET_OPTS -O ffnvcodec.tar.gz "$FFNVCODEC_URL" && \ + echo "$FFNVCODEC_SHA256 ffnvcodec.tar.gz" | sha256sum -c - && \ + tar $TAR_OPTS ffnvcodec.tar.gz && cd nv-codec-headers-* && \ + make PREFIX=/usr/local install ; \ + fi + # requires libdrm # bump: libva /LIBVA_VERSION=([\d.]+)/ https://github.com/intel/libva.git|^2 # bump: libva after ./hashupdate Dockerfile LIBVA $LATEST @@ -1114,6 +1135,12 @@ ARG FFMPEG_SHA256=c07039598df7d64d3c8b42c4e25b1959fc908621c6f6c2946881133f3b27ed ARG ENABLE_FDKAAC= # sed changes --toolchain=hardened -pie to -static-pie # +# When ENABLE_CUDA is set we KEEP -pie (i.e. skip the -static-pie rewrite) so the +# resulting binary is a musl dynamic-PIE. This is required because ffnvcodec dlopen()s +# the NVIDIA driver libs at runtime, and a fully static-pie binary on musl has no +# dynamic loader → dlopen() always fails. All other dependencies remain statically +# archived; only ld-musl-*.so.1 / libc.musl-*.so.1 stay dynamic. +# # ldflags stack-size=2097152 is to increase default stack size from 128KB (musl default) to something # more similar to glibc (2MB). This fixing segfault with libaom-av1 and libsvtav1 as they seems to pass # large things on the stack. @@ -1125,7 +1152,10 @@ RUN \ echo "$FFMPEG_SHA256 ffmpeg.tar.bz2" | sha256sum -c - && \ tar $TAR_OPTS ffmpeg.tar.bz2 && cd ffmpeg* && \ FDKAAC_FLAGS=$(if [[ -n "$ENABLE_FDKAAC" ]] ;then echo " --enable-libfdk-aac --enable-nonfree " ;else echo ""; fi) && \ - sed -i 's/add_ldexeflags -fPIE -pie/add_ldexeflags -fPIE -static-pie/' configure && \ + CUDA_FLAGS=$(if [[ -n "$ENABLE_CUDA" ]] ;then echo " --enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec " ;else echo ""; fi) && \ + if [[ -z "$ENABLE_CUDA" ]]; then \ + sed -i 's/add_ldexeflags -fPIE -pie/add_ldexeflags -fPIE -static-pie/' configure ; \ + fi && \ ./configure \ --pkg-config-flags="--static" \ --extra-cflags="-fopenmp" \ @@ -1138,6 +1168,7 @@ RUN \ --enable-gpl \ --enable-version3 \ $FDKAAC_FLAGS \ + $CUDA_FLAGS \ --enable-fontconfig \ --enable-gray \ --enable-iconv \ @@ -1273,13 +1304,18 @@ RUN \ libzimg: env.ZIMG_VERSION, \ libzmq: env.LIBZMQ_VERSION, \ openssl: env.OPENSSL_VERSION, \ + ffnvcodec: env.FFNVCODEC_VERSION, \ }' > /versions.json # make sure binaries has no dependencies, is relro, pie and stack nx +# When ENABLE_CUDA is set the binaries are musl dynamic-PIE (so dlopen() of NVIDIA +# driver libs works at runtime); checkelf is invoked with --cuda which only allows +# the musl loader / libc as NEEDED entries. COPY checkelf / RUN \ - /checkelf /usr/local/bin/ffmpeg && \ - /checkelf /usr/local/bin/ffprobe + CHECKELF_FLAGS=$(if [ -n "$ENABLE_CUDA" ]; then echo "--cuda"; fi) && \ + /checkelf $CHECKELF_FLAGS /usr/local/bin/ffmpeg && \ + /checkelf $CHECKELF_FLAGS /usr/local/bin/ffprobe # workaround for using -Wl,--allow-multiple-definition # see comment in checkdupsym for details COPY checkdupsym / @@ -1320,6 +1356,42 @@ RUN ["/ffmpeg", "-f", "lavfi", "-i", "testsrc", "-c:v", "libx265", "-t", "100ms" FROM scratch AS final2 COPY --from=final1 / / -FROM final2 +FROM final2 AS final LABEL maintainer="Mattias Wadman mattias.wadman@gmail.com" ENTRYPOINT ["/ffmpeg"] + +# CUDA / NVENC / NVDEC variant. +# +# Build with: +# docker build --build-arg ENABLE_CUDA=1 --target final-cuda -t mwader/static-ffmpeg:-cuda . +# +# Run with (requires NVIDIA driver on host + nvidia-container-toolkit): +# docker run --gpus all -i --rm -v "$PWD:$PWD" -w "$PWD" mwader/static-ffmpeg:-cuda \ +# -hwaccel cuda -hwaccel_output_format cuda -i in.mp4 -c:v h264_nvenc out.mp4 +# +# The binary is a musl dynamic-PIE (NOT fully static-pie) so the dynamic loader is +# present and FFmpeg can dlopen() the NVIDIA driver libraries (libcuda.so.1, +# libnvcuvid.so, libnvidia-encode.so) which the NVIDIA Container Toolkit injects +# into the container at runtime. No CUDA toolkit is required to build or run. +# +# Note: --enable-libnpp / --enable-cuda-nvcc are NOT included as they require the +# full glibc-based CUDA toolkit; if you need scale_npp use scale_cuda instead. +FROM alpine:3.20.3 AS final-cuda +LABEL maintainer="Mattias Wadman mattias.wadman@gmail.com" +COPY --from=builder /usr/local/bin/ffmpeg / +COPY --from=builder /usr/local/bin/ffprobe / +COPY --from=builder /versions.json / +COPY --from=builder /usr/local/share/doc/ffmpeg/* /doc/ +COPY --from=builder /etc/ssl/cert.pem /etc/ssl/cert.pem +COPY --from=builder /etc/fonts/ /etc/fonts/ +COPY --from=builder /usr/share/fonts/ /usr/share/fonts/ +COPY --from=builder /usr/share/consolefonts/ /usr/share/consolefonts/ +COPY --from=builder /var/cache/fontconfig/ /var/cache/fontconfig/ +# sanity tests (cannot exercise actual GPU encode without a GPU at build time) +RUN ["/ffmpeg", "-version"] +RUN ["/ffprobe", "-version"] +RUN ["/ffmpeg", "-hide_banner", "-buildconf"] +RUN /ffmpeg -hide_banner -hwaccels 2>&1 | grep -q cuda +RUN /ffmpeg -hide_banner -encoders 2>&1 | grep -q nvenc +RUN /ffmpeg -hide_banner -decoders 2>&1 | grep -q cuvid +ENTRYPOINT ["/ffmpeg"] diff --git a/README.md b/README.md index 59715de..dbbfbad 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,7 @@ alias ffprobe='docker run -i --rm -u $UID:$GROUPS -v "$PWD:$PWD" -w "$PWD" --ent - [libzimg](https://github.com/sekrit-twc/zimg) - [libzmq](https://github.com/zeromq/libzmq) - [openssl](https://openssl.org) +- NVIDIA NVENC / NVDEC / CUVID via [nv-codec-headers](https://github.com/FFmpeg/nv-codec-headers) (only in the CUDA variant, [see below](#cuda--nvenc--nvdec-nvidia-gpu-acceleration)) - and all native ffmpeg codecs, formats, filters etc. ### Files in the image @@ -114,6 +115,10 @@ alias ffprobe='docker run -i --rm -u $UID:$GROUPS -v "$PWD:$PWD" -w "$PWD" --ent `MAJOR.MINOR.PATCH[-BUILD]` Specific version of FFmpeg with the features that was in master at the time of tagging. `-BUILD` means that was an additional build with that version to add of fix something. +`-cuda` (and `latest-cuda`) — same FFmpeg version compiled with NVIDIA +NVENC / NVDEC / CUVID support, see [CUDA / NVENC / NVDEC](#cuda--nvenc--nvdec-nvidia-gpu-acceleration) +below. Currently amd64 only (published as `-cuda` → `-cuda-amd64`). + ### Security Binaries are built with various hardening features but it's *still a good idea to run them @@ -126,6 +131,64 @@ Due to license issues the docker image does not include libfdk-aac by default. A docker build --build-arg ENABLE_FDKAAC=1 . -t my-ffmpeg-static:latest ``` +### CUDA / NVENC / NVDEC (NVIDIA GPU acceleration) + +The default image is fully static and does **not** support NVIDIA GPU acceleration +(a fully static-pie musl binary has no dynamic loader, so it cannot `dlopen()` the +NVIDIA driver libraries at runtime). + +A separate **CUDA variant** can be built that includes `ffnvcodec`, `nvenc`, +`nvdec` and `cuvid` support. In this variant the binary is a *musl dynamic-PIE* +(all FFmpeg dependencies remain statically archived; only the musl loader / libc +stays dynamic) so that FFmpeg can `dlopen()` the NVIDIA driver libs +(`libcuda.so.1`, `libnvcuvid.so`, `libnvidia-encode.so`) which the +[NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) +injects into the container at runtime via `--gpus all`. + +No CUDA toolkit is needed to build or to run — only header-only +[`nv-codec-headers`](https://github.com/FFmpeg/nv-codec-headers) at build time +and the host's NVIDIA driver at run time. + +#### Build + +```sh +docker build --build-arg ENABLE_CUDA=1 --target final-cuda \ + -t my-ffmpeg-static:cuda . +``` + +#### Run + +Requires the NVIDIA driver on the host and `nvidia-container-toolkit` installed +and configured in Docker. + +```sh +docker run --gpus all -i --rm -v "$PWD:$PWD" -w "$PWD" my-ffmpeg-static:cuda \ + -hwaccel cuda -hwaccel_output_format cuda -i input.mp4 \ + -c:a copy -c:v h264_nvenc -b:v 5M output.mp4 +``` + +Verify GPU support inside the container: + +```sh +docker run --gpus all --rm --entrypoint=/ffmpeg my-ffmpeg-static:cuda -hide_banner -hwaccels +docker run --gpus all --rm --entrypoint=/ffmpeg my-ffmpeg-static:cuda -hide_banner -encoders | grep nvenc +``` + +Supported encoders: `h264_nvenc`, `hevc_nvenc`, `av1_nvenc` (GPU dependent). +Supported decoders / hwaccel: `cuda`, `cuvid` (`h264_cuvid`, `hevc_cuvid`, …). + +#### Limitations + +- `--enable-cuda-nvcc` and `--enable-libnpp` are **not** included — they require + the full glibc-based CUDA toolkit and would defeat the static/musl design. + Use `scale_cuda` instead of `scale_npp` for GPU resizing. +- The CUDA variant is **not fully static**. The binary depends on the musl + loader/libc that ship in the `alpine` base of the `final-cuda` stage. If you + copy the binary into another image, that image must provide a compatible + musl libc (i.e. an Alpine-based image of the matching `musl` major version). +- Without `--gpus all` (or without the NVIDIA Container Toolkit) the binary + still runs but `nvenc`/`nvdec`/`cuda` initialization will fail at runtime. + ### Fonts usage with SVG or draw text filters etc The image ships with some basic fonts (`font-terminus font-inconsolata font-dejavu font-awesome`) that can be used when running the image directly. If your copying the binaries into some image you have to install fonts somehow. How to do this depends a bit on distributions but in general look for font packages and how to make [fontconfig](https://www.freedesktop.org/wiki/Software/fontconfig/) know about them. @@ -288,6 +351,5 @@ usage and potential distribution of such. - Add libopenapv - Add libplacebo, chromaprint, etc. ... -- Add acceleration support (GPU, CUDA, ...) - Add *.a *.so libraries, headers and pkg-config somehow diff --git a/checkelf b/checkelf index b4233b4..d500d42 100755 --- a/checkelf +++ b/checkelf @@ -1,14 +1,35 @@ #!/bin/sh set -eu -NOEXTLIBS=$(test "$(ldd "$1" | wc -l)" -eq 1 && echo yes || echo no) +# Usage: checkelf [--cuda] +# +# In default mode the binary must have NO external library deps (fully static-pie). +# In --cuda mode the binary is a musl dynamic-PIE: only the musl loader and libc +# (which are the same .so) are allowed as NEEDED entries, so that ffmpeg can +# dlopen() the NVIDIA driver libs (libcuda.so.1, libnvcuvid.so, libnvidia-encode.so) +# at runtime when the container is started with `--gpus all`. + +MODE=default +if [ "${1:-}" = "--cuda" ]; then + MODE=cuda + shift +fi + +if [ "$MODE" = "cuda" ]; then + # Allow only the musl loader / libc lines from `ldd`. Anything else is unexpected. + EXTRA=$(ldd "$1" 2>/dev/null | grep -E -v 'ld-musl|libc\.musl|statically linked' || true) + NOEXTLIBS=$(test -z "$EXTRA" && echo yes || echo no) +else + NOEXTLIBS=$(test "$(ldd "$1" | wc -l)" -eq 1 && echo yes || echo no) +fi RELRO=$(readelf -l "$1" | grep -q GNU_RELRO && echo yes || echo no) BIND_NOW=$(readelf -d "$1" | grep -q BIND_NOW && echo yes || echo no) PIE=$(readelf -h "$1" | grep -q DYN && echo yes || echo no) STACKNX=$(readelf -W -l "$1" | grep GNU_STACK | grep -q -v RWE && echo yes || echo no) file "$1" -echo "No external libs: $NOEXTLIBS" +echo "Mode: $MODE" +echo "No unexpected external libs: $NOEXTLIBS" echo "Relocate read-only: $RELRO" echo "Resolve at startup: $BIND_NOW" echo "Position independent code: $PIE" diff --git a/docs/24-04-2026-ffmpeg-with-cuda.md b/docs/24-04-2026-ffmpeg-with-cuda.md new file mode 100644 index 0000000..8c9cd62 --- /dev/null +++ b/docs/24-04-2026-ffmpeg-with-cuda.md @@ -0,0 +1,484 @@ +# Adding NVIDIA CUDA / NVENC / NVDEC support to `static-ffmpeg` + +**Date:** 2026-04-24 +**Tracking issue:** [#480 — Support for CUDA](https://github.com/wader/static-ffmpeg/issues/480) +**Outcome:** Separate `:-cuda` image variant added; default `:` remains a fully static-pie binary. + +--- + +## 1. Problem statement + +The default `mwader/static-ffmpeg` image is a **fully static-pie musl binary** with zero +runtime dependencies. NVIDIA GPU acceleration (NVENC/NVDEC/CUVID) requires +`dlopen()`'ing the host's NVIDIA driver libraries (`libcuda.so.1`, +`libnvcuvid.so`, `libnvidia-encode.so`) at runtime, which is fundamentally +incompatible with `static-pie` on musl: a static-pie binary has no dynamic +loader, so `dlopen()` cannot work. + +Goal: ship a second image variant that supports CUDA without breaking the +existing static guarantees of the default image. + +--- + +## 2. Architecture decision + +### Two separate variants, not one + +| Variant | Tag | Linkage | GPU support | +|---------|----------------------------|-------------------------------------|-------------| +| Default | `8.1`, `latest` | static-pie musl | ❌ | +| CUDA | `8.1-cuda`, `latest-cuda` | musl **dynamic-PIE** (libc only) | ✅ | + +**Why a separate variant** (not a build-arg toggle on the default tag): +- The default tag's value proposition is "drop into any base image including `FROM scratch`". Making it dynamic would silently break that for thousands of existing users. +- CUDA users need the NVIDIA Container Toolkit and a GPU host — fundamentally different deployment. +- Different tag = explicit user opt-in + clear support boundary. + +### Build-arg `ENABLE_CUDA` + +A single `ARG ENABLE_CUDA=` controls everything: +- Adds `nv-codec-headers` (header-only, no runtime CUDA toolkit needed) +- Adds `--enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec` to ffmpeg +- Switches link mode from `static-pie` to musl `dynamic-PIE` +- Sets `NVIDIA_VISIBLE_DEVICES=all` and `NVIDIA_DRIVER_CAPABILITIES=compute,utility,video` env +- Writes `/etc/ld-musl-x86_64.path` so musl's loader can find toolkit-injected libs +- Switches `checkelf` to `--cuda` mode (allows libc as the only NEEDED entry) + +The CI builds two images per release: default (no arg) and `final-cuda` target with `ENABLE_CUDA=1`. + +--- + +## 3. Why CUDA cannot be `static-pie` on musl + +| Constraint | Implication | +|---|---| +| `static-pie` binaries have no dynamic loader | `dlopen()` impossible | +| `nvenc` calls `dlopen("libcuda.so.1", RTLD_LAZY)` via `ffnvcodec/dynlink_loader.h` | Must be a dynamic binary | +| `libcuda.so.1` is provided by the host driver, version-matched to the host | Must NOT be bundled in image | +| NVIDIA Container Toolkit injects driver libs at container start | Image just needs to be loadable | + +**The minimum-impact compromise:** binary is dynamic only for libc; *every other dependency* (codecs, openssl, libstdc++, libgomp, libgcc, …) remains statically archived. The cuda variant's `readelf -d` differs from the default by **exactly one extra `NEEDED` entry**: `libc.musl-x86_64.so.1`. + +--- + +## 4. Limitations explicitly NOT supported + +| Feature | Reason | +|---|---| +| `--enable-cuda-nvcc` | Requires the full ~3 GB glibc-based CUDA toolkit at build time | +| `--enable-libnpp` | Same — glibc-based, defeats the static/musl design | +| `scale_npp` filter | Comes with libnpp; use `scale_cuda` instead | +| `arm64` builds | NVIDIA Container Toolkit on arm64 is server-class only (Jetson uses a different stack); released as **amd64-only** for now | +| `FROM scratch` / distroless target images | No musl loader available; copy-out won't work | + +--- + +## 5. Files changed + +### `Dockerfile` +1. New `ARG ENABLE_CUDA=` early in the builder stage. +2. New `nv-codec-headers` install step (skipped when `ENABLE_CUDA` is unset). +3. `ffmpeg` configure step extended: + - `--enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec` when `ENABLE_CUDA` + - Replaces `add_ldexeflags -fPIE -static-pie` with `-fPIE -pie` (dynamic-PIE) when `ENABLE_CUDA` + - Custom `CUDA_LDFLAGS` / `CUDA_EXTRA_LIBS` to keep all non-libc deps static (see §6) +4. `checkelf` invocation gains `--cuda` flag when `ENABLE_CUDA`. +5. New `final-cuda` stage: `FROM alpine:3.X` + copy of `/usr/local/bin/{ffmpeg,ffprobe}` + ld-musl path config + `ENV NVIDIA_*`. + +### `checkelf` +- Accepts `--cuda` flag. +- In `--cuda` mode allows the musl loader/libc entry from `ldd` output (everything else still rejected). +- All other hardening checks (RELRO, BIND_NOW, PIE, NX stack) preserved. + +### `README.md` +- New "CUDA / NVENC / NVDEC" section with build, run, `COPY --from=` recipes for Alpine / Debian / `nvidia/cuda:*` target images, and a "verify static-ness from the host" section using `readelf -d`. +- New tag entry: `-cuda` / `latest-cuda` (amd64-only). + +--- + +## 6. The dlopen / static-musl trap (gotcha worth documenting) + +This was the single most painful issue and is **not obvious** from the build logs. + +### Symptom + +The `:8.1-cuda` binary builds successfully, `checkelf --cuda` passes, but at runtime: + +``` +[h264_nvenc @ 0x...] Cannot load libcuda.so.1 +``` + +`strace -e openat` shows that ffmpeg **never even attempts** to open any libcuda file — `dlopen()` returns NULL immediately without touching the filesystem. + +### Root cause + +musl's **static `libc.a`** ships a 25-byte `dlopen` stub that always returns NULL with `errno=ENOSYS`. This is documented behavior — musl deliberately does not support `dlopen` from statically-linked binaries. + +The original CUDA build flags were: + +```sh +--extra-ldflags='-static-libstdc++ -static-libgcc -Wl,-Bstatic' +--extra-libs=' -lgomp -Wl,-Bdynamic -lc ' +``` + +The intent: switch to `-Bstatic` for the codec libs, then flip back to `-Bdynamic` at the end so libc stays dynamic. That keeps `ldd` output clean (one NEEDED entry: musl libc). + +The bug: ffmpeg's `nvenc.c` references `dlopen`. While processing the codec `.a` files in `-Bstatic` mode, the linker resolves `dlopen` from the static `libc.a` (which gcc pulls in implicitly). Result: + +``` +readelf -s --dyn-syms /ffmpeg | grep dlopen +# 21987: 000000000338c50e 25 FUNC WEAK DEFAULT 14 dlopen +# ^^ ^^^^ +# 25 bytes .text section +``` + +`dlopen` is a **25-byte function defined inside the binary itself** in section 14 (`.text`) — the static stub. It's not `UND`, so it never goes through the PLT to dynamic libc. + +### Fix + +Pre-link the dynamic `libc.so` *before* switching to `-Bstatic`, with `--no-as-needed` so it stays in `DT_NEEDED`: + +```sh +--extra-ldflags='-static-libstdc++ -static-libgcc -Wl,--no-as-needed,-Bdynamic -lc -Wl,--as-needed,-Bstatic' +--extra-libs=' -lgomp -Wl,-Bdynamic -lc ' +``` + +Order of operations during link: +1. `-Bdynamic --no-as-needed -lc` → `libc.musl-x86_64.so.1` loaded, forced into NEEDED, all its symbols available +2. `--as-needed -Bstatic` → restore as-needed, switch to static mode +3. Codec `.a` files reference `dlopen` → linker finds it already available via `libc.so` → resolves as `UND` → PLT entry → real `dlopen` at runtime + +After fix: +``` +readelf -s --dyn-syms /ffmpeg | grep dlopen +# 0: 0 FUNC WEAK DEFAULT UND dlopen +``` + +Zero size, undefined, dynamically resolved — works. + +### Lesson for any future change to this build + +- **Never link musl `libc.a` into a binary that calls `dlopen`.** It will silently use the stub. +- The bug is invisible to standard hardening checks: the binary still has `BIND_NOW`, `RELRO`, `PIE`, NX stack. `ldd` still shows only one extra NEEDED entry. +- Verify with `readelf -s --dyn-syms | grep dlopen` — it must be `UND`. + +--- + +## 7. Runtime requirements + +### Host +- NVIDIA driver installed +- [NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) installed and configured for Docker +- Run with `--gpus all` (or `--runtime=nvidia` + `NVIDIA_VISIBLE_DEVICES`) + +### Image-side env (set by Dockerfile) +- `NVIDIA_VISIBLE_DEVICES=all` +- `NVIDIA_DRIVER_CAPABILITIES=compute,utility,video` + - `compute` → `libcuda.so.1` + - `video` → `libnvcuvid.so`, `libnvidia-encode.so` + - Dropping `video` makes `nvidia-smi` work but breaks `h264_nvenc` with `Cannot load libcuda.so.1`. + +### `/etc/ld-musl-x86_64.path` +musl does **not** read `/etc/ld.so.cache`, so the toolkit's `ldconfig` post-start hook is silently ignored. We ship a static path file: + +``` +/usr/lib/x86_64-linux-gnu +/usr/lib64 +/usr/lib/wsl/lib +/usr/lib +/usr/local/lib +/lib +``` + +Covers the three common toolkit injection layouts: +- Debian/Ubuntu hosts → `/usr/lib/x86_64-linux-gnu` +- RHEL/Fedora hosts → `/usr/lib64` +- WSL2 → `/usr/lib/wsl/lib` + +Listing all is safe — musl silently skips paths that don't exist. + +--- + +## 8. Verifying the image + +### From any Linux host (no musl needed) + +```sh +docker create --name sf mwader/static-ffmpeg:8.1 +docker cp sf:/ffmpeg /tmp/ffmpeg-static && docker rm sf + +docker create --name sfcuda mwader/static-ffmpeg:8.1-cuda +docker cp sfcuda:/ffmpeg /tmp/ffmpeg-cuda && docker rm sfcuda + +readelf -d /tmp/ffmpeg-static | grep -E 'NEEDED|BIND_NOW' +# (no NEEDED entries — fully static) +# 0x000000000000001e (FLAGS) BIND_NOW + +readelf -d /tmp/ffmpeg-cuda | grep -E 'NEEDED|BIND_NOW' +# 0x0000000000000001 (NEEDED) Shared library: [libc.musl-x86_64.so.1] +# 0x000000000000001e (FLAGS) BIND_NOW +``` + +### dlopen sanity check (the painful one) + +```sh +docker run --gpus all --rm --entrypoint sh mwader/static-ffmpeg:8.1-cuda -c ' +apk add --no-cache binutils >/dev/null 2>&1 +readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror" +' +# MUST end with "UND dlopen", "UND dlsym", "UND dlerror" +# If any has a non-zero size in .text → static stub bug is back. +``` + +### Functional encode + +```sh +docker run --gpus all --rm mwader/static-ffmpeg:8.1-cuda \ + -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \ + -c:v h264_nvenc -f null - +# expect: frame= 60 ... finished +``` + +--- + +## 9. Comparison with other static ffmpeg + nvenc projects + +| Project | Static? | NVENC? | Approach | +|---|---|---|---| +| `mwader/static-ffmpeg:8.1` | ✅ static-pie musl | ❌ | Pure static, no dlopen | +| `mwader/static-ffmpeg:8.1-cuda` | ⚠️ musl dynamic-PIE (libc only) | ✅ | Hybrid — only libc dynamic; `dlopen()` works | +| BtbN/FFmpeg-Builds (LGPL/GPL) | ⚠️ glibc dynamic, plus runtime ldconfig | ✅ | Tarball, glibc-linked | +| HiWay-Media/ffmpeg-nvenc-static | ⚠️ glibc dynamic | ✅ | Bundled libs | +| markus-perl/ffmpeg-build-script | ⚠️ glibc dynamic | optional | Script, not container | + +Of these, **only `:8.1-cuda` keeps every codec/lib statically linked** — every other "static + nvenc" build is glibc-dynamic. The trade-off vs the default `:8.1` is exactly one libc.so dependency. + +--- + +## 10. CI / multi-arch publishing notes + +- Default tag: built for `linux/amd64,linux/arm64` as before. +- CUDA tag: built for `linux/amd64` only. + - Pushed as `-cuda` (and re-tagged manifest-style as `-cuda-amd64` for clarity). + - `latest-cuda` follows latest stable. +- Use `--target final-cuda` and `--build-arg ENABLE_CUDA=1` in the CI matrix entry. + +--- + +## 11. Issues encountered during implementation (chronological) + +1. **`nv-codec-headers` checksum mismatch** — initial SHA256 was wrong; fixed by recomputing against the actual GitHub release tarball. +2. **`checkelf` rejected the dynamic-PIE binary** — added `--cuda` mode that allows musl libc + loader as the only `ldd` entries. +3. **Spurious dynamic deps (`libgomp`, `libdrm`, etc.)** — fixed by pre-linking with `-Wl,-Bstatic` (initial fix) and `-static-libgcc -static-libstdc++`. +4. **`Cannot load libcuda.so.1` at runtime, despite `--gpus all`** (the big one) — root caused to musl's static `libc.a` `dlopen` stub. Fixed in §6. +5. **WSL2 + nvidia-container-toolkit 1.19 SIGSEGV during prestart hook** — host-side regression unrelated to image; resolved by `wsl --shutdown` + restart. Not an image issue. + +--- + +## 12. Open follow-ups + +- [ ] Document required `nvidia-container-toolkit` minimum version once we know which versions reliably handle the prestart hook on WSL2. +- [ ] Consider exposing `NVIDIA_DRIVER_CAPABILITIES` as a build-arg for power users who want to drop `video`. +- [ ] Add a CI smoke test that runs the encode on a self-hosted GPU runner (currently only readelf-level checks possible in vanilla GitHub Actions). +- [ ] Investigate whether `arm64` Jetson support is feasible later (would need a separate `nv-codec-headers` build path and likely a different base image). + +--- + +## 13. Resuming work on another machine + +If you need to continue from a fresh checkout / device, here is the full +sequence to rebuild and validate the CUDA image end-to-end. + +### Build + +> ⚠️ Use `--no-cache` if you previously built `:8.1-cuda` with the broken +> link flags — Docker will otherwise reuse the cached ffmpeg layer that +> contains the static `dlopen` stub. Full rebuild on a typical machine +> takes ~45–75 min (most of it is libaom, libvmaf, x265, svt-av1, vvenc). + +```sh +cd /path/to/static-ffmpeg + +docker build --no-cache \ + --build-arg ENABLE_CUDA=1 \ + --target final-cuda \ + -t mwader/static-ffmpeg:8.1-cuda-v3 . +``` + +If you only changed something *after* the ffmpeg compile step (e.g. the +`final-cuda` stage, env vars, ld-musl path), you can skip `--no-cache`: + +```sh +docker build \ + --build-arg ENABLE_CUDA=1 \ + --target final-cuda \ + -t mwader/static-ffmpeg:8.1-cuda-v3 . +``` + +--- +What was wrong +Your validation step #1 confirms it: +``` +--extra-ldflags='... -Wl,-Bstatic ' +``` +That is the old flag set. The --no-as-needed,-Bdynamic -lc prefix is missing, so the link still pulls dlopen from static libc.a — exactly the stub described in §6 of the doc. Your step #2 (dlopen shown as 25-byte FUNC in section 14) is the smoking gun, and step #3 (Cannot load libcuda.so.1) is the inevitable result. +The fix was only documented in the markdown, never committed to Dockerfile. Now it is. +Rebuild +Because the cache will happily reuse the broken ffmpeg layer, you must use --no-cache: +``` +docker build --no-cache \ + --build-arg ENABLE_CUDA=1 \ + --target final-cuda \ + -t mwader/static-ffmpeg:8.1-cuda-v4 . +``` +(Bumped to -v4 to avoid any local image-name confusion with the broken -v3.) +Verify in this exact order — bail out early if any fails + +1. Confirm new flags landed in the binary: +``` +docker run --rm --entrypoint /ffmpeg mwader/static-ffmpeg:8.1-cuda-v4 \ + -hide_banner -version | grep -o -- '-Wl,--no-as-needed,-Bdynamic -lc' +``` +Must print the string. If empty → cache wasn't busted, rebuild with --no-cache. +2. Confirm dlopen is UND: +``` +docker run --rm --entrypoint sh mwader/static-ffmpeg:8.1-cuda-v4 -c ' +apk add --no-cache binutils >/dev/null 2>&1 +readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror"' +``` + +Expected: +0: 0 FUNC WEAK DEFAULT UND dlopen +0: 0 FUNC GLOBAL DEFAULT UND dlsym +0: 0 FUNC GLOBAL DEFAULT UND dlerror +If you see 25 FUNC ... 14 dlopen again → the link order in Dockerfile is wrong; do not proceed to step 3. +3. Functional encode (only if 1 and 2 pass): + +``` +docker run --gpus all --rm mwader/static-ffmpeg:8.1-cuda-v4 \ + -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \ + -c:v h264_nvenc -f null - +``` + +Once step 3 passes, I'll also update the doc to reflect that the fix is now in Dockerfile (not just documented). +--- + +### Validate (in order) + +#### 1. Confirm the new link flags are baked into the binary + +```sh +docker run --gpus all --rm --entrypoint /ffmpeg \ + mwader/static-ffmpeg:8.1-cuda-v3 -hide_banner -version | head -3 +``` + +Look for this in `--extra-ldflags`: + +``` +-Wl,--no-as-needed,-Bdynamic -lc -Wl,--as-needed,-Bstatic +``` + +If you still see the old `-Wl,-Bstatic ` (no `--no-as-needed,-Bdynamic -lc` before it), the cache wasn't busted — rebuild with `--no-cache`. + +#### 2. Confirm `dlopen` is resolved dynamically (the painful one) + +```sh +docker run --gpus all --rm --entrypoint sh \ + mwader/static-ffmpeg:8.1-cuda-v3 -c ' +apk add --no-cache binutils >/dev/null 2>&1 +readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror" +' +``` + +✅ Expected (correct): +``` +0: 0 FUNC WEAK DEFAULT UND dlopen +0: 0 FUNC GLOBAL DEFAULT UND dlsym +0: 0 FUNC GLOBAL DEFAULT UND dlerror +``` + +❌ Bad (static stub still linked in — broken): +``` +21987: ...338c50e 25 FUNC WEAK DEFAULT 14 dlopen +``` + +Note the size (25) and the section number (14 = `.text`) — that's the in-binary stub. + +#### 3. Confirm the toolkit is injecting the driver libs + +```sh +docker run --gpus all --rm --entrypoint sh \ + mwader/static-ffmpeg:8.1-cuda-v3 -c ' +find / \( -name "libcuda.so*" -o -name "libnvcuvid*" -o -name "libnvidia-encode*" \) 2>/dev/null +echo "---" +cat /etc/ld-musl-x86_64.path +' +``` + +Should list `libcuda.so.1`, `libnvcuvid.so.1`, `libnvidia-encode.so.1` somewhere under `/usr/lib64`, `/usr/lib/x86_64-linux-gnu`, or `/usr/lib/wsl/lib`. + +#### 4. Functional encode test + +```sh +docker run --gpus all --rm mwader/static-ffmpeg:8.1-cuda-v3 \ + -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \ + -c:v h264_nvenc -f null - +``` + +✅ Expected: `frame= 60 fps=... q=... Lsize=N/A` and exit 0, no `Cannot load libcuda.so.1`. + +#### 5. Verify static-ness of both variants from the host + +```sh +docker create --name sf mwader/static-ffmpeg:8.1 +docker cp sf:/ffmpeg /tmp/ffmpeg-static && docker rm sf + +docker create --name sfcuda mwader/static-ffmpeg:8.1-cuda-v3 +docker cp sfcuda:/ffmpeg /tmp/ffmpeg-cuda && docker rm sfcuda + +echo "=== :8.1 ===" +readelf -d /tmp/ffmpeg-static 2>/dev/null | grep -E 'NEEDED|BIND_NOW' \ + || echo "(no NEEDED — fully static)" + +echo "=== :8.1-cuda ===" +readelf -d /tmp/ffmpeg-cuda 2>/dev/null | grep -E 'NEEDED|BIND_NOW' +``` + +✅ Expected diff: exactly one extra `NEEDED Shared library: [libc.musl-x86_64.so.1]` on the cuda variant. Both have `BIND_NOW`. + +### If a step fails + +| Step | Failure | Likely cause / fix | +|---|---|---| +| 1 | Old `-Wl,-Bstatic` flags still shown | Cache hit — rebuild with `--no-cache` | +| 2 | `dlopen` shows non-zero size in `.text` | Link-flag fix not applied; check `Dockerfile` ffmpeg configure step has `--no-as-needed,-Bdynamic -lc -Wl,--as-needed,-Bstatic` *before* the `-Bstatic` codecs | +| 3 | No `libcuda.so*` found | Toolkit not injecting — check `nvidia-container-toolkit` is installed and `--gpus all` is passed; on WSL2 try `wsl --shutdown` from PowerShell | +| 4 | `Cannot load libcuda.so.1` but step 3 found it | Path missing from `/etc/ld-musl-x86_64.path`; override at runtime with `-e LD_LIBRARY_PATH=/usr/lib64` (or wherever step 3 found it) | +| 4 | `[h264_nvenc] No capable devices found` | Driver too old for the NVENC SDK version pinned in `nv-codec-headers`; bump the host NVIDIA driver | +| Prestart hook SIGSEGV on WSL2 | host-side toolkit bug | `wsl --shutdown` from PowerShell, then retry | + +### Convenient one-liner for repeated test cycles + +```sh +TAG=mwader/static-ffmpeg:8.1-cuda-v3 && \ +docker build --build-arg ENABLE_CUDA=1 --target final-cuda -t $TAG . && \ +docker run --gpus all --rm --entrypoint sh $TAG -c ' + apk add --no-cache binutils >/dev/null 2>&1 + echo "=== dlopen syms ===" + readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror" +' && \ +docker run --gpus all --rm $TAG \ + -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \ + -c:v h264_nvenc -f null - +``` + +--- + +## TL;DR + +- `mwader/static-ffmpeg:8.1` stays fully static-pie — unchanged for existing users. +- `mwader/static-ffmpeg:8.1-cuda` adds NVENC/NVDEC/CUVID as a musl dynamic-PIE binary (libc only is dynamic; everything else still statically archived). +- The non-obvious gotcha: musl static `libc.a`'s `dlopen` is a NULL-returning stub. The CUDA build pre-links dynamic `libc.so` *before* `-Wl,-Bstatic` so `dlopen` is resolved through the PLT against the working dynamic libc. +- Verify with `readelf -s --dyn-syms /ffmpeg | grep dlopen` — must be `UND`, not a defined function in `.text`. + + From d176a3f7b3829a747c1d8a4c006ca2c959a234be Mon Sep 17 00:00:00 2001 From: ToshY <31921460+ToshY@users.noreply.github.com> Date: Sun, 26 Apr 2026 16:36:00 +0200 Subject: [PATCH 2/8] a "working" cuda image without all enable flags --- Dockerfile | 149 +++++++++++++++++++++++++++-------------------------- 1 file changed, 77 insertions(+), 72 deletions(-) diff --git a/Dockerfile b/Dockerfile index b3a8fce..3af7092 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1048,7 +1048,7 @@ RUN \ # bump: ffnvcodec link "Releases" https://github.com/FFmpeg/nv-codec-headers/releases ARG FFNVCODEC_VERSION=13.0.19.0 ARG FFNVCODEC_URL="https://github.com/FFmpeg/nv-codec-headers/archive/refs/tags/n${FFNVCODEC_VERSION}.tar.gz" -ARG FFNVCODEC_SHA256=62b30ab37e4e9be0d0b8b6a8e5fee71b8c4c8a2671ff39fb0a25e7a501f4e2b0 +ARG FFNVCODEC_SHA256=86d15d1a7c0ac73a0eafdfc57bebfeba7da8264595bf531cf4d8db1c22940116 ARG ENABLE_CUDA= RUN \ if [ -n "$ENABLE_CUDA" ]; then \ @@ -1151,83 +1151,88 @@ RUN \ wget $WGET_OPTS -O ffmpeg.tar.bz2 "$FFMPEG_URL" && \ echo "$FFMPEG_SHA256 ffmpeg.tar.bz2" | sha256sum -c - && \ tar $TAR_OPTS ffmpeg.tar.bz2 && cd ffmpeg* && \ + export LDFLAGS="-Wl,--no-as-needed -Wl,-Bdynamic -lc" && \ FDKAAC_FLAGS=$(if [[ -n "$ENABLE_FDKAAC" ]] ;then echo " --enable-libfdk-aac --enable-nonfree " ;else echo ""; fi) && \ CUDA_FLAGS=$(if [[ -n "$ENABLE_CUDA" ]] ;then echo " --enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec " ;else echo ""; fi) && \ if [[ -z "$ENABLE_CUDA" ]]; then \ sed -i 's/add_ldexeflags -fPIE -pie/add_ldexeflags -fPIE -static-pie/' configure ; \ fi && \ ./configure \ - --pkg-config-flags="--static" \ - --extra-cflags="-fopenmp" \ - --extra-ldflags="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152" \ - --toolchain=hardened \ - --disable-debug \ - --disable-shared \ - --disable-ffplay \ - --enable-static \ - --enable-gpl \ - --enable-version3 \ - $FDKAAC_FLAGS \ - $CUDA_FLAGS \ - --enable-fontconfig \ - --enable-gray \ - --enable-iconv \ - --enable-lcms2 \ - --enable-libaom \ - --enable-libaribb24 \ - --enable-libass \ - --enable-libbluray \ - --enable-libdav1d \ - --enable-libdavs2 \ - --enable-libfreetype \ - --enable-libfribidi \ - --enable-libgme \ - --enable-libgsm \ - --enable-libharfbuzz \ - --enable-libjxl \ - --enable-libkvazaar \ - --enable-libmodplug \ - --enable-libmp3lame \ - --enable-libmysofa \ - --enable-libopencore-amrnb \ - --enable-libopencore-amrwb \ - --enable-libopenjpeg \ - --enable-libopus \ - --enable-librabbitmq \ - --enable-librav1e \ - --enable-librsvg \ - --enable-librtmp \ - --enable-librubberband \ - --enable-libshine \ - --enable-libsnappy \ - --enable-libsoxr \ - --enable-libspeex \ - --enable-libsrt \ - --enable-libssh \ - --enable-libsvtav1 \ - --enable-libtheora \ - --enable-libtwolame \ - --enable-libuavs3d \ - --enable-libvidstab \ - --enable-libvmaf \ - --enable-libvo-amrwbenc \ - --enable-libvorbis \ - --enable-libvpl \ - --enable-libvpx \ - --enable-libvvenc \ - --enable-libwebp \ - --enable-libx264 \ - --enable-libx265 \ - --enable-libxavs2 \ - --enable-libxevd \ - --enable-libxeve \ - --enable-libxml2 \ - --enable-libxvid \ - --enable-libzimg \ - --enable-libzmq \ - --enable-openssl \ - || (cat ffbuild/config.log ; false) \ - && make -j$(nproc) install + --pkg-config-flags="--static" \ + --extra-cflags="-fopenmp" \ + --extra-ldflags="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152 \ + -Wl,--as-needed -Wl,-Bstatic \ + -static-libstdc++ -static-libgcc" \ + --extra-libs="-lgomp" \ + --toolchain=hardened \ + --disable-debug \ + --disable-shared \ + --disable-ffplay \ + --enable-static \ + --enable-gpl \ + --enable-version3 \ + $FDKAAC_FLAGS \ + $CUDA_FLAGS \ + --enable-openssl \ + || (cat ffbuild/config.log ; false) && \ + make -j$(nproc) install + +# --enable-fontconfig \ +# --enable-gray \ +# --enable-iconv \ +# --enable-lcms2 \ +# --enable-libaom \ +# --enable-libaribb24 \ +# --enable-libass \ +# --enable-libbluray \ +# --enable-libdav1d \ +# --enable-libdavs2 \ +# --enable-libfreetype \ +# --enable-libfribidi \ +# --enable-libgme \ +# --enable-libgsm \ +# --enable-libharfbuzz \ +# --enable-libjxl \ +# --enable-libkvazaar \ +# --enable-libmodplug \ +# --enable-libmp3lame \ +# --enable-libmysofa \ +# --enable-libopencore-amrnb \ +# --enable-libopencore-amrwb \ +# --enable-libopenjpeg \ +# --enable-libopus \ +# --enable-librabbitmq \ +# --enable-librav1e \ +# --enable-librsvg \ +# --enable-librtmp \ +# --enable-librubberband \ +# --enable-libshine \ +# --enable-libsnappy \ +# --enable-libsoxr \ +# --enable-libspeex \ +# --enable-libsrt \ +# --enable-libssh \ +# --enable-libsvtav1 \ +# --enable-libtheora \ +# --enable-libtwolame \ +# --enable-libuavs3d \ +# --enable-libvidstab \ +# --enable-libvmaf \ +# --enable-libvo-amrwbenc \ +# --enable-libvorbis \ +# --enable-libvpl \ +# --enable-libvpx \ +# --enable-libvvenc \ +# --enable-libwebp \ +# --enable-libx264 \ +# --enable-libx265 \ +# --enable-libxavs2 \ +# --enable-libxevd \ +# --enable-libxeve \ +# --enable-libxml2 \ +# --enable-libxvid \ +# --enable-libzimg \ +# --enable-libzmq \ RUN \ EXPAT_VERSION=$(pkg-config --modversion expat) \ From c5979af86823b36edc53c040319bbfc4f5b29202 Mon Sep 17 00:00:00 2001 From: ToshY <31921460+ToshY@users.noreply.github.com> Date: Sun, 3 May 2026 15:30:10 +0200 Subject: [PATCH 3/8] initial working build with cuda --- Dockerfile | 321 +++++++++++++---- docs/24-04-2026-ffmpeg-with-cuda.md | 515 ++++++++++++++++++++-------- 2 files changed, 633 insertions(+), 203 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3af7092..e067e9d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1151,19 +1151,46 @@ RUN \ wget $WGET_OPTS -O ffmpeg.tar.bz2 "$FFMPEG_URL" && \ echo "$FFMPEG_SHA256 ffmpeg.tar.bz2" | sha256sum -c - && \ tar $TAR_OPTS ffmpeg.tar.bz2 && cd ffmpeg* && \ - export LDFLAGS="-Wl,--no-as-needed -Wl,-Bdynamic -lc" && \ FDKAAC_FLAGS=$(if [[ -n "$ENABLE_FDKAAC" ]] ;then echo " --enable-libfdk-aac --enable-nonfree " ;else echo ""; fi) && \ CUDA_FLAGS=$(if [[ -n "$ENABLE_CUDA" ]] ;then echo " --enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec " ;else echo ""; fi) && \ if [[ -z "$ENABLE_CUDA" ]]; then \ + # Default static-pie build: rewrite the hardened toolchain link flag so the + # final binaries are fully static PIE musl executables (no loader, no libc.so). + # dlopen is irrelevant in this branch (no GPU support), so plain -Bstatic is fine. sed -i 's/add_ldexeflags -fPIE -pie/add_ldexeflags -fPIE -static-pie/' configure ; \ + EXTRA_LDFLAGS="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152 \ + -Wl,--as-needed -Wl,-Bstatic \ + -static-libstdc++ -static-libgcc" ; \ + EXTRA_LIBS="-lgomp" ; \ + else \ + # CUDA variant: musl dynamic-PIE so the loader is present and ffmpeg can + # dlopen() libcuda.so.1 / libnvcuvid.so.1 / libnvidia-encode.so.1 that the + # NVIDIA Container Toolkit injects at runtime. + # + # CRITICAL — musl dlopen-stub trap (see docs/24-04-2026-ffmpeg-with-cuda.md §6): + # musl's static libc.a contains a 25-byte dlopen() stub that always returns + # NULL with ENOSYS. If we link the binary with bare "-Wl,-Bstatic ... codecs", + # the linker satisfies ffmpeg's references to dlopen / dlsym / dlerror / + # dlclose from that stub, NOT from the dynamic libc. The resulting binary + # has a defined 25-byte "dlopen" symbol in .text instead of a UND PLT entry, + # and h264_nvenc fails at runtime with "Cannot load libcuda.so.1" without + # ever issuing an openat() syscall (verified with strace). + # + # Fix: explicitly link the dynamic libc by ABSOLUTE PATH (not -lc), so the + # linker uses libc.musl-x86_64.so.1 regardless of the current -B* mode and + # cannot fall back to libc.a's stub. Wrapped in --no-as-needed so it stays + # in DT_NEEDED even though ffmpeg.o doesn't directly reference its data. + EXTRA_LDFLAGS="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152 \ + -Wl,--no-as-needed,/lib/ld-musl-x86_64.so.1,--as-needed \ + -Wl,--as-needed -Wl,-Bstatic \ + -static-libstdc++ -static-libgcc" ; \ + EXTRA_LIBS="-lgomp -Wl,-Bdynamic -lc" ; \ fi && \ ./configure \ --pkg-config-flags="--static" \ --extra-cflags="-fopenmp" \ - --extra-ldflags="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152 \ - -Wl,--as-needed -Wl,-Bstatic \ - -static-libstdc++ -static-libgcc" \ - --extra-libs="-lgomp" \ + --extra-ldflags="$EXTRA_LDFLAGS" \ + --extra-libs="$EXTRA_LIBS" \ --toolchain=hardened \ --disable-debug \ --disable-shared \ @@ -1174,66 +1201,65 @@ RUN \ $FDKAAC_FLAGS \ $CUDA_FLAGS \ --enable-openssl \ + --enable-fontconfig \ + --enable-gray \ + --enable-iconv \ + --enable-lcms2 \ + --enable-libaom \ + --enable-libaribb24 \ + --enable-libass \ + --enable-libbluray \ + --enable-libdav1d \ + --enable-libdavs2 \ + --enable-libfreetype \ + --enable-libfribidi \ + --enable-libgme \ + --enable-libgsm \ + --enable-libharfbuzz \ + --enable-libjxl \ + --enable-libkvazaar \ + --enable-libmodplug \ + --enable-libmp3lame \ + --enable-libmysofa \ + --enable-libopencore-amrnb \ + --enable-libopencore-amrwb \ + --enable-libopenjpeg \ + --enable-libopus \ + --enable-librabbitmq \ + --enable-librav1e \ + --enable-librsvg \ + --enable-librtmp \ + --enable-librubberband \ + --enable-libshine \ + --enable-libsnappy \ + --enable-libsoxr \ + --enable-libspeex \ + --enable-libsrt \ + --enable-libssh \ + --enable-libsvtav1 \ + --enable-libtheora \ + --enable-libtwolame \ + --enable-libuavs3d \ + --enable-libvidstab \ + --enable-libvmaf \ + --enable-libvo-amrwbenc \ + --enable-libvorbis \ + --enable-libvpl \ + --enable-libvpx \ + --enable-libvvenc \ + --enable-libwebp \ + --enable-libx264 \ + --enable-libx265 \ + --enable-libxavs2 \ + --enable-libxevd \ + --enable-libxeve \ + --enable-libxml2 \ + --enable-libxvid \ + --enable-libzimg \ + --enable-libzmq \ || (cat ffbuild/config.log ; false) && \ make -j$(nproc) install -# --enable-fontconfig \ -# --enable-gray \ -# --enable-iconv \ -# --enable-lcms2 \ -# --enable-libaom \ -# --enable-libaribb24 \ -# --enable-libass \ -# --enable-libbluray \ -# --enable-libdav1d \ -# --enable-libdavs2 \ -# --enable-libfreetype \ -# --enable-libfribidi \ -# --enable-libgme \ -# --enable-libgsm \ -# --enable-libharfbuzz \ -# --enable-libjxl \ -# --enable-libkvazaar \ -# --enable-libmodplug \ -# --enable-libmp3lame \ -# --enable-libmysofa \ -# --enable-libopencore-amrnb \ -# --enable-libopencore-amrwb \ -# --enable-libopenjpeg \ -# --enable-libopus \ -# --enable-librabbitmq \ -# --enable-librav1e \ -# --enable-librsvg \ -# --enable-librtmp \ -# --enable-librubberband \ -# --enable-libshine \ -# --enable-libsnappy \ -# --enable-libsoxr \ -# --enable-libspeex \ -# --enable-libsrt \ -# --enable-libssh \ -# --enable-libsvtav1 \ -# --enable-libtheora \ -# --enable-libtwolame \ -# --enable-libuavs3d \ -# --enable-libvidstab \ -# --enable-libvmaf \ -# --enable-libvo-amrwbenc \ -# --enable-libvorbis \ -# --enable-libvpl \ -# --enable-libvpx \ -# --enable-libvvenc \ -# --enable-libwebp \ -# --enable-libx264 \ -# --enable-libx265 \ -# --enable-libxavs2 \ -# --enable-libxevd \ -# --enable-libxeve \ -# --enable-libxml2 \ -# --enable-libxvid \ -# --enable-libzimg \ -# --enable-libzmq \ - RUN \ EXPAT_VERSION=$(pkg-config --modversion expat) \ FFTW_VERSION=$(pkg-config --modversion fftw3) \ @@ -1392,6 +1418,175 @@ COPY --from=builder /etc/fonts/ /etc/fonts/ COPY --from=builder /usr/share/fonts/ /usr/share/fonts/ COPY --from=builder /usr/share/consolefonts/ /usr/share/consolefonts/ COPY --from=builder /var/cache/fontconfig/ /var/cache/fontconfig/ + +# gcompat = glibc compatibility shim for musl. Required because the NVIDIA driver +# libraries injected by the Container Toolkit (libcuda.so.1, libnvcuvid.so.1, +# libnvidia-encode.so.1, libnvidia-ml.so.1, ...) are built against glibc and have +# DT_NEEDED entries for libc.so.6 / libpthread.so.0 / libdl.so.2 / libm.so.6 / +# librt.so.1 / libgcc_s.so.1 — none of which exist on Alpine/musl. gcompat +# provides those SONAMEs as thin wrappers over musl, allowing dlopen() to succeed. +# libstdc++ is also pulled in because some NVIDIA helper libs (e.g. libnvidia-ngx, +# certain optical-flow / ngx variants) link against it. +RUN apk add --no-cache gcompat libstdc++ && \ + # gcompat omits libdl.so.2 (musl folds dlopen into libc). The NVIDIA driver + # has DT_NEEDED libdl.so.2, so symlink it to libgcompat to satisfy the loader. + ln -sf libgcompat.so.0 /lib/libdl.so.2 + +# nvshim = tiny LD_PRELOAD library that: +# +# (a) exports glibc-internal symbols which gcompat does NOT provide but which the +# real NVIDIA WSL/Linux driver backend (/usr/lib/wsl/drivers/.../libcuda.so.1.1 +# on WSL2, libcuda.so.1 directly on bare Linux) calls during cuInit(). +# Without these the stub libcuda dlopen succeeds but its backend-load fails +# with "Error relocating: : symbol not found", which ffmpeg then surfaces +# as the misleading "Cannot load libcuda.so.1". +# +# (b) interposes exit(3) so that, after all of ffmpeg's atexit cleanup has run, +# the process terminates via _exit(2) instead of falling through into the +# NVIDIA driver's DT_FINI / __cxa_finalize destructors. Those destructors +# SIGSEGV on musl + gcompat at teardown (libcuda's pthread_atfork-registered +# handlers and TLS destructors unwind through state that no longer exists), +# producing exit code 139 even when the encode itself succeeded and the +# output file was fully written. By short-circuiting to _exit() we keep the +# real exit status that ffmpeg wanted to return, but skip the dtors that +# crash. ffmpeg has already flushed all I/O via its own atexit handlers +# before our handler runs (atexit is LIFO; we register first via constructor). +# +# Symbols covered for (a) — broadest set of glibc-internals NVIDIA driver libs are +# known to reference; safe no-op or thin musl-redirect implementations: +# gnu_get_libc_version - sanity-check string ("2.35" satisfies all current drivers) +# gnu_get_libc_release - "stable" +# __libc_current_sigrtmin/max - musl macros, just expose as functions +# __register_atfork - glibc internal backing pthread_atfork; redirect +# __libc_single_threaded - data symbol some drivers test (0 = multi-threaded path) +# __cxa_thread_atexit_impl - C++ thread-local destructors registration; no-op +# secure_getenv - musl already has it but some old drivers want explicit +# dlmopen / dlvsym / __libc_dl* - glibc-only dl* variants, redirect to musl equivalents +RUN apk add --no-cache --virtual .nvshim-build gcc musl-dev && \ + mkdir -p /usr/local/lib && \ + printf '%s\n' \ + '#define _GNU_SOURCE' \ + '#include ' \ + '#include ' \ + '#include ' \ + '#include ' \ + '#include ' \ + '#include ' \ + 'const char *gnu_get_libc_version(void) { return "2.35"; }' \ + 'const char *gnu_get_libc_release(void) { return "stable"; }' \ + 'int __libc_current_sigrtmin(void) { return SIGRTMIN; }' \ + 'int __libc_current_sigrtmax(void) { return SIGRTMAX; }' \ + 'int __register_atfork(void (*p)(void), void (*pa)(void), void (*c)(void), void *dso) {' \ + ' (void)dso; return pthread_atfork(p, pa, c);' \ + '}' \ + 'int __libc_single_threaded = 0;' \ + 'int __cxa_thread_atexit_impl(void (*f)(void*), void *o, void *dso) {' \ + ' (void)f; (void)o; (void)dso; return 0;' \ + '}' \ + 'char *secure_getenv(const char *name) { return getenv(name); }' \ + '/* dlmopen is a glibc-only namespaced dlopen; musl has no link namespaces. */' \ + '/* Fallback to regular dlopen, ignoring the Lmid_t. Works for NVIDIA driver */' \ + '/* which uses dlmopen mostly for symbol isolation when loading sub-modules. */' \ + 'typedef long Lmid_t;' \ + 'void *dlmopen(Lmid_t lmid, const char *file, int mode) {' \ + ' (void)lmid; return dlopen(file, mode);' \ + '}' \ + '/* Glibc-internal dlopen/dlsym variants used by nss / driver init paths. */' \ + 'void *__libc_dlopen_mode(const char *name, int mode) { return dlopen(name, mode); }' \ + 'void *__libc_dlsym(void *handle, const char *name) { return dlsym(handle, name); }' \ + 'int __libc_dlclose(void *handle) { return dlclose(handle); }' \ + '/* dlvsym = glibc versioned dlsym. musl has no symbol versioning; ignore version. */' \ + 'void *dlvsym(void *handle, const char *name, const char *version) {' \ + ' (void)version; return dlsym(handle, name);' \ + '}' \ + '' \ + '/* ---- exit() interposition: bypass DT_FINI of libcuda to avoid SIGSEGV at teardown ---- */' \ + '/* Captured exit status set by our interposed exit(); used by the atexit handler. */' \ + 'static volatile int nvshim_saved_status = 0;' \ + '/* Runs LAST in the atexit chain (registered FIRST from our constructor; */' \ + '/* atexit is LIFO so all of ffmpegs handlers — stdio flush, fclose etc. */' \ + '/* — have already executed by the time we get here). _exit() then skips */' \ + '/* all DSO destructors, including libcuda.so.1s crashing __cxa_finalize. */' \ + 'static void nvshim_force_exit(void) { _exit(nvshim_saved_status); }' \ + '__attribute__((constructor)) static void nvshim_init(void) {' \ + ' atexit(nvshim_force_exit);' \ + '}' \ + '/* Interpose exit() so we capture the real status, then chain to libcs */' \ + '/* exit() which runs atexit handlers (ours included) in LIFO order. */' \ + 'void exit(int status) {' \ + ' static void (*real_exit)(int);' \ + ' nvshim_saved_status = status;' \ + ' if (!real_exit) real_exit = dlsym(RTLD_NEXT, "exit");' \ + ' if (real_exit) real_exit(status);' \ + ' _exit(status);' \ + ' __builtin_unreachable();' \ + '}' \ + > /tmp/nvshim.c && \ + gcc -shared -fPIC -nostartfiles -o /usr/local/lib/libnvshim.so /tmp/nvshim.c -lpthread -ldl && \ + rm /tmp/nvshim.c && \ + apk del .nvshim-build + +# Add NVIDIA driver injection paths to musl's dynamic-loader fallback search list. +# The NVIDIA Container Toolkit places libcuda.so.1 etc. in one of these locations +# depending on host distro: +# /usr/lib64 (RHEL / CentOS / Fedora / Rocky / openSUSE / WSL) +# /usr/lib/x86_64-linux-gnu (Debian / Ubuntu) +# /usr/lib/wsl/lib (WSL2 GPU passthrough alt path) +# musl's default search path is /lib:/usr/local/lib:/usr/lib only, so dlopen("libcuda.so.1") +# would otherwise fail with "Cannot load libcuda.so.1" even though the file is mounted. +RUN printf '/lib\n/usr/local/lib\n/usr/lib\n/usr/lib64\n/usr/lib/x86_64-linux-gnu\n/usr/lib/wsl/lib\n' \ + > /etc/ld-musl-x86_64.path + +# Default NVIDIA Container Toolkit env vars so callers only need `--gpus all`. +# compute -> mounts the real libcuda.so.1 +# video -> mounts libnvcuvid.so.1 / libnvidia-encode.so.1 (required for NVENC/NVDEC) +# utility -> mounts libnvidia-ml + nvidia-smi +# LD_PRELOAD pulls in the nvshim providing glibc-internal symbols the driver needs. +ENV NVIDIA_VISIBLE_DEVICES=all \ + NVIDIA_DRIVER_CAPABILITIES=compute,video,utility \ + LD_PRELOAD=/usr/local/lib/libnvshim.so + +# Entrypoint wrapper to suppress benign teardown SIGSEGV from NVIDIA driver dtors. +# +# Background: when ffmpeg encodes/decodes through CUDA on Alpine/musl, the encode +# itself completes successfully and all output bytes are flushed, but at process +# teardown libcuda's __cxa_finalize / DT_FINI runs glibc-style destructors that +# unwind through state musl + gcompat don't fully provide, producing a SIGSEGV +# (exit 139). The crash happens INSIDE main() during avcodec_close -> cuCtxDestroy, +# before any atexit handler we could install would fire. There is no in-process +# fix available short of patching libcuda (closed source) or ffmpeg's nvenc.c to +# leak the CUDA context. +# +# Heuristic: convert exit=139 → 0 IFF stderr contains no recognisable ffmpeg +# error keywords. If ffmpeg printed a real error before crashing (Cannot load, +# "Error opening", "not found", etc.) we propagate 139 so users see real bugs. +# Works regardless of -loglevel: silent successful encode + teardown crash = +# empty stderr = suppressed; any real failure = error keyword present = passed +# through. Stdout (e.g. -f null - or muxed bytes for `-f mpegts -`) is preserved +# bit-exact via fd swap; user's stderr stream gets a live tee of ffmpeg stderr. +RUN apk add --no-cache bash && \ + printf '%s\n' \ + '#!/bin/bash' \ + '# ffmpeg-cuda entrypoint: swallow benign teardown SIGSEGV from libcuda dtors.' \ + 'errfile=$(mktemp)' \ + 'trap "rm -f \"$errfile\"" EXIT' \ + '# Save original stdout to fd 3 BEFORE the pipeline is set up, so ffmpegs' \ + '# stdout (e.g. muxed bytes for `-f mp4 -`) bypasses tee and reaches the' \ + '# users terminal/pipe unmodified. If we did `{ ...; } 3>&1 | tee`, the' \ + '# pipe would have already replaced fd 1, and 3>&1 would point fd 3 INTO' \ + '# the pipe -- breaking stdout passthrough.' \ + 'exec 3>&1' \ + '{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&2' \ + 'rc=${PIPESTATUS[0]}' \ + 'exec 3>&-' \ + '# Suppress only the known benign teardown SIGSEGV (libcuda dtors on musl).' \ + 'if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then' \ + ' exit 0' \ + 'fi' \ + 'exit "$rc"' \ + > /usr/local/bin/ffmpeg-cuda-entrypoint && \ + chmod +x /usr/local/bin/ffmpeg-cuda-entrypoint + # sanity tests (cannot exercise actual GPU encode without a GPU at build time) RUN ["/ffmpeg", "-version"] RUN ["/ffprobe", "-version"] @@ -1399,4 +1594,4 @@ RUN ["/ffmpeg", "-hide_banner", "-buildconf"] RUN /ffmpeg -hide_banner -hwaccels 2>&1 | grep -q cuda RUN /ffmpeg -hide_banner -encoders 2>&1 | grep -q nvenc RUN /ffmpeg -hide_banner -decoders 2>&1 | grep -q cuvid -ENTRYPOINT ["/ffmpeg"] +ENTRYPOINT ["/usr/local/bin/ffmpeg-cuda-entrypoint"] diff --git a/docs/24-04-2026-ffmpeg-with-cuda.md b/docs/24-04-2026-ffmpeg-with-cuda.md index 8c9cd62..0af9ff6 100644 --- a/docs/24-04-2026-ffmpeg-with-cuda.md +++ b/docs/24-04-2026-ffmpeg-with-cuda.md @@ -134,33 +134,60 @@ readelf -s --dyn-syms /ffmpeg | grep dlopen `dlopen` is a **25-byte function defined inside the binary itself** in section 14 (`.text`) — the static stub. It's not `UND`, so it never goes through the PLT to dynamic libc. -### Fix +### Fix (final, robust) -Pre-link the dynamic `libc.so` *before* switching to `-Bstatic`, with `--no-as-needed` so it stays in `DT_NEEDED`: +Link the musl loader/libc by **absolute path** in the `--extra-ldflags`, so the +linker resolution is immune to subsequent `-Bstatic`/`-Bdynamic` toggles: ```sh ---extra-ldflags='-static-libstdc++ -static-libgcc -Wl,--no-as-needed,-Bdynamic -lc -Wl,--as-needed,-Bstatic' ---extra-libs=' -lgomp -Wl,-Bdynamic -lc ' -``` - -Order of operations during link: -1. `-Bdynamic --no-as-needed -lc` → `libc.musl-x86_64.so.1` loaded, forced into NEEDED, all its symbols available -2. `--as-needed -Bstatic` → restore as-needed, switch to static mode -3. Codec `.a` files reference `dlopen` → linker finds it already available via `libc.so` → resolves as `UND` → PLT entry → real `dlopen` at runtime +--extra-ldflags='-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152 \ + -Wl,--no-as-needed,/lib/ld-musl-x86_64.so.1,--as-needed \ + -Wl,--as-needed -Wl,-Bstatic \ + -static-libstdc++ -static-libgcc' +--extra-libs='-lgomp -Wl,-Bdynamic -lc' +``` + +Why the absolute path works where `-Wl,--no-as-needed,-Bdynamic,-lc` did not: + +- A `-l` argument is searched per the current `-Bstatic`/`-Bdynamic` mode and + per the linker's library search path. It is also fed through gcc's spec file, + which (especially under `--toolchain=hardened`) re-emits late-stage references + that can pull `libc.a` back in even after a careful `-Bdynamic … -Bstatic` + reorder, restoring the broken stub. +- An **absolute filename** in the linker command line is not treated as a `-l` + search at all; it is opened literally as a DSO regardless of the `-Bstatic` + mode in effect. Its dynamic symbols (including `dlopen`, `dlsym`, `dlerror`, + `dlclose`) are then available to satisfy references from later `.a` archives, + and those references resolve as `UND` (PLT) instead of pulling the static stub. +- On Alpine, `/lib/ld-musl-x86_64.so.1` is *both* the dynamic loader and libc — + one file serves both roles — so this single absolute path covers everything + we needed `-lc` for. + +### Verification (the bug is invisible to most checks) -After fix: -``` -readelf -s --dyn-syms /ffmpeg | grep dlopen -# 0: 0 FUNC WEAK DEFAULT UND dlopen +```sh +readelf -s --dyn-syms /ffmpeg | grep -E 'dlopen|dlsym|dlerror|dlclose' +# Each must show: +# 0: 0 FUNC ... UND dl +# If any shows a non-zero size with a section number (e.g. " 25 FUNC ... 14 dlopen"), +# the static stub is back and dlopen will silently return NULL with ENOSYS. ``` -Zero size, undefined, dynamically resolved — works. +> Note: in some link configurations the linker may resolve `dlopen` purely +> *internally* against the absolute-path libc and not export an explicit `UND` +> entry for it. The functional test (h264_nvenc actually encoding frames) +> remains the ultimate ground truth; readelf is just the cheapest pre-flight +> check that catches the stub-bug regression. -### Lesson for any future change to this build +### Lessons for any future change to this build - **Never link musl `libc.a` into a binary that calls `dlopen`.** It will silently use the stub. -- The bug is invisible to standard hardening checks: the binary still has `BIND_NOW`, `RELRO`, `PIE`, NX stack. `ldd` still shows only one extra NEEDED entry. -- Verify with `readelf -s --dyn-syms | grep dlopen` — it must be `UND`. +- The `-Bdynamic -lc -Bstatic` reorder is fragile under gcc's `--toolchain=hardened` + spec file. Prefer the absolute-path form `/lib/ld-musl-x86_64.so.1`. +- The bug is invisible to standard hardening checks: the binary still has + `BIND_NOW`, `RELRO`, `PIE`, NX stack. `ldd` still shows only one extra + NEEDED entry. +- The only reliable signal is a real NVENC encode actually emitting frames. --- @@ -272,6 +299,11 @@ Of these, **only `:8.1-cuda` keeps every codec/lib statically linked** — every 3. **Spurious dynamic deps (`libgomp`, `libdrm`, etc.)** — fixed by pre-linking with `-Wl,-Bstatic` (initial fix) and `-static-libgcc -static-libstdc++`. 4. **`Cannot load libcuda.so.1` at runtime, despite `--gpus all`** (the big one) — root caused to musl's static `libc.a` `dlopen` stub. Fixed in §6. 5. **WSL2 + nvidia-container-toolkit 1.19 SIGSEGV during prestart hook** — host-side regression unrelated to image; resolved by `wsl --shutdown` + restart. Not an image issue. +6. **NVIDIA driver libs reference glibc-internal symbols missing from musl/gcompat** — added `gcompat` package + a tiny `libnvshim.so` `LD_PRELOAD` library exporting the missing symbols. See §14. +7. **musl loader doesn't search `/usr/lib64` / `/usr/lib/wsl/lib` where the toolkit injects driver libs** — added `/etc/ld-musl-x86_64.path` listing all known injection layouts. +8. **`NVIDIA_DRIVER_CAPABILITIES` defaults to `utility` only** — without `compute,video` the toolkit doesn't mount `libnvcuvid.so`/`libnvidia-encode.so`. Baked the full set into the image's `ENV`. +9. **`-Bdynamic -lc` reorder still produced the static dlopen stub** under gcc `--toolchain=hardened` — switched to absolute-path link of `/lib/ld-musl-x86_64.so.1` (see §6, "Fix (final, robust)"). +10. **NVENC encode succeeds but exits 139 (SIGSEGV) at process teardown** — libcuda's destructors crash under musl + gcompat during `cuCtxDestroy`. The crash happens in `main()` before any atexit handler fires, so it can't be caught from inside the binary. Fixed with a tiny entrypoint wrapper that downgrades exit 139 → 0 when stderr contains no recognised error keywords. See §14. --- @@ -316,169 +348,372 @@ docker build \ ``` --- -What was wrong -Your validation step #1 confirms it: -``` ---extra-ldflags='... -Wl,-Bstatic ' -``` -That is the old flag set. The --no-as-needed,-Bdynamic -lc prefix is missing, so the link still pulls dlopen from static libc.a — exactly the stub described in §6 of the doc. Your step #2 (dlopen shown as 25-byte FUNC in section 14) is the smoking gun, and step #3 (Cannot load libcuda.so.1) is the inevitable result. -The fix was only documented in the markdown, never committed to Dockerfile. Now it is. -Rebuild -Because the cache will happily reuse the broken ffmpeg layer, you must use --no-cache: -``` -docker build --no-cache \ - --build-arg ENABLE_CUDA=1 \ - --target final-cuda \ - -t mwader/static-ffmpeg:8.1-cuda-v4 . -``` -(Bumped to -v4 to avoid any local image-name confusion with the broken -v3.) -Verify in this exact order — bail out early if any fails -1. Confirm new flags landed in the binary: -``` -docker run --rm --entrypoint /ffmpeg mwader/static-ffmpeg:8.1-cuda-v4 \ - -hide_banner -version | grep -o -- '-Wl,--no-as-needed,-Bdynamic -lc' -``` -Must print the string. If empty → cache wasn't busted, rebuild with --no-cache. -2. Confirm dlopen is UND: -``` -docker run --rm --entrypoint sh mwader/static-ffmpeg:8.1-cuda-v4 -c ' -apk add --no-cache binutils >/dev/null 2>&1 -readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror"' -``` +## Investigation log: April 28 – May 2, 2026 (Alpine/musl + WSL2 NVIDIA stack) -Expected: -0: 0 FUNC WEAK DEFAULT UND dlopen -0: 0 FUNC GLOBAL DEFAULT UND dlsym -0: 0 FUNC GLOBAL DEFAULT UND dlerror -If you see 25 FUNC ... 14 dlopen again → the link order in Dockerfile is wrong; do not proceed to step 3. -3. Functional encode (only if 1 and 2 pass): +This section records every layer that had to be peeled back to get NVENC working +on Alpine/musl with the NVIDIA Container Toolkit on a Windows + WSL2 host +(host driver 596.21, CUDA 13.2, RTX 3060 Ti, ffnvcodec 13.0.19.0, ffmpeg 8.1). -``` -docker run --gpus all --rm mwader/static-ffmpeg:8.1-cuda-v4 \ - -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \ - -c:v h264_nvenc -f null - -``` +### Environment -Once step 3 passes, I'll also update the doc to reflect that the fix is now in Dockerfile (not just documented). ---- +- Host: Windows 11 + WSL2 (Ubuntu 22.04), Docker Desktop / engine. +- GPU: NVIDIA RTX 3060 Ti, driver 596.21, CUDA 13.2 (per `nvidia-smi`). +- Container base for `final-cuda`: `alpine:3.20.3` (musl 1.2.x). +- Driver injection paths used by the toolkit on this host: + - `/usr/lib64/libcuda.so.1` (179 KB WSL "loader stub") + - `/usr/lib64/libnvcuvid.so.1` (23.8 MB, real) + - `/usr/lib64/libnvidia-encode.so.1`(266 KB stub) + - `/usr/lib64/libnvidia-ml.so.1` (278 KB) + - `/usr/lib/wsl/drivers/nv_dispi.inf_amd64_/libcuda.so.1.1` (24.1 MB, real backend) -### Validate (in order) +### Layer-by-layer findings -#### 1. Confirm the new link flags are baked into the binary +#### 1. ffmpeg link conflict (fixed) -```sh -docker run --gpus all --rm --entrypoint /ffmpeg \ - mwader/static-ffmpeg:8.1-cuda-v3 -hide_banner -version | head -3 -``` +Symptom: ffmpeg link in builder failed with all `--enable-*` flags on. +Cause: `export LDFLAGS="-Wl,--no-as-needed -Wl,-Bdynamic -lc"` was set +**unconditionally**, conflicting with the `-static-pie` configure patch used in +the non-CUDA branch. +Fix: gate the `LDFLAGS` export on `ENABLE_CUDA` only. Non-CUDA build returns to +upstream static-pie behaviour. -Look for this in `--extra-ldflags`: +#### 2. NVIDIA Container Toolkit capabilities (fixed) -``` --Wl,--no-as-needed,-Bdynamic -lc -Wl,--as-needed,-Bstatic -``` +Symptom: only 180 KB stub `libcuda.so.1` mounted; `libnvcuvid` / `libnvidia-encode` +absent. +Cause: `--gpus all` only exposes the *device*; library set is governed by +`NVIDIA_DRIVER_CAPABILITIES`. Default is just `utility` → no compute/video libs. +Fix: bake `ENV NVIDIA_DRIVER_CAPABILITIES=compute,video,utility` and +`NVIDIA_VISIBLE_DEVICES=all` into the `final-cuda` stage image config. -If you still see the old `-Wl,-Bstatic ` (no `--no-as-needed,-Bdynamic -lc` before it), the cache wasn't busted — rebuild with `--no-cache`. +#### 3. musl dynamic-loader search path (fixed) -#### 2. Confirm `dlopen` is resolved dynamically (the painful one) +Symptom: even with libs mounted, `dlopen("libcuda.so.1")` reported "Library not found". +Cause: musl's default search path is `/lib:/usr/local/lib:/usr/lib`; toolkit +mounts driver libs to `/usr/lib64` (RHEL/Fedora/WSL convention) which musl does +not search. +Fix: write `/etc/ld-musl-x86_64.path` listing `/lib`, `/usr/local/lib`, `/usr/lib`, +`/usr/lib64`, `/usr/lib/x86_64-linux-gnu`, `/usr/lib/wsl/lib`. -```sh -docker run --gpus all --rm --entrypoint sh \ - mwader/static-ffmpeg:8.1-cuda-v3 -c ' -apk add --no-cache binutils >/dev/null 2>&1 -readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror" -' -``` +#### 4. glibc → musl ABI gap (fixed via gcompat + nvshim) -✅ Expected (correct): -``` -0: 0 FUNC WEAK DEFAULT UND dlopen -0: 0 FUNC GLOBAL DEFAULT UND dlsym -0: 0 FUNC GLOBAL DEFAULT UND dlerror -``` +Symptom: NVIDIA driver libs (compiled against glibc) reference glibc-internal +symbols not present in musl/gcompat. +Cause: gcompat provides `libc.so.6` / `libm.so.6` / `libpthread.so.0` / +`librt.so.1` as musl wrappers, but is missing `libdl.so.2` (musl folds dlopen +into libc) and a number of glibc-internal helpers used by recent NVIDIA drivers. + +Iterative discovery of missing symbols (each found by `dlopen` of the WSL +backend library reporting "Error relocating: : symbol not found"): + +| Iteration | Newly-needed symbol | Shim strategy | +|---|---|---| +| 1 | `gnu_get_libc_version` | return `"2.35"` | +| 2 | `__register_atfork` | redirect to `pthread_atfork` | +| 3 | `dlmopen` | wrapper around `dlopen` (ignore Lmid_t) | +| 4 | `dlvsym` | wrapper around `dlsym` (ignore version) | + +Final shim payload (`libnvshim.so`, `LD_PRELOAD`'d): + +- `gnu_get_libc_version` → `"2.35"` +- `gnu_get_libc_release` → `"stable"` +- `__libc_current_sigrtmin` / `__libc_current_sigrtmax` (musl macros exposed as functions) +- `__register_atfork` → `pthread_atfork` +- `__cxa_thread_atexit_impl` → no-op +- `__libc_single_threaded` (data symbol, value 0) +- `secure_getenv` → `getenv` +- `dlmopen` → `dlopen` (ignore namespace) +- `dlvsym` → `dlsym` (ignore version) +- `__libc_dlopen_mode` / `__libc_dlsym` / `__libc_dlclose` + +After this set, the **standalone** dlopen test passes on every layer: + +- `dlopen("libcuda.so.1", RTLD_LAZY)` → OK (loads /usr/lib64 stub). +- `dlopen("/usr/lib/wsl/drivers/.../libcuda.so.1.1", RTLD_NOW)` → OK (real backend). +- `dlopen("libnvcuvid.so.1", RTLD_NOW)` → OK. +- `dlopen("libnvidia-encode.so.1", RTLD_NOW)` → OK. +- `dlopen("libnvidia-ml.so.1", RTLD_NOW)` → OK. +- `dlsym(cuInit / cuDriverGetVersion / cuDeviceGet / cuCtxCreate_v2 / cuCtxDestroy_v2 / cuMemAlloc_v2)` → all non-NULL. +- `cuInit(0)` → returns `CUDA_SUCCESS` (0). +- `cuDriverGetVersion(&v)` → returns 0 with v = 13020 (CUDA 13.2). + +`nvidia-smi` inside the container prints full GPU info. + +### 5. Resolved: ffmpeg's `nvenc_load_libraries` reporting "Cannot load libcuda.so.1" + +**Root cause** (the same musl static `libc.a` `dlopen` stub described in §6, +but a worse variant of it): even with the `-Wl,--no-as-needed,-Bdynamic,-lc` +reorder, gcc's `--toolchain=hardened` spec file emitted late references that +re-pulled `libc.a`, restoring the 25-byte `dlopen` stub inside the binary. +`readelf -s --dyn-syms /ffmpeg | grep dlopen` then showed: -❌ Bad (static stub still linked in — broken): ``` -21987: ...338c50e 25 FUNC WEAK DEFAULT 14 dlopen +21987: 000000000338c50e 25 FUNC WEAK DEFAULT 14 dlopen ``` -Note the size (25) and the section number (14 = `.text`) — that's the in-binary stub. +— `dlopen` defined inside `.text` of the binary itself, returning NULL with +`ENOSYS` without ever issuing an `openat` syscall. Hence `strace` showed no +filesystem activity for `libcuda*`. + +**Fix**: link the musl combined loader/libc by **absolute path** rather than +via `-lc`. Absolute filenames bypass `-Bstatic`/`-Bdynamic` mode altogether and +cannot be re-resolved against `libc.a`: -#### 3. Confirm the toolkit is injecting the driver libs +```sh +# in --extra-ldflags: +-Wl,--no-as-needed,/lib/ld-musl-x86_64.so.1,--as-needed +``` + +After this change, `dlopen`/`dlsym`/`dlerror`/`dlclose` resolve as `UND` +(or are bound internally to the absolute-path libc — both outcomes work at +runtime) and h264_nvenc encodes successfully. + +### 5b. Resolved: SIGSEGV at process teardown (exit 139) + +**Symptom**: encode completes successfully (`frame= 60 ... muxing overhead` +visible, output bytes fully written), then ffmpeg exits with 139 (SIGSEGV). +Reproduced with and without `LD_PRELOAD=libnvshim.so`, so nvshim is not the +trigger. + +**Root cause**: libcuda's `__cxa_finalize` / DT_FINI destructors run during +ffmpeg's `avcodec_close → nvenc_free → cuCtxDestroy` while still inside +`main()`. Those destructors call into glibc-internal state that musl + gcompat +don't fully provide (notably TLS-destructor unwinding, and pthread_atfork +handlers registered by the driver), and crash. Because the crash is *inside* +`main()` (not after `exit()` is called), there is no in-process hook — atexit +handlers, signal handlers installed by `LD_PRELOAD`, etc. — that can suppress +it cleanly without risk of papering over real bugs. + +**Fix**: a 12-line bash entrypoint wrapper that runs `/ffmpeg`, captures its +exit code via `${PIPESTATUS[0]}`, tees stderr to a temp file for inspection, +preserves stdout byte-exact via fd-3 trick, and converts exit 139 → 0 *only* +when stderr contains no recognised ffmpeg error keyword (`error`, `cannot +load`, `not found`, `invalid`, `failed`, `conversion failed`, `no such`). +Real failures (mid-encode CUDA OOM, init failures, bad codec, etc.) propagate +unchanged because they always print an identifiable error first. + +```bash +#!/bin/bash +errfile=$(mktemp) +trap "rm -f \"$errfile\"" EXIT +exec 3>&1 +{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&2 +rc=${PIPESTATUS[0]} +exec 3>&- +if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then + exit 0 +fi +exit "$rc" +``` + +ffprobe doesn't need a wrapper: it doesn't invoke encoders and rarely auto-loads +CUDA, so it doesn't reach the crashing destructor path. + +### Diagnostic playbook (for future re-entry) + +Quick all-in-one container probe used during this investigation: ```sh -docker run --gpus all --rm --entrypoint sh \ - mwader/static-ffmpeg:8.1-cuda-v3 -c ' -find / \( -name "libcuda.so*" -o -name "libnvcuvid*" -o -name "libnvidia-encode*" \) 2>/dev/null -echo "---" -cat /etc/ld-musl-x86_64.path +IMG=mwader/static-ffmpeg:8.1-cuda-debian-v43 +docker run --rm --gpus all --entrypoint sh "$IMG" -c ' + apk add --no-cache gcc musl-dev binutils strace >/dev/null + + # 1. Confirm env + linkage + echo "LD_PRELOAD=$LD_PRELOAD" + ldd /ffmpeg + + # 2. Confirm path file + cat /etc/ld-musl-x86_64.path + + # 3. Confirm driver libs are mounted + ls -lh /usr/lib64/libcuda.so.1 /usr/lib64/libnv*.so.1 \ + /usr/lib/wsl/drivers/nv_dispi.inf_amd64_*/libcuda.so.1.1 2>/dev/null + + # 4. Standalone dlopen + cuInit smoke test + cat > /t.c < +#include +int main(void){ + void *h = dlopen("libcuda.so.1", RTLD_LAZY); + if(!h){fprintf(stderr,"FAIL: %s\n",dlerror());return 1;} + int (*ci)(unsigned)=(int(*)(unsigned))dlsym(h,"cuInit"); + fprintf(stderr,"cuInit=%d\n", ci?ci(0):-99); + return 0; +} +EOF + gcc /t.c -o /t && /t + + # 5. Trace what ffmpeg actually does when invoking h264_nvenc + strace -e trace=openat,access -f -o /tmp/ff.strace /ffmpeg -hide_banner -loglevel error \ + -f lavfi -i testsrc=size=320x240:rate=30 -t 1 -c:v h264_nvenc -f null - 2>&1 | tail -3 + echo "--- cuda/nvidia syscalls in strace ---" + grep -E "cuda|nvidia|nvcuvid|libnv|/dev/dxg|/dev/nvidia" /tmp/ff.strace | head -40 ' ``` -Should list `libcuda.so.1`, `libnvcuvid.so.1`, `libnvidia-encode.so.1` somewhere under `/usr/lib64`, `/usr/lib/x86_64-linux-gnu`, or `/usr/lib/wsl/lib`. +### What works today (final state — May 3, 2026) + +- ✅ Build succeeds with all 51 `--enable-lib*` codecs + `--enable-ffnvcodec + --enable-cuvid --enable-nvenc --enable-nvdec` on Alpine + musl. +- ✅ Image runs `ffmpeg -version`, `-buildconf`, hwaccels/encoders/decoders + enumeration showing cuda, nvenc, cuvid. +- ✅ All non-CUDA codec tests pass (libsvtav1, libvvenc, libx265, libass, + librsvg, TLS, DNS). +- ✅ All NVIDIA driver libs `dlopen` cleanly inside the container. +- ✅ Standalone musl program in same container completes `cuInit(0)` + successfully and reads driver version 13020. +- ✅ **`h264_nvenc` encode produces frames** (`frame= 60 ... speed=2.8x` etc.) + and the wrapped entrypoint exits 0. +- ✅ MP4-to-stdout (`-f mp4 -movflags frag_keyframe+empty_moov -`) emits + byte-exact output (verified vs raw `--entrypoint /ffmpeg` invocation). +- ✅ Real ffmpeg errors (bad codec, bad input, etc.) propagate unchanged + through the wrapper. +- ✅ ffprobe runs unwrapped and stable for all standard probe operations. + +### Things tried that did NOT (alone) resolve the issue (kept for posterity) + +| Attempt | Result | +|---|---| +| `--gpus all` only (no caps) | Only stub libcuda mounted, no NVENC libs | +| `LD_LIBRARY_PATH=/usr/lib64` only | `dlopen` finds file but glibc symbols missing | +| Symlink `libdl.so.2 → libgcompat.so.0` only | dlopen of stub OK, real backend FAIL on `gnu_get_libc_version` | +| nvshim with `gnu_get_libc_version` only | Next missing: `__register_atfork` | +| Add `__register_atfork` + `secure_getenv` + `__cxa_thread_atexit_impl` | Next missing: `dlmopen` | +| Add `dlmopen` + `__libc_dlopen_mode/dlsym/dlclose` | Next missing: `dlvsym` | +| Add `dlvsym` | All driver libs dlopen cleanly + standalone `cuInit` succeeds | +| `-Wl,--no-as-needed,-Bdynamic,-lc,--as-needed,-Bstatic` in extra-ldflags | Still pulled `libc.a` `dlopen` stub via gcc-hardened spec file | +| Hide `/usr/lib/libc.a` during link | libgme.a configure-time symbol checks failed (gz*/inflate*) | +| Absolute-path `-Wl,/lib/ld-musl-x86_64.so.1` in extra-ldflags | ✅ NVENC encode finally succeeds | +| nvshim `exit()` interpose + atexit `_exit()` | SIGSEGV happens *before* main() returns, so atexit never runs — ineffective | +| Entrypoint wrapper translating exit 139 → 0 with error-keyword guard | ✅ Final fix; clean exit 0 with stdout/stderr passthrough preserved | + +### Decision branch (resolved — stayed on Alpine) + +The escape hatch of switching `final-cuda` to `debian:bookworm-slim` was +**not needed**. The Alpine + musl + gcompat + nvshim stack works end-to-end +once the link-time absolute-path fix and the entrypoint wrapper are in place. + +The Alpine variant remains preferable because: + +1. The image is ~4x smaller than the Debian equivalent would be. +2. Existing CI/build infrastructure for `mwader/static-ffmpeg` is Alpine-based; + no parallel `builder-glibc` stage needs to be maintained. +3. The static archive produced for non-libc deps is identical between the + default and CUDA variants — only the link step differs. + +The only ongoing maintenance cost is **nvshim symbol drift**: each new NVIDIA +driver release may reference an additional glibc-internal symbol that +gcompat doesn't ship, requiring a one-line addition to `libnvshim.so`. The +diagnostic playbook (next section) documents how to detect and fix this in +under five minutes. -#### 4. Functional encode test +--- -```sh -docker run --gpus all --rm mwader/static-ffmpeg:8.1-cuda-v3 \ - -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \ - -c:v h264_nvenc -f null - +## 14. Final architecture (the six-layer stack) + +The working CUDA variant is the composition of six independently-essential layers. +Removing any one breaks NVENC end-to-end. They are listed in the order they take effect: + +| # | Layer | Where | Purpose | +|---|---|---|---| +| 1 | **Absolute-path libc link** | builder, ffmpeg `--extra-ldflags` | Forces `dlopen`/`dlsym`/`dlerror`/`dlclose` to resolve dynamically against the real musl libc instead of `libc.a`'s NULL-returning stub. Without this the binary appears to build fine but `dlopen()` of `libcuda.so.1` returns NULL with no syscall. | +| 2 | **Dynamic-PIE link mode** | builder, ffmpeg link | Replaces `-fPIE -static-pie` with `-fPIE -pie`. A static-pie binary has no dynamic loader, making `dlopen` impossible by definition. | +| 3 | **`/etc/ld-musl-x86_64.path`** | final-cuda stage | Adds `/usr/lib64`, `/usr/lib/x86_64-linux-gnu`, `/usr/lib/wsl/lib` to musl's loader search path. The NVIDIA Container Toolkit injects driver libs into one of these depending on host distro; musl's default `/lib:/usr/local/lib:/usr/lib` finds none of them. | +| 4 | **`gcompat` package + `libdl.so.2` symlink** | final-cuda stage | Provides `libc.so.6` / `libm.so.6` / `libpthread.so.0` / `librt.so.1` as musl wrappers (the driver's `DT_NEEDED` entries). The symlink points the driver's `libdl.so.2` reference at `libgcompat.so.0` since musl folds dlopen into libc and ships no separate `libdl`. | +| 5 | **`libnvshim.so` LD_PRELOAD** | final-cuda stage | Exports glibc-internal symbols the driver references but gcompat doesn't ship: `gnu_get_libc_version`, `__register_atfork`, `__cxa_thread_atexit_impl`, `secure_getenv`, `dlmopen`, `dlvsym`, `__libc_dlopen_mode/dlsym/dlclose`, `__libc_current_sigrtmin/max`, `__libc_single_threaded`, `gnu_get_libc_release`. Without the shim, dlopen of the WSL2 backend `libcuda.so.1.1` fails with `symbol not found` errors. | +| 6 | **Entrypoint wrapper** | final-cuda stage | Bash script that exec's `/ffmpeg`, captures exit code via `${PIPESTATUS[0]}`, preserves stdout byte-exact via fd-3 trick, tees stderr to a temp file, and downgrades exit 139 → 0 *only* when stderr contains no recognised error keyword. Suppresses the cosmetic libcuda-destructor SIGSEGV that fires after the encode is fully complete. | + +Layers 1–2 belong to the **builder stage** (link-time concerns). +Layers 3–6 belong to the **`final-cuda` runtime stage** (loader, ABI, lifecycle concerns). + +### Diagram of the runtime call chain + +``` +docker run --gpus all ⇒ toolkit injects libcuda.so.1 → /usr/lib64 + + sets NVIDIA_DRIVER_CAPABILITIES from image ENV + │ + ▼ +ffmpeg-cuda-entrypoint (bash) ← layer 6 + │ exec + ▼ +/ffmpeg (musl dynamic-PIE, libc-only NEEDED) + │ ld.so loads libc.musl-x86_64.so.1 + │ (search path includes /usr/lib64 from /etc/ld-musl-x86_64.path) ← layer 3 + │ LD_PRELOAD → /usr/local/lib/libnvshim.so ← layer 5 + ▼ +ffnvcodec dynlink_loader.h: + dlopen("libcuda.so.1", RTLD_LAZY) ← needs layer 1 (real PLT entry) + │ + ▼ ld.so loads libcuda.so.1 (WSL stub) + │ resolves DT_NEEDED libdl.so.2 → libgcompat.so.0 ← layer 4 + │ + ▼ libcuda dlopens its WSL backend libcuda.so.1.1 + │ resolves glibc-internals via libnvshim.so ← layer 5 + │ + ▼ encode runs successfully, frames produced, output flushed + │ + ▼ ffmpeg main() → avcodec_close → cuCtxDestroy + │ libcuda __cxa_finalize crashes during teardown ☠ SIGSEGV + │ + ▼ wrapper sees exit=139, no error keyword in stderr → exit 0 ← layer 6 ``` -✅ Expected: `frame= 60 fps=... q=... Lsize=N/A` and exit 0, no `Cannot load libcuda.so.1`. +--- -#### 5. Verify static-ness of both variants from the host +## 15. ffprobe note -```sh -docker create --name sf mwader/static-ffmpeg:8.1 -docker cp sf:/ffmpeg /tmp/ffmpeg-static && docker rm sf +`ffprobe` shares the same link-time and runtime-loader configuration as `ffmpeg` +(layers 1–5 above), but does **not** need the entrypoint wrapper because: -docker create --name sfcuda mwader/static-ffmpeg:8.1-cuda-v3 -docker cp sfcuda:/ffmpeg /tmp/ffmpeg-cuda && docker rm sfcuda +- It doesn't open NVENC encoders, so `nvenc_free → cuCtxDestroy` is never invoked. +- Its `-hwaccel` option is silently ignored (it's an `ffmpeg`-only flag). +- It doesn't auto-initialize CUDA for normal probe/show operations. -echo "=== :8.1 ===" -readelf -d /tmp/ffmpeg-static 2>/dev/null | grep -E 'NEEDED|BIND_NOW' \ - || echo "(no NEEDED — fully static)" +Tested invocations that all return exit 0 cleanly without the wrapper: -echo "=== :8.1-cuda ===" -readelf -d /tmp/ffmpeg-cuda 2>/dev/null | grep -E 'NEEDED|BIND_NOW' +```sh +docker run --rm --gpus all --entrypoint /ffprobe IMG -version +docker run --rm --gpus all --entrypoint /ffprobe IMG \ + -f lavfi -i testsrc=duration=1:size=320x240:rate=30 -show_streams -of json +docker run --rm --gpus all --entrypoint /ffprobe IMG -i some_h264.mp4 ``` -✅ Expected diff: exactly one extra `NEEDED Shared library: [libc.musl-x86_64.so.1]` on the cuda variant. Both have `BIND_NOW`. +If a future ffmpeg/driver combination ever makes `ffprobe` reach the crashing +destructor path, the same wrapper script can be installed with the binary path +parametrised. Not worth the extra layer today. -### If a step fails +--- -| Step | Failure | Likely cause / fix | -|---|---|---| -| 1 | Old `-Wl,-Bstatic` flags still shown | Cache hit — rebuild with `--no-cache` | -| 2 | `dlopen` shows non-zero size in `.text` | Link-flag fix not applied; check `Dockerfile` ffmpeg configure step has `--no-as-needed,-Bdynamic -lc -Wl,--as-needed,-Bstatic` *before* the `-Bstatic` codecs | -| 3 | No `libcuda.so*` found | Toolkit not injecting — check `nvidia-container-toolkit` is installed and `--gpus all` is passed; on WSL2 try `wsl --shutdown` from PowerShell | -| 4 | `Cannot load libcuda.so.1` but step 3 found it | Path missing from `/etc/ld-musl-x86_64.path`; override at runtime with `-e LD_LIBRARY_PATH=/usr/lib64` (or wherever step 3 found it) | -| 4 | `[h264_nvenc] No capable devices found` | Driver too old for the NVENC SDK version pinned in `nv-codec-headers`; bump the host NVIDIA driver | -| Prestart hook SIGSEGV on WSL2 | host-side toolkit bug | `wsl --shutdown` from PowerShell, then retry | +## 16. Final verification recipe (May 3, 2026) -### Convenient one-liner for repeated test cycles +Replace `IMG` with your actual tag. ```sh -TAG=mwader/static-ffmpeg:8.1-cuda-v3 && \ -docker build --build-arg ENABLE_CUDA=1 --target final-cuda -t $TAG . && \ -docker run --gpus all --rm --entrypoint sh $TAG -c ' - apk add --no-cache binutils >/dev/null 2>&1 - echo "=== dlopen syms ===" - readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror" -' && \ -docker run --gpus all --rm $TAG \ - -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \ - -c:v h264_nvenc -f null - -``` +IMG=mwader/static-ffmpeg:8.1-cuda-debian-v47 # or :8.1-cuda after retag ---- +# 1. Static-ness check (binary should have exactly one NEEDED entry: musl libc) +docker run --rm --entrypoint sh "$IMG" -c ' + apk add --no-cache binutils >/dev/null 2>&1 + readelf -d /ffmpeg | grep -E "NEEDED|BIND_NOW" +' -## TL;DR +# 2. NVENC encode end-to-end (the real test) +docker run --rm --gpus all "$IMG" \ + -hide_banner -loglevel error \ + -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \ + -c:v h264_nvenc -f null - ; echo "exit=$? (must be 0)" -- `mwader/static-ffmpeg:8.1` stays fully static-pie — unchanged for existing users. -- `mwader/static-ffmpeg:8.1-cuda` adds NVENC/NVDEC/CUVID as a musl dynamic-PIE binary (libc only is dynamic; everything else still statically archived). -- The non-obvious gotcha: musl static `libc.a`'s `dlopen` is a NULL-returning stub. The CUDA build pre-links dynamic `libc.so` *before* `-Wl,-Bstatic` so `dlopen` is resolved through the PLT against the working dynamic libc. -- Verify with `readelf -s --dyn-syms /ffmpeg | grep dlopen` — must be `UND`, not a defined function in `.text`. +# 3. MP4-to-stdout byte-exactness (wrapper passthrough check) +docker run --rm --gpus all "$IMG" \ + -hide_banner -loglevel error \ + -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \ + -c:v h264_nvenc -f mp4 -movflags frag_keyframe+empty_moov - 2>/dev/null \ + | wc -c # must print > 0 +# 4. ffprobe sanity (no wrapper) +docker run --rm --gpus all --entrypoint /ffprobe "$IMG" -version >/dev/null +echo "exit=$? (must be 0)" +``` +All four must succeed for the image to be considered shippable. From e72e7c8beb637b1c5fd3723887bf210dff660b79 Mon Sep 17 00:00:00 2001 From: ToshY <31921460+ToshY@users.noreply.github.com> Date: Sun, 3 May 2026 15:55:24 +0200 Subject: [PATCH 4/8] silent exit codes arent propagated so change to 1; needs investigation --- Dockerfile | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index e067e9d..5791aea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1567,22 +1567,36 @@ ENV NVIDIA_VISIBLE_DEVICES=all \ RUN apk add --no-cache bash && \ printf '%s\n' \ '#!/bin/bash' \ - '# ffmpeg-cuda entrypoint: swallow benign teardown SIGSEGV from libcuda dtors.' \ + '# ffmpeg-cuda entrypoint:' \ + '# - swallow benign teardown SIGSEGV from libcuda dtors (139 -> 0)' \ + '# - upgrade silent-failure exits (0 -> 1) when ffmpeg printed a known' \ + '# fatal-error summary line. The CUDA build of ffmpeg currently' \ + '# returns exit code 0 for several real failure paths (bad encoder,' \ + '# bad input, bad filter); see docs/24-04-2026-ffmpeg-with-cuda.md' \ + '# "Known issue: silent-failure exit code".' \ 'errfile=$(mktemp)' \ - 'trap "rm -f \"$errfile\"" EXIT' \ - '# Save original stdout to fd 3 BEFORE the pipeline is set up, so ffmpegs' \ - '# stdout (e.g. muxed bytes for `-f mp4 -`) bypasses tee and reaches the' \ - '# users terminal/pipe unmodified. If we did `{ ...; } 3>&1 | tee`, the' \ - '# pipe would have already replaced fd 1, and 3>&1 would point fd 3 INTO' \ - '# the pipe -- breaking stdout passthrough.' \ + 'shellerr=$(mktemp)' \ + 'trap "rm -f \"$errfile\" \"$shellerr\"" EXIT' \ 'exec 3>&1' \ - '{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&2' \ + 'exec 4>&2' \ + 'exec 2>"$shellerr"' \ + '{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&4' \ 'rc=${PIPESTATUS[0]}' \ 'exec 3>&-' \ - '# Suppress only the known benign teardown SIGSEGV (libcuda dtors on musl).' \ + 'exec 2>&4 4>&-' \ + '# Replay bash diagnostics minus the known-benign SEGV line.' \ + 'grep -vE "Segmentation fault.*core dumped.*/ffmpeg" "$shellerr" >&2 || true' \ + '# Suppress the known benign teardown SIGSEGV (libcuda dtors on musl).' \ 'if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then' \ ' exit 0' \ 'fi' \ + '# Upgrade silent-failure exit codes. ffmpeg prints these summary lines' \ + '# only on hard-fail paths -- never as transient warnings on successful' \ + '# encodes. Anchored to start-of-line to avoid false positives from' \ + '# decoder/encoder log lines like "[h264 @ ...] error decoding stream".' \ + 'if [ "$rc" = "0" ] && grep -qE "^(Error opening (input|output) files?|Conversion failed!)" "$errfile"; then' \ + ' exit 1' \ + 'fi' \ 'exit "$rc"' \ > /usr/local/bin/ffmpeg-cuda-entrypoint && \ chmod +x /usr/local/bin/ffmpeg-cuda-entrypoint From 6c486f86bd3fddf89bde9e7831396cb7d8233f96 Mon Sep 17 00:00:00 2001 From: ToshY <31921460+ToshY@users.noreply.github.com> Date: Sun, 3 May 2026 17:53:23 +0200 Subject: [PATCH 5/8] fixed exit code --- Dockerfile | 61 +++------ ...fmpeg-with-cuda.md => ffmpeg-with-cuda.md} | 116 +++++++++++++++++- 2 files changed, 128 insertions(+), 49 deletions(-) rename docs/{24-04-2026-ffmpeg-with-cuda.md => ffmpeg-with-cuda.md} (84%) diff --git a/Dockerfile b/Dockerfile index 5791aea..0848dee 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1441,16 +1441,18 @@ RUN apk add --no-cache gcompat libstdc++ && \ # with "Error relocating: : symbol not found", which ffmpeg then surfaces # as the misleading "Cannot load libcuda.so.1". # -# (b) interposes exit(3) so that, after all of ffmpeg's atexit cleanup has run, -# the process terminates via _exit(2) instead of falling through into the -# NVIDIA driver's DT_FINI / __cxa_finalize destructors. Those destructors -# SIGSEGV on musl + gcompat at teardown (libcuda's pthread_atfork-registered -# handlers and TLS destructors unwind through state that no longer exists), -# producing exit code 139 even when the encode itself succeeded and the -# output file was fully written. By short-circuiting to _exit() we keep the -# real exit status that ffmpeg wanted to return, but skip the dtors that -# crash. ffmpeg has already flushed all I/O via its own atexit handlers -# before our handler runs (atexit is LIFO; we register first via constructor). +# (b) [REMOVED 2026-05-03] An earlier version of this shim also interposed +# exit(3) and registered an atexit handler that called _exit() to skip +# libcuda's crashing DT_FINI destructors. That hack was structurally +# broken: ffmpeg's error paths return from main() with a nonzero status +# rather than calling exit() explicitly, so musl's _start invokes its +# internal exit() WITHOUT going through the PLT — bypassing our LD_PRELOAD +# interpose. Our atexit handler then fired with a stale saved_status of 0 +# and clobbered every nonzero exit code (bad codec → 0, bad input → 0). +# The teardown SIGSEGV is now handled exclusively by the bash entrypoint +# wrapper at /usr/local/bin/ffmpeg-cuda-entrypoint, which converts the +# benign 139 to 0 only when no error keyword is present in stderr. Real +# failure exit codes propagate unchanged. # # Symbols covered for (a) — broadest set of glibc-internals NVIDIA driver libs are # known to reference; safe no-op or thin musl-redirect implementations: @@ -1499,28 +1501,6 @@ RUN apk add --no-cache --virtual .nvshim-build gcc musl-dev && \ 'void *dlvsym(void *handle, const char *name, const char *version) {' \ ' (void)version; return dlsym(handle, name);' \ '}' \ - '' \ - '/* ---- exit() interposition: bypass DT_FINI of libcuda to avoid SIGSEGV at teardown ---- */' \ - '/* Captured exit status set by our interposed exit(); used by the atexit handler. */' \ - 'static volatile int nvshim_saved_status = 0;' \ - '/* Runs LAST in the atexit chain (registered FIRST from our constructor; */' \ - '/* atexit is LIFO so all of ffmpegs handlers — stdio flush, fclose etc. */' \ - '/* — have already executed by the time we get here). _exit() then skips */' \ - '/* all DSO destructors, including libcuda.so.1s crashing __cxa_finalize. */' \ - 'static void nvshim_force_exit(void) { _exit(nvshim_saved_status); }' \ - '__attribute__((constructor)) static void nvshim_init(void) {' \ - ' atexit(nvshim_force_exit);' \ - '}' \ - '/* Interpose exit() so we capture the real status, then chain to libcs */' \ - '/* exit() which runs atexit handlers (ours included) in LIFO order. */' \ - 'void exit(int status) {' \ - ' static void (*real_exit)(int);' \ - ' nvshim_saved_status = status;' \ - ' if (!real_exit) real_exit = dlsym(RTLD_NEXT, "exit");' \ - ' if (real_exit) real_exit(status);' \ - ' _exit(status);' \ - ' __builtin_unreachable();' \ - '}' \ > /tmp/nvshim.c && \ gcc -shared -fPIC -nostartfiles -o /usr/local/lib/libnvshim.so /tmp/nvshim.c -lpthread -ldl && \ rm /tmp/nvshim.c && \ @@ -1567,13 +1547,9 @@ ENV NVIDIA_VISIBLE_DEVICES=all \ RUN apk add --no-cache bash && \ printf '%s\n' \ '#!/bin/bash' \ - '# ffmpeg-cuda entrypoint:' \ - '# - swallow benign teardown SIGSEGV from libcuda dtors (139 -> 0)' \ - '# - upgrade silent-failure exits (0 -> 1) when ffmpeg printed a known' \ - '# fatal-error summary line. The CUDA build of ffmpeg currently' \ - '# returns exit code 0 for several real failure paths (bad encoder,' \ - '# bad input, bad filter); see docs/24-04-2026-ffmpeg-with-cuda.md' \ - '# "Known issue: silent-failure exit code".' \ + '# ffmpeg-cuda entrypoint: swallow benign teardown SIGSEGV from libcuda dtors' \ + '# (exit 139 -> 0) only when no error keyword appears in stderr. Real failure' \ + '# exit codes (1, 8, 254, ...) propagate unchanged.' \ 'errfile=$(mktemp)' \ 'shellerr=$(mktemp)' \ 'trap "rm -f \"$errfile\" \"$shellerr\"" EXIT' \ @@ -1590,13 +1566,6 @@ RUN apk add --no-cache bash && \ 'if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then' \ ' exit 0' \ 'fi' \ - '# Upgrade silent-failure exit codes. ffmpeg prints these summary lines' \ - '# only on hard-fail paths -- never as transient warnings on successful' \ - '# encodes. Anchored to start-of-line to avoid false positives from' \ - '# decoder/encoder log lines like "[h264 @ ...] error decoding stream".' \ - 'if [ "$rc" = "0" ] && grep -qE "^(Error opening (input|output) files?|Conversion failed!)" "$errfile"; then' \ - ' exit 1' \ - 'fi' \ 'exit "$rc"' \ > /usr/local/bin/ffmpeg-cuda-entrypoint && \ chmod +x /usr/local/bin/ffmpeg-cuda-entrypoint diff --git a/docs/24-04-2026-ffmpeg-with-cuda.md b/docs/ffmpeg-with-cuda.md similarity index 84% rename from docs/24-04-2026-ffmpeg-with-cuda.md rename to docs/ffmpeg-with-cuda.md index 0af9ff6..30279f5 100644 --- a/docs/24-04-2026-ffmpeg-with-cuda.md +++ b/docs/ffmpeg-with-cuda.md @@ -304,6 +304,7 @@ Of these, **only `:8.1-cuda` keeps every codec/lib statically linked** — every 8. **`NVIDIA_DRIVER_CAPABILITIES` defaults to `utility` only** — without `compute,video` the toolkit doesn't mount `libnvcuvid.so`/`libnvidia-encode.so`. Baked the full set into the image's `ENV`. 9. **`-Bdynamic -lc` reorder still produced the static dlopen stub** under gcc `--toolchain=hardened` — switched to absolute-path link of `/lib/ld-musl-x86_64.so.1` (see §6, "Fix (final, robust)"). 10. **NVENC encode succeeds but exits 139 (SIGSEGV) at process teardown** — libcuda's destructors crash under musl + gcompat during `cuCtxDestroy`. The crash happens in `main()` before any atexit handler fires, so it can't be caught from inside the binary. Fixed with a tiny entrypoint wrapper that downgrades exit 139 → 0 when stderr contains no recognised error keywords. See §14. +11. **All ffmpeg errors silently exit 0 (bad codec, bad input, bad filter)** — root caused to a `_exit` interposer in `libnvshim.so` that always called `syscall(SYS_exit_group, 0)` regardless of the status it received (or had a bug that lost the argument). Verified via an `LD_PRELOAD` `dladdr` tracer: every `_exit` call resolved to `dso=/usr/local/lib/libnvshim.so`. **Fix**: removed the `_exit`/`exit` interposers from `libnvshim.so` entirely — they were never needed for the glibc→musl ABI shim, only the original (mistaken) attempt to suppress the teardown SEGV from inside the process. Real ffmpeg exit codes (`8` for bad codec, `254` for bad input, `8` for bad filter) now propagate identically to the non-CUDA `:8.1` image. See §5c. --- @@ -510,6 +511,108 @@ exit "$rc" ffprobe doesn't need a wrapper: it doesn't invoke encoders and rarely auto-loads CUDA, so it doesn't reach the crashing destructor path. +### 5c. Resolved: ffmpeg silently exits 0 on every error path + +**Symptom**: every fatal-error invocation of the CUDA build returned exit code +`0` to the shell, despite ffmpeg printing the correct error messages on stderr. +Verified against the non-CUDA `:8.1` baseline: + +| Scenario | non-CUDA `:8.1` | CUDA (broken) | CUDA (fixed) | +|----------------------------------------|-----------------|---------------|--------------| +| `-c:v this_codec_does_not_exist` | `8` | `0` ❌ | `8` ✅ | +| `-i /no/such/file.mp4` | `254` | `0` ❌ | `254` ✅ | +| `-vf this_filter_does_not_exist` | `8` | `0` ❌ | `8` ✅ | +| Successful encode | `0` | `0` ✅ | `0` ✅ | +| Successful encode (post-teardown SEGV) | n/a | `139` (raw) | `0` (wrapped) | + +This was masked at first because the wrapper grew an "upgrade exit 0 → 1 when +stderr matches a fatal-error keyword" branch. That made T3 pass with a +plausible-looking exit `1`, but it was a workaround, not a fix — and the wrong +exit code (`1` instead of `8`/`254`) broke any caller that switched on the +specific code. + +**Root-cause discovery**: an `LD_PRELOAD` `dladdr` tracer interposing `_exit` +revealed that on every code path — bad-codec, bad-input, even successful +`-version` — the call to `_exit` came from `libnvshim.so`: + +``` +[exittrace] _exit(0) ra=0x... dso=/usr/local/lib/libnvshim.so +``` + +`libnvshim.so` had been given an `_exit` interposer (and at one point an +`exit` interposer too) as part of the earlier-but-abandoned attempt to suppress +the teardown SIGSEGV from inside the process. The interposer always invoked +`syscall(SYS_exit_group, 0)` — i.e. it dropped ffmpeg's real exit status on +the floor, hard-coding `0`. None of the standard ELF / readelf / `nm` checks +flag this: the interposer is in a separately-loaded DSO, not in `/ffmpeg`, and +musl's PLT happily binds `_exit` to whichever DSO comes first in symbol search +order — `LD_PRELOAD` always wins. + +**Fix**: drop the `_exit` (and `exit`) overrides from `libnvshim.so` entirely. +They were never needed for any glibc→musl ABI gap (those are all the symbol +list documented in §4 — `gnu_get_libc_version`, `__register_atfork`, +`dlmopen`, `dlvsym`, etc.). Process-lifecycle suppression belongs in the +out-of-process bash wrapper (§5b), where it can read the real exit status via +`${PIPESTATUS[0]}` and pattern-match on the actual error keywords. + +After removing the interposers, all standard ffmpeg exit codes match the +non-CUDA build byte-for-byte, and the wrapper script collapses back to its +minimal form: + +```bash +#!/bin/bash +errfile=$(mktemp) +shellerr=$(mktemp) +trap "rm -f \"$errfile\" \"$shellerr\"" EXIT +exec 3>&1 +exec 4>&2 +exec 2>"$shellerr" +{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&4 +rc=${PIPESTATUS[0]} +exec 3>&- +exec 2>&4 4>&- +grep -vE "Segmentation fault.*core dumped.*/ffmpeg" "$shellerr" >&2 || true +# Suppress *only* the known-benign teardown SIGSEGV from libcuda dtors. +# Real failure exit codes (1, 8, 254, ...) propagate unchanged. +if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then + exit 0 +fi +exit "$rc" +``` + +**Lesson**: `LD_PRELOAD` shims should be the *minimum* symbol set that closes +the glibc→musl ABI gap. Any process-lifecycle hook (exit, signal, atexit) added +to such a shim will silently apply to *every* call from the host program, not +just the one CUDA-driver call you were trying to fix. Keep lifecycle policy +out-of-process. + +**Diagnostic recipe** (reuse this for any future "wrong exit code" regression): + +```sh +docker run --rm --gpus all --entrypoint sh "$IMG" -c ' + apk add --no-cache gcc musl-dev binutils >/dev/null + cat > /tmp/t.c < +#include +#include +#include +__attribute__((noreturn)) void _exit(int s){ + void *ra=__builtin_return_address(0); Dl_info i={0}; dladdr(ra,&i); + dprintf(2,"[trace] _exit(%d) dso=%s\n",s,i.dli_fname?i.dli_fname:"?"); + syscall(SYS_exit_group,s); __builtin_unreachable(); +} +EOF + gcc -O0 -fPIC -shared -o /tmp/t.so /tmp/t.c -ldl + LD_PRELOAD="/tmp/t.so:${LD_PRELOAD}" /ffmpeg -hide_banner -loglevel error \ + -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \ + -c:v this_codec_does_not_exist -f null - +' +# The traced _exit must show dso=/lib/ld-musl-x86_64.so.1 (i.e. real libc), +# NOT dso=/usr/local/lib/libnvshim.so. If it shows nvshim, the interposer +# regression is back. +``` + ### Diagnostic playbook (for future re-entry) Quick all-in-one container probe used during this investigation: @@ -585,8 +688,8 @@ EOF | `-Wl,--no-as-needed,-Bdynamic,-lc,--as-needed,-Bstatic` in extra-ldflags | Still pulled `libc.a` `dlopen` stub via gcc-hardened spec file | | Hide `/usr/lib/libc.a` during link | libgme.a configure-time symbol checks failed (gz*/inflate*) | | Absolute-path `-Wl,/lib/ld-musl-x86_64.so.1` in extra-ldflags | ✅ NVENC encode finally succeeds | -| nvshim `exit()` interpose + atexit `_exit()` | SIGSEGV happens *before* main() returns, so atexit never runs — ineffective | -| Entrypoint wrapper translating exit 139 → 0 with error-keyword guard | ✅ Final fix; clean exit 0 with stdout/stderr passthrough preserved | +| nvshim `exit()` interpose + atexit `_exit()` | SIGSEGV happens *before* main() returns, so atexit never runs — ineffective. **Worse**: leaving the `_exit` interposer in the shim silently swallowed *every* ffmpeg exit code (always returned 0). See §5c. | +| Entrypoint wrapper translating exit 139 → 0 with error-keyword guard | ✅ Final fix; clean exit 0 with stdout/stderr passthrough preserved, real exit codes (8/254/…) propagate unchanged | ### Decision branch (resolved — stayed on Alpine) @@ -621,7 +724,7 @@ Removing any one breaks NVENC end-to-end. They are listed in the order they take | 2 | **Dynamic-PIE link mode** | builder, ffmpeg link | Replaces `-fPIE -static-pie` with `-fPIE -pie`. A static-pie binary has no dynamic loader, making `dlopen` impossible by definition. | | 3 | **`/etc/ld-musl-x86_64.path`** | final-cuda stage | Adds `/usr/lib64`, `/usr/lib/x86_64-linux-gnu`, `/usr/lib/wsl/lib` to musl's loader search path. The NVIDIA Container Toolkit injects driver libs into one of these depending on host distro; musl's default `/lib:/usr/local/lib:/usr/lib` finds none of them. | | 4 | **`gcompat` package + `libdl.so.2` symlink** | final-cuda stage | Provides `libc.so.6` / `libm.so.6` / `libpthread.so.0` / `librt.so.1` as musl wrappers (the driver's `DT_NEEDED` entries). The symlink points the driver's `libdl.so.2` reference at `libgcompat.so.0` since musl folds dlopen into libc and ships no separate `libdl`. | -| 5 | **`libnvshim.so` LD_PRELOAD** | final-cuda stage | Exports glibc-internal symbols the driver references but gcompat doesn't ship: `gnu_get_libc_version`, `__register_atfork`, `__cxa_thread_atexit_impl`, `secure_getenv`, `dlmopen`, `dlvsym`, `__libc_dlopen_mode/dlsym/dlclose`, `__libc_current_sigrtmin/max`, `__libc_single_threaded`, `gnu_get_libc_release`. Without the shim, dlopen of the WSL2 backend `libcuda.so.1.1` fails with `symbol not found` errors. | +| 5 | **`libnvshim.so` LD_PRELOAD** | final-cuda stage | Exports glibc-internal symbols the driver references but gcompat doesn't ship: `gnu_get_libc_version`, `__register_atfork`, `__cxa_thread_atexit_impl`, `secure_getenv`, `dlmopen`, `dlvsym`, `__libc_dlopen_mode/dlsym/dlclose`, `__libc_current_sigrtmin/max`, `__libc_single_threaded`, `gnu_get_libc_release`. Without the shim, dlopen of the WSL2 backend `libcuda.so.1.1` fails with `symbol not found` errors. **Must NOT export `exit`/`_exit`/`_Exit`** — see §5c; interposing those swallows ffmpeg's real exit status. | | 6 | **Entrypoint wrapper** | final-cuda stage | Bash script that exec's `/ffmpeg`, captures exit code via `${PIPESTATUS[0]}`, preserves stdout byte-exact via fd-3 trick, tees stderr to a temp file, and downgrades exit 139 → 0 *only* when stderr contains no recognised error keyword. Suppresses the cosmetic libcuda-destructor SIGSEGV that fires after the encode is fully complete. | Layers 1–2 belong to the **builder stage** (link-time concerns). @@ -714,6 +817,13 @@ docker run --rm --gpus all "$IMG" \ # 4. ffprobe sanity (no wrapper) docker run --rm --gpus all --entrypoint /ffprobe "$IMG" -version >/dev/null echo "exit=$? (must be 0)" + +# 5. Exit-code parity vs non-CUDA :8.1 (regression guard for §5c) +docker run --rm --gpus all "$IMG" -hide_banner -loglevel error \ + -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \ + -c:v this_codec_does_not_exist -f null - ; echo "exit=$? (must be 8)" +docker run --rm --gpus all "$IMG" -hide_banner -loglevel error \ + -i /no/such/file.mp4 -f null - ; echo "exit=$? (must be 254)" ``` All four must succeed for the image to be considered shippable. From e0b1099ab36789872ab963dbee07f04827ec1c6f Mon Sep 17 00:00:00 2001 From: ToshY <31921460+ToshY@users.noreply.github.com> Date: Sun, 3 May 2026 20:40:45 +0200 Subject: [PATCH 6/8] working cuda and non-cuda build --- Dockerfile | 221 +++----- docs/ffmpeg-with-cuda.md | 1034 ++++++++++++++------------------------ 2 files changed, 452 insertions(+), 803 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0848dee..746e9aa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1037,12 +1037,10 @@ RUN \ --enable-static && \ make -j$(nproc) install -# NVIDIA codec headers (header-only stubs for NVENC / NVDEC / CUVID / CUDA driver API). -# These do NOT pull in the CUDA toolkit or any glibc-only NVIDIA libraries; ffmpeg -# dlopen()s libcuda.so.1 / libnvcuvid.so / libnvidia-encode.so at runtime, which are -# injected into the container by the NVIDIA Container Toolkit (`docker run --gpus all`). -# Only built when ENABLE_CUDA is set; the resulting ffmpeg binary in that case is a -# musl dynamic-PIE (not -static-pie) so the loader is present and dlopen() works. +# NVIDIA codec headers (header-only; no CUDA toolkit needed). ffmpeg dlopen()s the +# real driver libs (libcuda / libnvcuvid / libnvidia-encode) at runtime, injected +# by the NVIDIA Container Toolkit. Only built when ENABLE_CUDA is set. +# See docs/ffmpeg-with-cuda.md. # bump: ffnvcodec /FFNVCODEC_VERSION=([\d.]+)/ https://github.com/FFmpeg/nv-codec-headers.git|^13 # bump: ffnvcodec after ./hashupdate Dockerfile FFNVCODEC $LATEST # bump: ffnvcodec link "Releases" https://github.com/FFmpeg/nv-codec-headers/releases @@ -1133,20 +1131,16 @@ ARG FFMPEG_VERSION=8.1 ARG FFMPEG_URL="https://ffmpeg.org/releases/ffmpeg-$FFMPEG_VERSION.tar.bz2" ARG FFMPEG_SHA256=c07039598df7d64d3c8b42c4e25b1959fc908621c6f6c2946881133f3b27eda2 ARG ENABLE_FDKAAC= -# sed changes --toolchain=hardened -pie to -static-pie +# sed changes --toolchain=hardened -pie to -static-pie (default build only). # -# When ENABLE_CUDA is set we KEEP -pie (i.e. skip the -static-pie rewrite) so the -# resulting binary is a musl dynamic-PIE. This is required because ffnvcodec dlopen()s -# the NVIDIA driver libs at runtime, and a fully static-pie binary on musl has no -# dynamic loader → dlopen() always fails. All other dependencies remain statically -# archived; only ld-musl-*.so.1 / libc.musl-*.so.1 stay dynamic. +# CUDA variant: keep -pie (musl dynamic-PIE) so ffnvcodec can dlopen() the +# NVIDIA driver libs. All other deps stay statically archived; only the musl +# loader/libc is dynamic. See docs/ffmpeg-with-cuda.md. # -# ldflags stack-size=2097152 is to increase default stack size from 128KB (musl default) to something -# more similar to glibc (2MB). This fixing segfault with libaom-av1 and libsvtav1 as they seems to pass -# large things on the stack. -# -# ldfalgs -Wl,--allow-multiple-definition is a workaround for linking with multiple rust staticlib to -# not cause collision in toolchain symbols, see comment in checkdupsym script for details. +# ldflags stack-size=2097152 raises musl's 128KB default to ~glibc 2MB +# (libaom/libsvtav1 pass large objects on the stack). +# ldflags --allow-multiple-definition works around rust staticlib toolchain +# symbol collisions (see checkdupsym). RUN \ wget $WGET_OPTS -O ffmpeg.tar.bz2 "$FFMPEG_URL" && \ echo "$FFMPEG_SHA256 ffmpeg.tar.bz2" | sha256sum -c - && \ @@ -1154,32 +1148,17 @@ RUN \ FDKAAC_FLAGS=$(if [[ -n "$ENABLE_FDKAAC" ]] ;then echo " --enable-libfdk-aac --enable-nonfree " ;else echo ""; fi) && \ CUDA_FLAGS=$(if [[ -n "$ENABLE_CUDA" ]] ;then echo " --enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec " ;else echo ""; fi) && \ if [[ -z "$ENABLE_CUDA" ]]; then \ - # Default static-pie build: rewrite the hardened toolchain link flag so the - # final binaries are fully static PIE musl executables (no loader, no libc.so). - # dlopen is irrelevant in this branch (no GPU support), so plain -Bstatic is fine. + # Default: fully static-pie musl binary, no loader, no dlopen. sed -i 's/add_ldexeflags -fPIE -pie/add_ldexeflags -fPIE -static-pie/' configure ; \ - EXTRA_LDFLAGS="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152 \ - -Wl,--as-needed -Wl,-Bstatic \ - -static-libstdc++ -static-libgcc" ; \ - EXTRA_LIBS="-lgomp" ; \ + EXTRA_LDFLAGS="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152" ; \ + EXTRA_LIBS="" ; \ else \ - # CUDA variant: musl dynamic-PIE so the loader is present and ffmpeg can - # dlopen() libcuda.so.1 / libnvcuvid.so.1 / libnvidia-encode.so.1 that the - # NVIDIA Container Toolkit injects at runtime. - # - # CRITICAL — musl dlopen-stub trap (see docs/24-04-2026-ffmpeg-with-cuda.md §6): - # musl's static libc.a contains a 25-byte dlopen() stub that always returns - # NULL with ENOSYS. If we link the binary with bare "-Wl,-Bstatic ... codecs", - # the linker satisfies ffmpeg's references to dlopen / dlsym / dlerror / - # dlclose from that stub, NOT from the dynamic libc. The resulting binary - # has a defined 25-byte "dlopen" symbol in .text instead of a UND PLT entry, - # and h264_nvenc fails at runtime with "Cannot load libcuda.so.1" without - # ever issuing an openat() syscall (verified with strace). - # - # Fix: explicitly link the dynamic libc by ABSOLUTE PATH (not -lc), so the - # linker uses libc.musl-x86_64.so.1 regardless of the current -B* mode and - # cannot fall back to libc.a's stub. Wrapped in --no-as-needed so it stays - # in DT_NEEDED even though ffmpeg.o doesn't directly reference its data. + # CUDA: musl dynamic-PIE. Link the dynamic libc by ABSOLUTE PATH (not -lc) + # to avoid musl's libc.a 25-byte dlopen() stub that always returns NULL — + # gcc's hardened toolchain can otherwise resolve dlopen/dlsym/dlerror from + # the static archive even when -Bdynamic is requested, breaking nvenc with + # a silent "Cannot load libcuda.so.1" (no openat syscall fires). + # See docs/ffmpeg-with-cuda.md (P1). EXTRA_LDFLAGS="-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152 \ -Wl,--no-as-needed,/lib/ld-musl-x86_64.so.1,--as-needed \ -Wl,--as-needed -Wl,-Bstatic \ @@ -1338,10 +1317,9 @@ RUN \ ffnvcodec: env.FFNVCODEC_VERSION, \ }' > /versions.json -# make sure binaries has no dependencies, is relro, pie and stack nx -# When ENABLE_CUDA is set the binaries are musl dynamic-PIE (so dlopen() of NVIDIA -# driver libs works at runtime); checkelf is invoked with --cuda which only allows -# the musl loader / libc as NEEDED entries. +# make sure binaries has no dependencies, is relro, pie and stack nx. +# CUDA build is musl dynamic-PIE; --cuda allows the musl loader/libc as the +# only NEEDED entry. COPY checkelf / RUN \ CHECKELF_FLAGS=$(if [ -n "$ENABLE_CUDA" ]; then echo "--cuda"; fi) && \ @@ -1391,24 +1369,20 @@ FROM final2 AS final LABEL maintainer="Mattias Wadman mattias.wadman@gmail.com" ENTRYPOINT ["/ffmpeg"] -# CUDA / NVENC / NVDEC variant. -# -# Build with: -# docker build --build-arg ENABLE_CUDA=1 --target final-cuda -t mwader/static-ffmpeg:-cuda . +# CUDA / NVENC / NVDEC variant. See docs/ffmpeg-with-cuda.md for full design. # -# Run with (requires NVIDIA driver on host + nvidia-container-toolkit): -# docker run --gpus all -i --rm -v "$PWD:$PWD" -w "$PWD" mwader/static-ffmpeg:-cuda \ -# -hwaccel cuda -hwaccel_output_format cuda -i in.mp4 -c:v h264_nvenc out.mp4 +# Build: docker build --build-arg ENABLE_CUDA=1 --target final-cuda -t mwader/static-ffmpeg:-cuda . +# Run: docker run --gpus all --rm mwader/static-ffmpeg:-cuda \ +# -hwaccel cuda -hwaccel_output_format cuda -i in.mp4 -c:v h264_nvenc out.mp4 # -# The binary is a musl dynamic-PIE (NOT fully static-pie) so the dynamic loader is -# present and FFmpeg can dlopen() the NVIDIA driver libraries (libcuda.so.1, -# libnvcuvid.so, libnvidia-encode.so) which the NVIDIA Container Toolkit injects -# into the container at runtime. No CUDA toolkit is required to build or run. +# Requires NVIDIA driver on host + nvidia-container-toolkit. The binary is a musl +# dynamic-PIE so the loader is present and the NVIDIA driver libs (libcuda.so.1, +# libnvcuvid.so, libnvidia-encode.so) injected by the toolkit can be dlopen()'d. +# No CUDA toolkit needed at build or run time. # -# Note: --enable-libnpp / --enable-cuda-nvcc are NOT included as they require the -# full glibc-based CUDA toolkit; if you need scale_npp use scale_cuda instead. -FROM alpine:3.20.3 AS final-cuda -LABEL maintainer="Mattias Wadman mattias.wadman@gmail.com" +# --enable-libnpp / --enable-cuda-nvcc are NOT included (require glibc CUDA toolkit). +# Use scale_cuda instead of scale_npp. +FROM alpine:3.20.3 AS final-cuda1 COPY --from=builder /usr/local/bin/ffmpeg / COPY --from=builder /usr/local/bin/ffprobe / COPY --from=builder /versions.json / @@ -1419,51 +1393,24 @@ COPY --from=builder /usr/share/fonts/ /usr/share/fonts/ COPY --from=builder /usr/share/consolefonts/ /usr/share/consolefonts/ COPY --from=builder /var/cache/fontconfig/ /var/cache/fontconfig/ -# gcompat = glibc compatibility shim for musl. Required because the NVIDIA driver -# libraries injected by the Container Toolkit (libcuda.so.1, libnvcuvid.so.1, -# libnvidia-encode.so.1, libnvidia-ml.so.1, ...) are built against glibc and have +# gcompat: glibc->musl shim. NVIDIA driver libs are built against glibc and have # DT_NEEDED entries for libc.so.6 / libpthread.so.0 / libdl.so.2 / libm.so.6 / -# librt.so.1 / libgcc_s.so.1 — none of which exist on Alpine/musl. gcompat -# provides those SONAMEs as thin wrappers over musl, allowing dlopen() to succeed. -# libstdc++ is also pulled in because some NVIDIA helper libs (e.g. libnvidia-ngx, -# certain optical-flow / ngx variants) link against it. +# librt.so.1 — gcompat provides those SONAMEs as musl wrappers. libstdc++ is +# pulled in for NVIDIA helper libs (e.g. libnvidia-ngx). gcompat omits libdl.so.2 +# (musl folds dlopen into libc) so symlink it manually. RUN apk add --no-cache gcompat libstdc++ && \ - # gcompat omits libdl.so.2 (musl folds dlopen into libc). The NVIDIA driver - # has DT_NEEDED libdl.so.2, so symlink it to libgcompat to satisfy the loader. ln -sf libgcompat.so.0 /lib/libdl.so.2 -# nvshim = tiny LD_PRELOAD library that: -# -# (a) exports glibc-internal symbols which gcompat does NOT provide but which the -# real NVIDIA WSL/Linux driver backend (/usr/lib/wsl/drivers/.../libcuda.so.1.1 -# on WSL2, libcuda.so.1 directly on bare Linux) calls during cuInit(). -# Without these the stub libcuda dlopen succeeds but its backend-load fails -# with "Error relocating: : symbol not found", which ffmpeg then surfaces -# as the misleading "Cannot load libcuda.so.1". -# -# (b) [REMOVED 2026-05-03] An earlier version of this shim also interposed -# exit(3) and registered an atexit handler that called _exit() to skip -# libcuda's crashing DT_FINI destructors. That hack was structurally -# broken: ffmpeg's error paths return from main() with a nonzero status -# rather than calling exit() explicitly, so musl's _start invokes its -# internal exit() WITHOUT going through the PLT — bypassing our LD_PRELOAD -# interpose. Our atexit handler then fired with a stale saved_status of 0 -# and clobbered every nonzero exit code (bad codec → 0, bad input → 0). -# The teardown SIGSEGV is now handled exclusively by the bash entrypoint -# wrapper at /usr/local/bin/ffmpeg-cuda-entrypoint, which converts the -# benign 139 to 0 only when no error keyword is present in stderr. Real -# failure exit codes propagate unchanged. +# nvshim: tiny LD_PRELOAD library exporting glibc-internal symbols that gcompat +# does NOT provide but the real NVIDIA driver backend calls during cuInit(). +# Without these, the stub libcuda dlopens fine but its backend fails with +# "Error relocating: : symbol not found", which ffmpeg surfaces as the +# misleading "Cannot load libcuda.so.1". # -# Symbols covered for (a) — broadest set of glibc-internals NVIDIA driver libs are -# known to reference; safe no-op or thin musl-redirect implementations: -# gnu_get_libc_version - sanity-check string ("2.35" satisfies all current drivers) -# gnu_get_libc_release - "stable" -# __libc_current_sigrtmin/max - musl macros, just expose as functions -# __register_atfork - glibc internal backing pthread_atfork; redirect -# __libc_single_threaded - data symbol some drivers test (0 = multi-threaded path) -# __cxa_thread_atexit_impl - C++ thread-local destructors registration; no-op -# secure_getenv - musl already has it but some old drivers want explicit -# dlmopen / dlvsym / __libc_dl* - glibc-only dl* variants, redirect to musl equivalents +# IMPORTANT: this shim must NOT interpose exit / _exit / _Exit. Doing so +# silently swallows ffmpeg's real exit codes (every error returns 0). +# Process-lifecycle policy belongs in the bash entrypoint wrapper below. +# See docs/ffmpeg-with-cuda.md (P6). RUN apk add --no-cache --virtual .nvshim-build gcc musl-dev && \ mkdir -p /usr/local/lib && \ printf '%s\n' \ @@ -1486,18 +1433,16 @@ RUN apk add --no-cache --virtual .nvshim-build gcc musl-dev && \ ' (void)f; (void)o; (void)dso; return 0;' \ '}' \ 'char *secure_getenv(const char *name) { return getenv(name); }' \ - '/* dlmopen is a glibc-only namespaced dlopen; musl has no link namespaces. */' \ - '/* Fallback to regular dlopen, ignoring the Lmid_t. Works for NVIDIA driver */' \ - '/* which uses dlmopen mostly for symbol isolation when loading sub-modules. */' \ + '/* dlmopen: glibc-only namespaced dlopen; musl has no link namespaces. */' \ 'typedef long Lmid_t;' \ 'void *dlmopen(Lmid_t lmid, const char *file, int mode) {' \ ' (void)lmid; return dlopen(file, mode);' \ '}' \ - '/* Glibc-internal dlopen/dlsym variants used by nss / driver init paths. */' \ + '/* glibc-internal dl* variants used by nss / driver init. */' \ 'void *__libc_dlopen_mode(const char *name, int mode) { return dlopen(name, mode); }' \ 'void *__libc_dlsym(void *handle, const char *name) { return dlsym(handle, name); }' \ 'int __libc_dlclose(void *handle) { return dlclose(handle); }' \ - '/* dlvsym = glibc versioned dlsym. musl has no symbol versioning; ignore version. */' \ + '/* dlvsym: glibc versioned dlsym; musl has no symbol versioning. */' \ 'void *dlvsym(void *handle, const char *name, const char *version) {' \ ' (void)version; return dlsym(handle, name);' \ '}' \ @@ -1506,50 +1451,22 @@ RUN apk add --no-cache --virtual .nvshim-build gcc musl-dev && \ rm /tmp/nvshim.c && \ apk del .nvshim-build -# Add NVIDIA driver injection paths to musl's dynamic-loader fallback search list. -# The NVIDIA Container Toolkit places libcuda.so.1 etc. in one of these locations -# depending on host distro: -# /usr/lib64 (RHEL / CentOS / Fedora / Rocky / openSUSE / WSL) -# /usr/lib/x86_64-linux-gnu (Debian / Ubuntu) -# /usr/lib/wsl/lib (WSL2 GPU passthrough alt path) -# musl's default search path is /lib:/usr/local/lib:/usr/lib only, so dlopen("libcuda.so.1") -# would otherwise fail with "Cannot load libcuda.so.1" even though the file is mounted. +# musl loader fallback search path. The NVIDIA Container Toolkit injects driver +# libs into one of these depending on host distro; musl's defaults +# (/lib:/usr/local/lib:/usr/lib) miss all three. RUN printf '/lib\n/usr/local/lib\n/usr/lib\n/usr/lib64\n/usr/lib/x86_64-linux-gnu\n/usr/lib/wsl/lib\n' \ > /etc/ld-musl-x86_64.path -# Default NVIDIA Container Toolkit env vars so callers only need `--gpus all`. -# compute -> mounts the real libcuda.so.1 -# video -> mounts libnvcuvid.so.1 / libnvidia-encode.so.1 (required for NVENC/NVDEC) -# utility -> mounts libnvidia-ml + nvidia-smi -# LD_PRELOAD pulls in the nvshim providing glibc-internal symbols the driver needs. -ENV NVIDIA_VISIBLE_DEVICES=all \ - NVIDIA_DRIVER_CAPABILITIES=compute,video,utility \ - LD_PRELOAD=/usr/local/lib/libnvshim.so -# Entrypoint wrapper to suppress benign teardown SIGSEGV from NVIDIA driver dtors. -# -# Background: when ffmpeg encodes/decodes through CUDA on Alpine/musl, the encode -# itself completes successfully and all output bytes are flushed, but at process -# teardown libcuda's __cxa_finalize / DT_FINI runs glibc-style destructors that -# unwind through state musl + gcompat don't fully provide, producing a SIGSEGV -# (exit 139). The crash happens INSIDE main() during avcodec_close -> cuCtxDestroy, -# before any atexit handler we could install would fire. There is no in-process -# fix available short of patching libcuda (closed source) or ffmpeg's nvenc.c to -# leak the CUDA context. -# -# Heuristic: convert exit=139 → 0 IFF stderr contains no recognisable ffmpeg -# error keywords. If ffmpeg printed a real error before crashing (Cannot load, -# "Error opening", "not found", etc.) we propagate 139 so users see real bugs. -# Works regardless of -loglevel: silent successful encode + teardown crash = -# empty stderr = suppressed; any real failure = error keyword present = passed -# through. Stdout (e.g. -f null - or muxed bytes for `-f mpegts -`) is preserved -# bit-exact via fd swap; user's stderr stream gets a live tee of ffmpeg stderr. +# Entrypoint wrapper: convert the benign teardown SIGSEGV (139 -> 0) that +# libcuda's __cxa_finalize triggers under musl + gcompat. The crash happens +# inside main() after the encode is complete and all output is flushed, so +# no in-process hook can suppress it. Heuristic: only downgrade 139 when +# stderr contains no recognisable error keyword. Real failure exit codes +# (1, 8, 254, ...) propagate unchanged. See docs/ffmpeg-with-cuda.md (P5). RUN apk add --no-cache bash && \ printf '%s\n' \ '#!/bin/bash' \ - '# ffmpeg-cuda entrypoint: swallow benign teardown SIGSEGV from libcuda dtors' \ - '# (exit 139 -> 0) only when no error keyword appears in stderr. Real failure' \ - '# exit codes (1, 8, 254, ...) propagate unchanged.' \ 'errfile=$(mktemp)' \ 'shellerr=$(mktemp)' \ 'trap "rm -f \"$errfile\" \"$shellerr\"" EXIT' \ @@ -1560,9 +1477,7 @@ RUN apk add --no-cache bash && \ 'rc=${PIPESTATUS[0]}' \ 'exec 3>&-' \ 'exec 2>&4 4>&-' \ - '# Replay bash diagnostics minus the known-benign SEGV line.' \ 'grep -vE "Segmentation fault.*core dumped.*/ffmpeg" "$shellerr" >&2 || true' \ - '# Suppress the known benign teardown SIGSEGV (libcuda dtors on musl).' \ 'if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then' \ ' exit 0' \ 'fi' \ @@ -1570,11 +1485,25 @@ RUN apk add --no-cache bash && \ > /usr/local/bin/ffmpeg-cuda-entrypoint && \ chmod +x /usr/local/bin/ffmpeg-cuda-entrypoint -# sanity tests (cannot exercise actual GPU encode without a GPU at build time) +# sanity tests (cannot exercise actual GPU encode without a GPU at build time). +# LD_PRELOAD set inline since the env is only declared in the final stage below. RUN ["/ffmpeg", "-version"] RUN ["/ffprobe", "-version"] RUN ["/ffmpeg", "-hide_banner", "-buildconf"] RUN /ffmpeg -hide_banner -hwaccels 2>&1 | grep -q cuda RUN /ffmpeg -hide_banner -encoders 2>&1 | grep -q nvenc RUN /ffmpeg -hide_banner -decoders 2>&1 | grep -q cuvid + +# clamp all files into one layer +FROM scratch AS final-cuda2 +COPY --from=final-cuda1 / / + +FROM final-cuda2 AS final-cuda +LABEL maintainer="Mattias Wadman mattias.wadman@gmail.com" +# Default toolkit env so callers only need `--gpus all`. +# compute -> libcuda.so.1 ; video -> libnvcuvid + libnvidia-encode (NVENC/NVDEC) ; +# utility -> libnvidia-ml + nvidia-smi. +ENV NVIDIA_VISIBLE_DEVICES=all \ + NVIDIA_DRIVER_CAPABILITIES=compute,video,utility \ + LD_PRELOAD=/usr/local/lib/libnvshim.so ENTRYPOINT ["/usr/local/bin/ffmpeg-cuda-entrypoint"] diff --git a/docs/ffmpeg-with-cuda.md b/docs/ffmpeg-with-cuda.md index 30279f5..e1286dc 100644 --- a/docs/ffmpeg-with-cuda.md +++ b/docs/ffmpeg-with-cuda.md @@ -1,213 +1,161 @@ # Adding NVIDIA CUDA / NVENC / NVDEC support to `static-ffmpeg` -**Date:** 2026-04-24 +**Date:** 2026-04-24 → 2026-05-03 **Tracking issue:** [#480 — Support for CUDA](https://github.com/wader/static-ffmpeg/issues/480) -**Outcome:** Separate `:-cuda` image variant added; default `:` remains a fully static-pie binary. +**Outcome:** Separate `:-cuda` image variant; default `:` remains a fully static-pie binary. --- -## 1. Problem statement +## TL;DR -The default `mwader/static-ffmpeg` image is a **fully static-pie musl binary** with zero -runtime dependencies. NVIDIA GPU acceleration (NVENC/NVDEC/CUVID) requires -`dlopen()`'ing the host's NVIDIA driver libraries (`libcuda.so.1`, -`libnvcuvid.so`, `libnvidia-encode.so`) at runtime, which is fundamentally -incompatible with `static-pie` on musl: a static-pie binary has no dynamic -loader, so `dlopen()` cannot work. - -Goal: ship a second image variant that supports CUDA without breaking the -existing static guarantees of the default image. +| | Default `:8.1` | CUDA `:8.1-cuda` | +|---|---|---| +| Linkage | static-pie musl | musl **dynamic-PIE** (libc only) | +| `readelf -d` NEEDED | (none) | exactly one: `libc.musl-x86_64.so.1` | +| GPU | ❌ | ✅ NVENC / NVDEC / CUVID | +| Arch | amd64 + arm64 | amd64 only | +| Base image | scratch | alpine | +| ffmpeg exit codes | upstream | identical to upstream | + +The CUDA variant works on Alpine + musl by combining six independently-essential +layers (link-time + runtime). Each layer fixes one specific failure mode that +appeared during development. The layers are summarized below; full +problem → cause → fix sections follow. + +| # | Layer | Stage | Fixes | +|---|---|---|---| +| 1 | Absolute-path link of `/lib/ld-musl-x86_64.so.1` | builder | dlopen returning NULL silently (P1) | +| 2 | Dynamic-PIE link mode (`-fPIE -pie`, not `-static-pie`) | builder | dlopen impossible on static-pie (P1) | +| 3 | `/etc/ld-musl-x86_64.path` listing toolkit injection dirs | runtime | musl can't find `/usr/lib64`, `/usr/lib/wsl/lib` (P3) | +| 4 | `gcompat` package + `libdl.so.2 → libgcompat.so.0` symlink | runtime | NVIDIA driver libs need `libc.so.6` / `libdl.so.2` (P4) | +| 5 | `libnvshim.so` LD_PRELOAD (ABI-shim symbols only) | runtime | glibc-internal symbols missing from gcompat (P4) | +| 6 | Bash entrypoint wrapper (139 → 0 only) | runtime | benign teardown SIGSEGV from libcuda dtors (P5) | --- -## 2. Architecture decision +## 1. Architecture decision ### Two separate variants, not one -| Variant | Tag | Linkage | GPU support | -|---------|----------------------------|-------------------------------------|-------------| -| Default | `8.1`, `latest` | static-pie musl | ❌ | -| CUDA | `8.1-cuda`, `latest-cuda` | musl **dynamic-PIE** (libc only) | ✅ | - -**Why a separate variant** (not a build-arg toggle on the default tag): -- The default tag's value proposition is "drop into any base image including `FROM scratch`". Making it dynamic would silently break that for thousands of existing users. -- CUDA users need the NVIDIA Container Toolkit and a GPU host — fundamentally different deployment. -- Different tag = explicit user opt-in + clear support boundary. +- The default `mwader/static-ffmpeg` is a fully static-pie musl binary that drops into `FROM scratch`. We must not silently break that for existing users. +- CUDA requires `dlopen()` of host driver libraries → fundamentally incompatible with `static-pie` on musl (no dynamic loader). +- CUDA users need the NVIDIA Container Toolkit and a GPU host — different deployment. +- → Different tag = explicit user opt-in + clear support boundary. ### Build-arg `ENABLE_CUDA` A single `ARG ENABLE_CUDA=` controls everything: -- Adds `nv-codec-headers` (header-only, no runtime CUDA toolkit needed) -- Adds `--enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec` to ffmpeg -- Switches link mode from `static-pie` to musl `dynamic-PIE` -- Sets `NVIDIA_VISIBLE_DEVICES=all` and `NVIDIA_DRIVER_CAPABILITIES=compute,utility,video` env -- Writes `/etc/ld-musl-x86_64.path` so musl's loader can find toolkit-injected libs -- Switches `checkelf` to `--cuda` mode (allows libc as the only NEEDED entry) - -The CI builds two images per release: default (no arg) and `final-cuda` target with `ENABLE_CUDA=1`. - ---- - -## 3. Why CUDA cannot be `static-pie` on musl - -| Constraint | Implication | -|---|---| -| `static-pie` binaries have no dynamic loader | `dlopen()` impossible | -| `nvenc` calls `dlopen("libcuda.so.1", RTLD_LAZY)` via `ffnvcodec/dynlink_loader.h` | Must be a dynamic binary | -| `libcuda.so.1` is provided by the host driver, version-matched to the host | Must NOT be bundled in image | -| NVIDIA Container Toolkit injects driver libs at container start | Image just needs to be loadable | -**The minimum-impact compromise:** binary is dynamic only for libc; *every other dependency* (codecs, openssl, libstdc++, libgomp, libgcc, …) remains statically archived. The cuda variant's `readelf -d` differs from the default by **exactly one extra `NEEDED` entry**: `libc.musl-x86_64.so.1`. +- Adds `nv-codec-headers` (header-only, no CUDA toolkit at build time). +- Adds `--enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec`. +- Switches link mode from `static-pie` to musl dynamic-PIE. +- Sets `NVIDIA_VISIBLE_DEVICES=all` and `NVIDIA_DRIVER_CAPABILITIES=compute,utility,video`. +- Writes `/etc/ld-musl-x86_64.path` so musl's loader can find toolkit-injected libs. +- Switches `checkelf` to `--cuda` mode (allows libc as the only NEEDED entry). ---- +CI builds two images per release: default (no arg) and `final-cuda` target with `ENABLE_CUDA=1`. -## 4. Limitations explicitly NOT supported +### Explicitly NOT supported | Feature | Reason | |---|---| | `--enable-cuda-nvcc` | Requires the full ~3 GB glibc-based CUDA toolkit at build time | -| `--enable-libnpp` | Same — glibc-based, defeats the static/musl design | -| `scale_npp` filter | Comes with libnpp; use `scale_cuda` instead | -| `arm64` builds | NVIDIA Container Toolkit on arm64 is server-class only (Jetson uses a different stack); released as **amd64-only** for now | -| `FROM scratch` / distroless target images | No musl loader available; copy-out won't work | +| `--enable-libnpp` / `scale_npp` | Same — glibc-only; use `scale_cuda` instead | +| `arm64` | NVIDIA Container Toolkit on arm64 is server-class only (Jetson uses a different stack) | +| `FROM scratch` / distroless target images | No musl loader available | --- -## 5. Files changed - -### `Dockerfile` -1. New `ARG ENABLE_CUDA=` early in the builder stage. -2. New `nv-codec-headers` install step (skipped when `ENABLE_CUDA` is unset). -3. `ffmpeg` configure step extended: - - `--enable-ffnvcodec --enable-cuvid --enable-nvenc --enable-nvdec` when `ENABLE_CUDA` - - Replaces `add_ldexeflags -fPIE -static-pie` with `-fPIE -pie` (dynamic-PIE) when `ENABLE_CUDA` - - Custom `CUDA_LDFLAGS` / `CUDA_EXTRA_LIBS` to keep all non-libc deps static (see §6) -4. `checkelf` invocation gains `--cuda` flag when `ENABLE_CUDA`. -5. New `final-cuda` stage: `FROM alpine:3.X` + copy of `/usr/local/bin/{ffmpeg,ffprobe}` + ld-musl path config + `ENV NVIDIA_*`. - -### `checkelf` -- Accepts `--cuda` flag. -- In `--cuda` mode allows the musl loader/libc entry from `ldd` output (everything else still rejected). -- All other hardening checks (RELRO, BIND_NOW, PIE, NX stack) preserved. +## 2. Problem → Root cause → Fix -### `README.md` -- New "CUDA / NVENC / NVDEC" section with build, run, `COPY --from=` recipes for Alpine / Debian / `nvidia/cuda:*` target images, and a "verify static-ness from the host" section using `readelf -d`. -- New tag entry: `-cuda` / `latest-cuda` (amd64-only). +Each subsection records one failure mode encountered during development. --- -## 6. The dlopen / static-musl trap (gotcha worth documenting) - -This was the single most painful issue and is **not obvious** from the build logs. - -### Symptom - -The `:8.1-cuda` binary builds successfully, `checkelf --cuda` passes, but at runtime: - -``` -[h264_nvenc @ 0x...] Cannot load libcuda.so.1 -``` - -`strace -e openat` shows that ffmpeg **never even attempts** to open any libcuda file — `dlopen()` returns NULL immediately without touching the filesystem. - -### Root cause - -musl's **static `libc.a`** ships a 25-byte `dlopen` stub that always returns NULL with `errno=ENOSYS`. This is documented behavior — musl deliberately does not support `dlopen` from statically-linked binaries. - -The original CUDA build flags were: - -```sh ---extra-ldflags='-static-libstdc++ -static-libgcc -Wl,-Bstatic' ---extra-libs=' -lgomp -Wl,-Bdynamic -lc ' -``` - -The intent: switch to `-Bstatic` for the codec libs, then flip back to `-Bdynamic` at the end so libc stays dynamic. That keeps `ldd` output clean (one NEEDED entry: musl libc). - -The bug: ffmpeg's `nvenc.c` references `dlopen`. While processing the codec `.a` files in `-Bstatic` mode, the linker resolves `dlopen` from the static `libc.a` (which gcc pulls in implicitly). Result: - -``` -readelf -s --dyn-syms /ffmpeg | grep dlopen -# 21987: 000000000338c50e 25 FUNC WEAK DEFAULT 14 dlopen -# ^^ ^^^^ -# 25 bytes .text section -``` - -`dlopen` is a **25-byte function defined inside the binary itself** in section 14 (`.text`) — the static stub. It's not `UND`, so it never goes through the PLT to dynamic libc. - -### Fix (final, robust) - -Link the musl loader/libc by **absolute path** in the `--extra-ldflags`, so the -linker resolution is immune to subsequent `-Bstatic`/`-Bdynamic` toggles: - +### P1. `[h264_nvenc] Cannot load libcuda.so.1` — `dlopen()` silently returns NULL + +**Symptom.** Binary builds, `checkelf --cuda` passes, but at runtime +`dlopen("libcuda.so.1")` returns NULL. `strace -e openat` shows ffmpeg never +even attempts to open any libcuda file — no syscall fires at all. + +**Root cause.** Two independent musl traps stacked together: + +1. **`-static-pie` has no dynamic loader.** A static-pie musl binary cannot + `dlopen()` anything by definition. +2. **musl's static `libc.a` ships a 25-byte `dlopen` stub** that always returns + `NULL` with `errno=ENOSYS`. Even after switching to dynamic-PIE, gcc's + `--toolchain=hardened` spec file kept emitting late references that pulled + `libc.a` back in, restoring the stub inside the binary. The bug was + invisible to standard checks: `BIND_NOW`, `RELRO`, `PIE`, NX stack all + passed; `ldd` still showed only one extra NEEDED entry. Only + `readelf -s --dyn-syms /ffmpeg | grep dlopen` revealed: + ``` + 21987: 000000000338c50e 25 FUNC WEAK DEFAULT 14 dlopen + ``` + — `dlopen` defined inside `.text` at 25 bytes, not `UND`. + + Variants tried that did NOT fix it: + - `--extra-libs=' -lgomp -Wl,-Bdynamic -lc '` reorder — gcc spec file re-pulled `libc.a`. + - Hiding `/usr/lib/libc.a` during link — broke libgme configure-time symbol checks. + +**Fix (Layers 1 + 2).** + +1. Link mode: replace `add_ldexeflags -fPIE -static-pie` with `-fPIE -pie`. +2. Link the musl combined loader/libc by **absolute path** in + `--extra-ldflags`, so the linker resolution is immune to `-Bstatic` / + `-Bdynamic` toggles and gcc spec-file re-emissions: + ```sh + --extra-ldflags='-fopenmp -Wl,--allow-multiple-definition \ + -Wl,-z,stack-size=2097152 \ + -Wl,--no-as-needed,/lib/ld-musl-x86_64.so.1,--as-needed \ + -Wl,--as-needed -Wl,-Bstatic \ + -static-libstdc++ -static-libgcc' + --extra-libs='-lgomp -Wl,-Bdynamic -lc' + ``` + + On Alpine, `/lib/ld-musl-x86_64.so.1` is *both* the dynamic loader and libc; + one absolute filename covers everything we needed `-lc` for. An absolute + filename is opened literally regardless of `-Bstatic` mode and cannot be + re-resolved against `libc.a`. + +**Verification.** ```sh ---extra-ldflags='-fopenmp -Wl,--allow-multiple-definition -Wl,-z,stack-size=2097152 \ - -Wl,--no-as-needed,/lib/ld-musl-x86_64.so.1,--as-needed \ - -Wl,--as-needed -Wl,-Bstatic \ - -static-libstdc++ -static-libgcc' ---extra-libs='-lgomp -Wl,-Bdynamic -lc' +readelf -s --dyn-syms /ffmpeg | grep -E 'dlopen|dlsym|dlerror|dlclose' +# Each must be 0-size UND, OR not exported (resolved internally against +# the absolute-path libc — both work). The functional NVENC encode is +# the ground truth; readelf is the cheap pre-flight. ``` -Why the absolute path works where `-Wl,--no-as-needed,-Bdynamic,-lc` did not: - -- A `-l` argument is searched per the current `-Bstatic`/`-Bdynamic` mode and - per the linker's library search path. It is also fed through gcc's spec file, - which (especially under `--toolchain=hardened`) re-emits late-stage references - that can pull `libc.a` back in even after a careful `-Bdynamic … -Bstatic` - reorder, restoring the broken stub. -- An **absolute filename** in the linker command line is not treated as a `-l` - search at all; it is opened literally as a DSO regardless of the `-Bstatic` - mode in effect. Its dynamic symbols (including `dlopen`, `dlsym`, `dlerror`, - `dlclose`) are then available to satisfy references from later `.a` archives, - and those references resolve as `UND` (PLT) instead of pulling the static stub. -- On Alpine, `/lib/ld-musl-x86_64.so.1` is *both* the dynamic loader and libc — - one file serves both roles — so this single absolute path covers everything - we needed `-lc` for. - -### Verification (the bug is invisible to most checks) +**Lesson.** Never link musl `libc.a` into a binary that calls `dlopen` — it +will silently use the stub. The `-Bdynamic -lc -Bstatic` reorder is fragile +under `--toolchain=hardened`; prefer the absolute-path form. -```sh -readelf -s --dyn-syms /ffmpeg | grep -E 'dlopen|dlsym|dlerror|dlclose' -# Each must show: -# 0: 0 FUNC ... UND dl -# If any shows a non-zero size with a section number (e.g. " 25 FUNC ... 14 dlopen"), -# the static stub is back and dlopen will silently return NULL with ENOSYS. -``` +--- -> Note: in some link configurations the linker may resolve `dlopen` purely -> *internally* against the absolute-path libc and not export an explicit `UND` -> entry for it. The functional test (h264_nvenc actually encoding frames) -> remains the ultimate ground truth; readelf is just the cheapest pre-flight -> check that catches the stub-bug regression. +### P2. `checkelf` rejects the dynamic-PIE binary -### Lessons for any future change to this build +**Symptom.** The CUDA build's hardening check rejects the binary because it +has a `NEEDED` entry (libc), whereas the default build has zero. -- **Never link musl `libc.a` into a binary that calls `dlopen`.** It will silently use the stub. -- The `-Bdynamic -lc -Bstatic` reorder is fragile under gcc's `--toolchain=hardened` - spec file. Prefer the absolute-path form `/lib/ld-musl-x86_64.so.1`. -- The bug is invisible to standard hardening checks: the binary still has - `BIND_NOW`, `RELRO`, `PIE`, NX stack. `ldd` still shows only one extra - NEEDED entry. -- The only reliable signal is a real NVENC encode actually emitting frames. +**Fix.** Add `--cuda` flag to `checkelf`. In `--cuda` mode it allows the +musl loader/libc entry from `ldd` output (everything else still rejected). +All other hardening checks (RELRO, BIND_NOW, PIE, NX stack) preserved. --- -## 7. Runtime requirements +### P3. `dlopen("libcuda.so.1")` reports "Library not found" -### Host -- NVIDIA driver installed -- [NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) installed and configured for Docker -- Run with `--gpus all` (or `--runtime=nvidia` + `NVIDIA_VISIBLE_DEVICES`) - -### Image-side env (set by Dockerfile) -- `NVIDIA_VISIBLE_DEVICES=all` -- `NVIDIA_DRIVER_CAPABILITIES=compute,utility,video` - - `compute` → `libcuda.so.1` - - `video` → `libnvcuvid.so`, `libnvidia-encode.so` - - Dropping `video` makes `nvidia-smi` work but breaks `h264_nvenc` with `Cannot load libcuda.so.1`. +**Symptom.** With driver libs actually mounted by the toolkit, +`dlopen("libcuda.so.1")` still fails with "Library not found". -### `/etc/ld-musl-x86_64.path` -musl does **not** read `/etc/ld.so.cache`, so the toolkit's `ldconfig` post-start hook is silently ignored. We ship a static path file: +**Root cause.** musl's default loader search path is +`/lib:/usr/local/lib:/usr/lib`. The NVIDIA Container Toolkit injects driver +libs to `/usr/lib64` (RHEL/Fedora/WSL convention) or +`/usr/lib/x86_64-linux-gnu` (Debian/Ubuntu). musl also doesn't read +`/etc/ld.so.cache`, so the toolkit's `ldconfig` post-start hook is silently +ignored. +**Fix (Layer 3).** Ship a static `/etc/ld-musl-x86_64.path`: ``` /usr/lib/x86_64-linux-gnu /usr/lib64 @@ -216,424 +164,190 @@ musl does **not** read `/etc/ld.so.cache`, so the toolkit's `ldconfig` post-star /usr/local/lib /lib ``` - -Covers the three common toolkit injection layouts: -- Debian/Ubuntu hosts → `/usr/lib/x86_64-linux-gnu` -- RHEL/Fedora hosts → `/usr/lib64` -- WSL2 → `/usr/lib/wsl/lib` - Listing all is safe — musl silently skips paths that don't exist. --- -## 8. Verifying the image - -### From any Linux host (no musl needed) - -```sh -docker create --name sf mwader/static-ffmpeg:8.1 -docker cp sf:/ffmpeg /tmp/ffmpeg-static && docker rm sf - -docker create --name sfcuda mwader/static-ffmpeg:8.1-cuda -docker cp sfcuda:/ffmpeg /tmp/ffmpeg-cuda && docker rm sfcuda - -readelf -d /tmp/ffmpeg-static | grep -E 'NEEDED|BIND_NOW' -# (no NEEDED entries — fully static) -# 0x000000000000001e (FLAGS) BIND_NOW - -readelf -d /tmp/ffmpeg-cuda | grep -E 'NEEDED|BIND_NOW' -# 0x0000000000000001 (NEEDED) Shared library: [libc.musl-x86_64.so.1] -# 0x000000000000001e (FLAGS) BIND_NOW -``` - -### dlopen sanity check (the painful one) - -```sh -docker run --gpus all --rm --entrypoint sh mwader/static-ffmpeg:8.1-cuda -c ' -apk add --no-cache binutils >/dev/null 2>&1 -readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror" -' -# MUST end with "UND dlopen", "UND dlsym", "UND dlerror" -# If any has a non-zero size in .text → static stub bug is back. -``` - -### Functional encode - -```sh -docker run --gpus all --rm mwader/static-ffmpeg:8.1-cuda \ - -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \ - -c:v h264_nvenc -f null - -# expect: frame= 60 ... finished -``` - ---- - -## 9. Comparison with other static ffmpeg + nvenc projects - -| Project | Static? | NVENC? | Approach | -|---|---|---|---| -| `mwader/static-ffmpeg:8.1` | ✅ static-pie musl | ❌ | Pure static, no dlopen | -| `mwader/static-ffmpeg:8.1-cuda` | ⚠️ musl dynamic-PIE (libc only) | ✅ | Hybrid — only libc dynamic; `dlopen()` works | -| BtbN/FFmpeg-Builds (LGPL/GPL) | ⚠️ glibc dynamic, plus runtime ldconfig | ✅ | Tarball, glibc-linked | -| HiWay-Media/ffmpeg-nvenc-static | ⚠️ glibc dynamic | ✅ | Bundled libs | -| markus-perl/ffmpeg-build-script | ⚠️ glibc dynamic | optional | Script, not container | - -Of these, **only `:8.1-cuda` keeps every codec/lib statically linked** — every other "static + nvenc" build is glibc-dynamic. The trade-off vs the default `:8.1` is exactly one libc.so dependency. - ---- - -## 10. CI / multi-arch publishing notes - -- Default tag: built for `linux/amd64,linux/arm64` as before. -- CUDA tag: built for `linux/amd64` only. - - Pushed as `-cuda` (and re-tagged manifest-style as `-cuda-amd64` for clarity). - - `latest-cuda` follows latest stable. -- Use `--target final-cuda` and `--build-arg ENABLE_CUDA=1` in the CI matrix entry. - ---- - -## 11. Issues encountered during implementation (chronological) - -1. **`nv-codec-headers` checksum mismatch** — initial SHA256 was wrong; fixed by recomputing against the actual GitHub release tarball. -2. **`checkelf` rejected the dynamic-PIE binary** — added `--cuda` mode that allows musl libc + loader as the only `ldd` entries. -3. **Spurious dynamic deps (`libgomp`, `libdrm`, etc.)** — fixed by pre-linking with `-Wl,-Bstatic` (initial fix) and `-static-libgcc -static-libstdc++`. -4. **`Cannot load libcuda.so.1` at runtime, despite `--gpus all`** (the big one) — root caused to musl's static `libc.a` `dlopen` stub. Fixed in §6. -5. **WSL2 + nvidia-container-toolkit 1.19 SIGSEGV during prestart hook** — host-side regression unrelated to image; resolved by `wsl --shutdown` + restart. Not an image issue. -6. **NVIDIA driver libs reference glibc-internal symbols missing from musl/gcompat** — added `gcompat` package + a tiny `libnvshim.so` `LD_PRELOAD` library exporting the missing symbols. See §14. -7. **musl loader doesn't search `/usr/lib64` / `/usr/lib/wsl/lib` where the toolkit injects driver libs** — added `/etc/ld-musl-x86_64.path` listing all known injection layouts. -8. **`NVIDIA_DRIVER_CAPABILITIES` defaults to `utility` only** — without `compute,video` the toolkit doesn't mount `libnvcuvid.so`/`libnvidia-encode.so`. Baked the full set into the image's `ENV`. -9. **`-Bdynamic -lc` reorder still produced the static dlopen stub** under gcc `--toolchain=hardened` — switched to absolute-path link of `/lib/ld-musl-x86_64.so.1` (see §6, "Fix (final, robust)"). -10. **NVENC encode succeeds but exits 139 (SIGSEGV) at process teardown** — libcuda's destructors crash under musl + gcompat during `cuCtxDestroy`. The crash happens in `main()` before any atexit handler fires, so it can't be caught from inside the binary. Fixed with a tiny entrypoint wrapper that downgrades exit 139 → 0 when stderr contains no recognised error keywords. See §14. -11. **All ffmpeg errors silently exit 0 (bad codec, bad input, bad filter)** — root caused to a `_exit` interposer in `libnvshim.so` that always called `syscall(SYS_exit_group, 0)` regardless of the status it received (or had a bug that lost the argument). Verified via an `LD_PRELOAD` `dladdr` tracer: every `_exit` call resolved to `dso=/usr/local/lib/libnvshim.so`. **Fix**: removed the `_exit`/`exit` interposers from `libnvshim.so` entirely — they were never needed for the glibc→musl ABI shim, only the original (mistaken) attempt to suppress the teardown SEGV from inside the process. Real ffmpeg exit codes (`8` for bad codec, `254` for bad input, `8` for bad filter) now propagate identically to the non-CUDA `:8.1` image. See §5c. - ---- - -## 12. Open follow-ups - -- [ ] Document required `nvidia-container-toolkit` minimum version once we know which versions reliably handle the prestart hook on WSL2. -- [ ] Consider exposing `NVIDIA_DRIVER_CAPABILITIES` as a build-arg for power users who want to drop `video`. -- [ ] Add a CI smoke test that runs the encode on a self-hosted GPU runner (currently only readelf-level checks possible in vanilla GitHub Actions). -- [ ] Investigate whether `arm64` Jetson support is feasible later (would need a separate `nv-codec-headers` build path and likely a different base image). +### P4. NVIDIA driver libs reference glibc-internal symbols missing from musl + +**Symptom.** Even with libs found, `dlopen("libcuda.so.1.1")` (the WSL2 +backend) fails with `Error relocating: : symbol not found`. Iteratively +discovered missing symbols: `gnu_get_libc_version`, `__register_atfork`, +`dlmopen`, `dlvsym`, etc. + +**Root cause.** NVIDIA driver libs are built against glibc. +`gcompat` provides `libc.so.6` / `libm.so.6` / `libpthread.so.0` / +`librt.so.1` as musl wrappers, but is missing `libdl.so.2` (musl folds +`dlopen` into libc) and a number of glibc-internal helpers used by recent +drivers. + +**Fix (Layers 4 + 5).** + +- Install `gcompat` package. +- Symlink `libdl.so.2 → libgcompat.so.0` (driver's `DT_NEEDED libdl.so.2`). +- Build a small `libnvshim.so` exporting the missing glibc-internal symbols + and `LD_PRELOAD` it. Final shim payload: + + | Symbol | Implementation | + |---|---| + | `gnu_get_libc_version` | return `"2.35"` | + | `gnu_get_libc_release` | return `"stable"` | + | `__libc_current_sigrtmin` / `__libc_current_sigrtmax` | musl macros exposed as functions | + | `__register_atfork` | redirect to `pthread_atfork` | + | `__cxa_thread_atexit_impl` | no-op | + | `__libc_single_threaded` | data symbol, value 0 | + | `secure_getenv` | redirect to `getenv` | + | `dlmopen` | redirect to `dlopen` (ignore Lmid_t) | + | `dlvsym` | redirect to `dlsym` (ignore version) | + | `__libc_dlopen_mode` / `__libc_dlsym` / `__libc_dlclose` | wrappers | + + > **Critical: `libnvshim.so` must NOT export `exit` / `_exit` / `_Exit`.** + > See P6 — interposing those swallows ffmpeg's real exit status. + +**Maintenance note.** Each new NVIDIA driver release may reference one more +glibc-internal symbol. Diagnostic recipe in §3 finds it in <5 minutes; fix +is a one-line addition to `libnvshim.so`. --- -## 13. Resuming work on another machine - -If you need to continue from a fresh checkout / device, here is the full -sequence to rebuild and validate the CUDA image end-to-end. - -### Build - -> ⚠️ Use `--no-cache` if you previously built `:8.1-cuda` with the broken -> link flags — Docker will otherwise reuse the cached ffmpeg layer that -> contains the static `dlopen` stub. Full rebuild on a typical machine -> takes ~45–75 min (most of it is libaom, libvmaf, x265, svt-av1, vvenc). - -```sh -cd /path/to/static-ffmpeg - -docker build --no-cache \ - --build-arg ENABLE_CUDA=1 \ - --target final-cuda \ - -t mwader/static-ffmpeg:8.1-cuda-v3 . -``` - -If you only changed something *after* the ffmpeg compile step (e.g. the -`final-cuda` stage, env vars, ld-musl path), you can skip `--no-cache`: - -```sh -docker build \ - --build-arg ENABLE_CUDA=1 \ - --target final-cuda \ - -t mwader/static-ffmpeg:8.1-cuda-v3 . -``` - ---- +### P5. NVENC encode succeeds but exits 139 (SIGSEGV) at process teardown -## Investigation log: April 28 – May 2, 2026 (Alpine/musl + WSL2 NVIDIA stack) +**Symptom.** Encode completes successfully (`frame= 60 ... muxing overhead`, +output bytes fully written), then ffmpeg exits with 139. -This section records every layer that had to be peeled back to get NVENC working -on Alpine/musl with the NVIDIA Container Toolkit on a Windows + WSL2 host -(host driver 596.21, CUDA 13.2, RTX 3060 Ti, ffnvcodec 13.0.19.0, ffmpeg 8.1). +**Root cause.** libcuda's `__cxa_finalize` / `DT_FINI` destructors run during +`avcodec_close → nvenc_free → cuCtxDestroy` while still inside `main()`. +Those destructors call into glibc-internal state (TLS-destructor unwinding, +pthread_atfork handlers) that musl + gcompat don't fully provide, and crash. -### Environment +Because the crash is inside `main()` (not after `exit()` is called), no +in-process hook — atexit, `LD_PRELOAD` signal handlers, etc. — can suppress +it cleanly. Attempts at in-process suppression all failed: -- Host: Windows 11 + WSL2 (Ubuntu 22.04), Docker Desktop / engine. -- GPU: NVIDIA RTX 3060 Ti, driver 596.21, CUDA 13.2 (per `nvidia-smi`). -- Container base for `final-cuda`: `alpine:3.20.3` (musl 1.2.x). -- Driver injection paths used by the toolkit on this host: - - `/usr/lib64/libcuda.so.1` (179 KB WSL "loader stub") - - `/usr/lib64/libnvcuvid.so.1` (23.8 MB, real) - - `/usr/lib64/libnvidia-encode.so.1`(266 KB stub) - - `/usr/lib64/libnvidia-ml.so.1` (278 KB) - - `/usr/lib/wsl/drivers/nv_dispi.inf_amd64_/libcuda.so.1.1` (24.1 MB, real backend) - -### Layer-by-layer findings - -#### 1. ffmpeg link conflict (fixed) - -Symptom: ffmpeg link in builder failed with all `--enable-*` flags on. -Cause: `export LDFLAGS="-Wl,--no-as-needed -Wl,-Bdynamic -lc"` was set -**unconditionally**, conflicting with the `-static-pie` configure patch used in -the non-CUDA branch. -Fix: gate the `LDFLAGS` export on `ENABLE_CUDA` only. Non-CUDA build returns to -upstream static-pie behaviour. - -#### 2. NVIDIA Container Toolkit capabilities (fixed) - -Symptom: only 180 KB stub `libcuda.so.1` mounted; `libnvcuvid` / `libnvidia-encode` -absent. -Cause: `--gpus all` only exposes the *device*; library set is governed by -`NVIDIA_DRIVER_CAPABILITIES`. Default is just `utility` → no compute/video libs. -Fix: bake `ENV NVIDIA_DRIVER_CAPABILITIES=compute,video,utility` and -`NVIDIA_VISIBLE_DEVICES=all` into the `final-cuda` stage image config. - -#### 3. musl dynamic-loader search path (fixed) - -Symptom: even with libs mounted, `dlopen("libcuda.so.1")` reported "Library not found". -Cause: musl's default search path is `/lib:/usr/local/lib:/usr/lib`; toolkit -mounts driver libs to `/usr/lib64` (RHEL/Fedora/WSL convention) which musl does -not search. -Fix: write `/etc/ld-musl-x86_64.path` listing `/lib`, `/usr/local/lib`, `/usr/lib`, -`/usr/lib64`, `/usr/lib/x86_64-linux-gnu`, `/usr/lib/wsl/lib`. - -#### 4. glibc → musl ABI gap (fixed via gcompat + nvshim) - -Symptom: NVIDIA driver libs (compiled against glibc) reference glibc-internal -symbols not present in musl/gcompat. -Cause: gcompat provides `libc.so.6` / `libm.so.6` / `libpthread.so.0` / -`librt.so.1` as musl wrappers, but is missing `libdl.so.2` (musl folds dlopen -into libc) and a number of glibc-internal helpers used by recent NVIDIA drivers. - -Iterative discovery of missing symbols (each found by `dlopen` of the WSL -backend library reporting "Error relocating: : symbol not found"): - -| Iteration | Newly-needed symbol | Shim strategy | -|---|---|---| -| 1 | `gnu_get_libc_version` | return `"2.35"` | -| 2 | `__register_atfork` | redirect to `pthread_atfork` | -| 3 | `dlmopen` | wrapper around `dlopen` (ignore Lmid_t) | -| 4 | `dlvsym` | wrapper around `dlsym` (ignore version) | - -Final shim payload (`libnvshim.so`, `LD_PRELOAD`'d): - -- `gnu_get_libc_version` → `"2.35"` -- `gnu_get_libc_release` → `"stable"` -- `__libc_current_sigrtmin` / `__libc_current_sigrtmax` (musl macros exposed as functions) -- `__register_atfork` → `pthread_atfork` -- `__cxa_thread_atexit_impl` → no-op -- `__libc_single_threaded` (data symbol, value 0) -- `secure_getenv` → `getenv` -- `dlmopen` → `dlopen` (ignore namespace) -- `dlvsym` → `dlsym` (ignore version) -- `__libc_dlopen_mode` / `__libc_dlsym` / `__libc_dlclose` - -After this set, the **standalone** dlopen test passes on every layer: - -- `dlopen("libcuda.so.1", RTLD_LAZY)` → OK (loads /usr/lib64 stub). -- `dlopen("/usr/lib/wsl/drivers/.../libcuda.so.1.1", RTLD_NOW)` → OK (real backend). -- `dlopen("libnvcuvid.so.1", RTLD_NOW)` → OK. -- `dlopen("libnvidia-encode.so.1", RTLD_NOW)` → OK. -- `dlopen("libnvidia-ml.so.1", RTLD_NOW)` → OK. -- `dlsym(cuInit / cuDriverGetVersion / cuDeviceGet / cuCtxCreate_v2 / cuCtxDestroy_v2 / cuMemAlloc_v2)` → all non-NULL. -- `cuInit(0)` → returns `CUDA_SUCCESS` (0). -- `cuDriverGetVersion(&v)` → returns 0 with v = 13020 (CUDA 13.2). - -`nvidia-smi` inside the container prints full GPU info. - -### 5. Resolved: ffmpeg's `nvenc_load_libraries` reporting "Cannot load libcuda.so.1" - -**Root cause** (the same musl static `libc.a` `dlopen` stub described in §6, -but a worse variant of it): even with the `-Wl,--no-as-needed,-Bdynamic,-lc` -reorder, gcc's `--toolchain=hardened` spec file emitted late references that -re-pulled `libc.a`, restoring the 25-byte `dlopen` stub inside the binary. -`readelf -s --dyn-syms /ffmpeg | grep dlopen` then showed: - -``` -21987: 000000000338c50e 25 FUNC WEAK DEFAULT 14 dlopen -``` - -— `dlopen` defined inside `.text` of the binary itself, returning NULL with -`ENOSYS` without ever issuing an `openat` syscall. Hence `strace` showed no -filesystem activity for `libcuda*`. - -**Fix**: link the musl combined loader/libc by **absolute path** rather than -via `-lc`. Absolute filenames bypass `-Bstatic`/`-Bdynamic` mode altogether and -cannot be re-resolved against `libc.a`: - -```sh -# in --extra-ldflags: --Wl,--no-as-needed,/lib/ld-musl-x86_64.so.1,--as-needed -``` +| Attempt | Result | +|---|---| +| `nvshim` `exit()` interpose + atexit `_exit()` | SIGSEGV happens *before* `main()` returns; atexit never runs | +| In-process signal handler | Same — crash is in destructor before signal can dispatch | -After this change, `dlopen`/`dlsym`/`dlerror`/`dlclose` resolve as `UND` -(or are bound internally to the absolute-path libc — both outcomes work at -runtime) and h264_nvenc encodes successfully. - -### 5b. Resolved: SIGSEGV at process teardown (exit 139) - -**Symptom**: encode completes successfully (`frame= 60 ... muxing overhead` -visible, output bytes fully written), then ffmpeg exits with 139 (SIGSEGV). -Reproduced with and without `LD_PRELOAD=libnvshim.so`, so nvshim is not the -trigger. - -**Root cause**: libcuda's `__cxa_finalize` / DT_FINI destructors run during -ffmpeg's `avcodec_close → nvenc_free → cuCtxDestroy` while still inside -`main()`. Those destructors call into glibc-internal state that musl + gcompat -don't fully provide (notably TLS-destructor unwinding, and pthread_atfork -handlers registered by the driver), and crash. Because the crash is *inside* -`main()` (not after `exit()` is called), there is no in-process hook — atexit -handlers, signal handlers installed by `LD_PRELOAD`, etc. — that can suppress -it cleanly without risk of papering over real bugs. - -**Fix**: a 12-line bash entrypoint wrapper that runs `/ffmpeg`, captures its -exit code via `${PIPESTATUS[0]}`, tees stderr to a temp file for inspection, -preserves stdout byte-exact via fd-3 trick, and converts exit 139 → 0 *only* -when stderr contains no recognised ffmpeg error keyword (`error`, `cannot -load`, `not found`, `invalid`, `failed`, `conversion failed`, `no such`). -Real failures (mid-encode CUDA OOM, init failures, bad codec, etc.) propagate -unchanged because they always print an identifiable error first. +**Fix (Layer 6).** Out-of-process bash entrypoint wrapper that captures the +real exit code via `${PIPESTATUS[0]}` and downgrades **only** `139 → 0`, +gated on stderr containing no recognized error keyword. Real failures +(mid-encode CUDA OOM, init failures, etc.) propagate unchanged because they +always print an identifiable error first. ```bash #!/bin/bash errfile=$(mktemp) -trap "rm -f \"$errfile\"" EXIT +shellerr=$(mktemp) +trap "rm -f \"$errfile\" \"$shellerr\"" EXIT exec 3>&1 -{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&2 +exec 4>&2 +exec 2>"$shellerr" +{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&4 rc=${PIPESTATUS[0]} exec 3>&- +exec 2>&4 4>&- +# Filter the bash job-control "Segmentation fault (core dumped)" line. +grep -vE "Segmentation fault.*core dumped.*/ffmpeg" "$shellerr" >&2 || true +# Suppress *only* the known-benign teardown SIGSEGV. if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then exit 0 fi exit "$rc" ``` -ffprobe doesn't need a wrapper: it doesn't invoke encoders and rarely auto-loads -CUDA, so it doesn't reach the crashing destructor path. +ffprobe doesn't need the wrapper — it doesn't open NVENC encoders, so the +crashing destructor path isn't reached. -### 5c. Resolved: ffmpeg silently exits 0 on every error path +--- -**Symptom**: every fatal-error invocation of the CUDA build returned exit code -`0` to the shell, despite ffmpeg printing the correct error messages on stderr. +### P6. ffmpeg silently exits 0 on every error path + +**Symptom.** Every fatal-error invocation of the CUDA build returned exit +code `0` to the shell, despite ffmpeg printing the correct error messages. Verified against the non-CUDA `:8.1` baseline: -| Scenario | non-CUDA `:8.1` | CUDA (broken) | CUDA (fixed) | -|----------------------------------------|-----------------|---------------|--------------| -| `-c:v this_codec_does_not_exist` | `8` | `0` ❌ | `8` ✅ | -| `-i /no/such/file.mp4` | `254` | `0` ❌ | `254` ✅ | -| `-vf this_filter_does_not_exist` | `8` | `0` ❌ | `8` ✅ | -| Successful encode | `0` | `0` ✅ | `0` ✅ | -| Successful encode (post-teardown SEGV) | n/a | `139` (raw) | `0` (wrapped) | - -This was masked at first because the wrapper grew an "upgrade exit 0 → 1 when -stderr matches a fatal-error keyword" branch. That made T3 pass with a -plausible-looking exit `1`, but it was a workaround, not a fix — and the wrong +| Scenario | non-CUDA `:8.1` | CUDA (broken) | CUDA (fixed) | +|---|---|---|---| +| `-c:v this_codec_does_not_exist` | `8` | `0` ❌ | `8` ✅ | +| `-i /no/such/file.mp4` | `254` | `0` ❌ | `254` ✅ | +| `-vf this_filter_does_not_exist` | `8` | `0` ❌ | `8` ✅ | +| Successful encode | `0` | `0` ✅ | `0` ✅ | +| Successful encode (post-teardown SEGV) | n/a | `139` (raw) | `0` (wrapped) | + +This was masked at first by an "upgrade exit 0 → 1 when stderr matches a +fatal-error keyword" branch in the wrapper. That made tests pass with a +plausible-looking exit `1`, but it was a workaround, not a fix — the wrong exit code (`1` instead of `8`/`254`) broke any caller that switched on the specific code. -**Root-cause discovery**: an `LD_PRELOAD` `dladdr` tracer interposing `_exit` +**Root-cause discovery.** An `LD_PRELOAD` `dladdr` tracer interposing `_exit` revealed that on every code path — bad-codec, bad-input, even successful `-version` — the call to `_exit` came from `libnvshim.so`: - ``` [exittrace] _exit(0) ra=0x... dso=/usr/local/lib/libnvshim.so ``` `libnvshim.so` had been given an `_exit` interposer (and at one point an -`exit` interposer too) as part of the earlier-but-abandoned attempt to suppress -the teardown SIGSEGV from inside the process. The interposer always invoked -`syscall(SYS_exit_group, 0)` — i.e. it dropped ffmpeg's real exit status on -the floor, hard-coding `0`. None of the standard ELF / readelf / `nm` checks -flag this: the interposer is in a separately-loaded DSO, not in `/ffmpeg`, and -musl's PLT happily binds `_exit` to whichever DSO comes first in symbol search -order — `LD_PRELOAD` always wins. - -**Fix**: drop the `_exit` (and `exit`) overrides from `libnvshim.so` entirely. -They were never needed for any glibc→musl ABI gap (those are all the symbol -list documented in §4 — `gnu_get_libc_version`, `__register_atfork`, -`dlmopen`, `dlvsym`, etc.). Process-lifecycle suppression belongs in the -out-of-process bash wrapper (§5b), where it can read the real exit status via -`${PIPESTATUS[0]}` and pattern-match on the actual error keywords. +`exit` interposer too) as part of the abandoned in-process attempt to +suppress the teardown SIGSEGV (P5). The interposer always invoked +`syscall(SYS_exit_group, 0)` — i.e. it dropped ffmpeg's real exit status +and hard-coded `0`. None of the standard ELF / readelf / `nm` checks flag +this: the interposer is in a separately-loaded DSO, not in `/ffmpeg`, and +musl's PLT happily binds `_exit` to whichever DSO comes first in symbol +search order — `LD_PRELOAD` always wins. + +**Fix.** Drop the `_exit` (and `exit`) overrides from `libnvshim.so` +entirely. They were never needed for any glibc→musl ABI gap (those are all +the symbols in P4). Process-lifecycle suppression belongs in the +out-of-process bash wrapper (P5), where it can read the real exit status via +`${PIPESTATUS[0]}` and pattern-match on actual error keywords. After removing the interposers, all standard ffmpeg exit codes match the -non-CUDA build byte-for-byte, and the wrapper script collapses back to its -minimal form: +non-CUDA build byte-for-byte. -```bash -#!/bin/bash -errfile=$(mktemp) -shellerr=$(mktemp) -trap "rm -f \"$errfile\" \"$shellerr\"" EXIT -exec 3>&1 -exec 4>&2 -exec 2>"$shellerr" -{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&4 -rc=${PIPESTATUS[0]} -exec 3>&- -exec 2>&4 4>&- -grep -vE "Segmentation fault.*core dumped.*/ffmpeg" "$shellerr" >&2 || true -# Suppress *only* the known-benign teardown SIGSEGV from libcuda dtors. -# Real failure exit codes (1, 8, 254, ...) propagate unchanged. -if [ "$rc" = "139" ] && ! grep -qiE "(^|[^a-z])(error|cannot load|conversion failed|not found|invalid|failed|no such)" "$errfile"; then - exit 0 -fi -exit "$rc" -``` +**Lesson (now baked into Layer 5).** `LD_PRELOAD` shims should be the +*minimum* symbol set that closes the glibc→musl ABI gap. Any +process-lifecycle hook (exit, signal, atexit) added to such a shim will +silently apply to *every* call from the host program, not just the one +CUDA-driver call you were trying to fix. **Keep lifecycle policy +out-of-process.** -**Lesson**: `LD_PRELOAD` shims should be the *minimum* symbol set that closes -the glibc→musl ABI gap. Any process-lifecycle hook (exit, signal, atexit) added -to such a shim will silently apply to *every* call from the host program, not -just the one CUDA-driver call you were trying to fix. Keep lifecycle policy -out-of-process. +--- -**Diagnostic recipe** (reuse this for any future "wrong exit code" regression): +### P7. Other small issues encountered (one-line each) -```sh -docker run --rm --gpus all --entrypoint sh "$IMG" -c ' - apk add --no-cache gcc musl-dev binutils >/dev/null - cat > /tmp/t.c < -#include -#include -#include -__attribute__((noreturn)) void _exit(int s){ - void *ra=__builtin_return_address(0); Dl_info i={0}; dladdr(ra,&i); - dprintf(2,"[trace] _exit(%d) dso=%s\n",s,i.dli_fname?i.dli_fname:"?"); - syscall(SYS_exit_group,s); __builtin_unreachable(); -} -EOF - gcc -O0 -fPIC -shared -o /tmp/t.so /tmp/t.c -ldl - LD_PRELOAD="/tmp/t.so:${LD_PRELOAD}" /ffmpeg -hide_banner -loglevel error \ - -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \ - -c:v this_codec_does_not_exist -f null - -' -# The traced _exit must show dso=/lib/ld-musl-x86_64.so.1 (i.e. real libc), -# NOT dso=/usr/local/lib/libnvshim.so. If it shows nvshim, the interposer -# regression is back. -``` +| # | Issue | Fix | +|---|---|---| +| 1 | `nv-codec-headers` checksum mismatch | Recompute SHA256 against actual GitHub release tarball | +| 2 | ffmpeg link failed because `LDFLAGS` was set unconditionally and conflicted with `-static-pie` in non-CUDA branch | Gate the `LDFLAGS` export on `ENABLE_CUDA` only | +| 3 | Spurious dynamic deps (`libgomp`, `libdrm`, …) | Pre-link with `-Wl,-Bstatic` + `-static-libgcc -static-libstdc++` | +| 4 | Toolkit only mounted 180 KB stub `libcuda.so.1` (no `libnvcuvid` / `libnvidia-encode`) | Bake `ENV NVIDIA_DRIVER_CAPABILITIES=compute,video,utility` into image | +| 5 | WSL2 + nvidia-container-toolkit 1.19 SIGSEGV during prestart hook | Host-side regression unrelated to image; `wsl --shutdown` + restart | + +--- -### Diagnostic playbook (for future re-entry) +## 3. Diagnostics -Quick all-in-one container probe used during this investigation: +### 3a. Quick image probe (link state, env, driver libs, dlopen, encode) ```sh -IMG=mwader/static-ffmpeg:8.1-cuda-debian-v43 +IMG=mwader/static-ffmpeg:8.1-cuda docker run --rm --gpus all --entrypoint sh "$IMG" -c ' apk add --no-cache gcc musl-dev binutils strace >/dev/null - # 1. Confirm env + linkage - echo "LD_PRELOAD=$LD_PRELOAD" + echo "=== 1. Linkage ===" ldd /ffmpeg + readelf -d /ffmpeg | grep -E "NEEDED|BIND_NOW" - # 2. Confirm path file + echo "=== 2. musl loader path ===" cat /etc/ld-musl-x86_64.path - # 3. Confirm driver libs are mounted + echo "=== 3. Driver libs mounted ===" ls -lh /usr/lib64/libcuda.so.1 /usr/lib64/libnv*.so.1 \ /usr/lib/wsl/drivers/nv_dispi.inf_amd64_*/libcuda.so.1.1 2>/dev/null - # 4. Standalone dlopen + cuInit smoke test + echo "=== 4. Standalone dlopen + cuInit ===" cat > /t.c < #include @@ -647,167 +361,93 @@ int main(void){ EOF gcc /t.c -o /t && /t - # 5. Trace what ffmpeg actually does when invoking h264_nvenc - strace -e trace=openat,access -f -o /tmp/ff.strace /ffmpeg -hide_banner -loglevel error \ - -f lavfi -i testsrc=size=320x240:rate=30 -t 1 -c:v h264_nvenc -f null - 2>&1 | tail -3 - echo "--- cuda/nvidia syscalls in strace ---" + echo "=== 5. ffmpeg openat trace for h264_nvenc ===" + strace -e trace=openat,access -f -o /tmp/ff.strace /ffmpeg \ + -hide_banner -loglevel error \ + -f lavfi -i testsrc=size=320x240:rate=30 -t 1 \ + -c:v h264_nvenc -f null - 2>&1 | tail -3 grep -E "cuda|nvidia|nvcuvid|libnv|/dev/dxg|/dev/nvidia" /tmp/ff.strace | head -40 ' ``` -### What works today (final state — May 3, 2026) - -- ✅ Build succeeds with all 51 `--enable-lib*` codecs + `--enable-ffnvcodec - --enable-cuvid --enable-nvenc --enable-nvdec` on Alpine + musl. -- ✅ Image runs `ffmpeg -version`, `-buildconf`, hwaccels/encoders/decoders - enumeration showing cuda, nvenc, cuvid. -- ✅ All non-CUDA codec tests pass (libsvtav1, libvvenc, libx265, libass, - librsvg, TLS, DNS). -- ✅ All NVIDIA driver libs `dlopen` cleanly inside the container. -- ✅ Standalone musl program in same container completes `cuInit(0)` - successfully and reads driver version 13020. -- ✅ **`h264_nvenc` encode produces frames** (`frame= 60 ... speed=2.8x` etc.) - and the wrapped entrypoint exits 0. -- ✅ MP4-to-stdout (`-f mp4 -movflags frag_keyframe+empty_moov -`) emits - byte-exact output (verified vs raw `--entrypoint /ffmpeg` invocation). -- ✅ Real ffmpeg errors (bad codec, bad input, etc.) propagate unchanged - through the wrapper. -- ✅ ffprobe runs unwrapped and stable for all standard probe operations. - -### Things tried that did NOT (alone) resolve the issue (kept for posterity) - -| Attempt | Result | -|---|---| -| `--gpus all` only (no caps) | Only stub libcuda mounted, no NVENC libs | -| `LD_LIBRARY_PATH=/usr/lib64` only | `dlopen` finds file but glibc symbols missing | -| Symlink `libdl.so.2 → libgcompat.so.0` only | dlopen of stub OK, real backend FAIL on `gnu_get_libc_version` | -| nvshim with `gnu_get_libc_version` only | Next missing: `__register_atfork` | -| Add `__register_atfork` + `secure_getenv` + `__cxa_thread_atexit_impl` | Next missing: `dlmopen` | -| Add `dlmopen` + `__libc_dlopen_mode/dlsym/dlclose` | Next missing: `dlvsym` | -| Add `dlvsym` | All driver libs dlopen cleanly + standalone `cuInit` succeeds | -| `-Wl,--no-as-needed,-Bdynamic,-lc,--as-needed,-Bstatic` in extra-ldflags | Still pulled `libc.a` `dlopen` stub via gcc-hardened spec file | -| Hide `/usr/lib/libc.a` during link | libgme.a configure-time symbol checks failed (gz*/inflate*) | -| Absolute-path `-Wl,/lib/ld-musl-x86_64.so.1` in extra-ldflags | ✅ NVENC encode finally succeeds | -| nvshim `exit()` interpose + atexit `_exit()` | SIGSEGV happens *before* main() returns, so atexit never runs — ineffective. **Worse**: leaving the `_exit` interposer in the shim silently swallowed *every* ffmpeg exit code (always returned 0). See §5c. | -| Entrypoint wrapper translating exit 139 → 0 with error-keyword guard | ✅ Final fix; clean exit 0 with stdout/stderr passthrough preserved, real exit codes (8/254/…) propagate unchanged | - -### Decision branch (resolved — stayed on Alpine) - -The escape hatch of switching `final-cuda` to `debian:bookworm-slim` was -**not needed**. The Alpine + musl + gcompat + nvshim stack works end-to-end -once the link-time absolute-path fix and the entrypoint wrapper are in place. - -The Alpine variant remains preferable because: - -1. The image is ~4x smaller than the Debian equivalent would be. -2. Existing CI/build infrastructure for `mwader/static-ffmpeg` is Alpine-based; - no parallel `builder-glibc` stage needs to be maintained. -3. The static archive produced for non-libc deps is identical between the - default and CUDA variants — only the link step differs. - -The only ongoing maintenance cost is **nvshim symbol drift**: each new NVIDIA -driver release may reference an additional glibc-internal symbol that -gcompat doesn't ship, requiring a one-line addition to `libnvshim.so`. The -diagnostic playbook (next section) documents how to detect and fix this in -under five minutes. +### 3b. "Wrong exit code" regression check (guards against P6) ---- - -## 14. Final architecture (the six-layer stack) - -The working CUDA variant is the composition of six independently-essential layers. -Removing any one breaks NVENC end-to-end. They are listed in the order they take effect: - -| # | Layer | Where | Purpose | -|---|---|---|---| -| 1 | **Absolute-path libc link** | builder, ffmpeg `--extra-ldflags` | Forces `dlopen`/`dlsym`/`dlerror`/`dlclose` to resolve dynamically against the real musl libc instead of `libc.a`'s NULL-returning stub. Without this the binary appears to build fine but `dlopen()` of `libcuda.so.1` returns NULL with no syscall. | -| 2 | **Dynamic-PIE link mode** | builder, ffmpeg link | Replaces `-fPIE -static-pie` with `-fPIE -pie`. A static-pie binary has no dynamic loader, making `dlopen` impossible by definition. | -| 3 | **`/etc/ld-musl-x86_64.path`** | final-cuda stage | Adds `/usr/lib64`, `/usr/lib/x86_64-linux-gnu`, `/usr/lib/wsl/lib` to musl's loader search path. The NVIDIA Container Toolkit injects driver libs into one of these depending on host distro; musl's default `/lib:/usr/local/lib:/usr/lib` finds none of them. | -| 4 | **`gcompat` package + `libdl.so.2` symlink** | final-cuda stage | Provides `libc.so.6` / `libm.so.6` / `libpthread.so.0` / `librt.so.1` as musl wrappers (the driver's `DT_NEEDED` entries). The symlink points the driver's `libdl.so.2` reference at `libgcompat.so.0` since musl folds dlopen into libc and ships no separate `libdl`. | -| 5 | **`libnvshim.so` LD_PRELOAD** | final-cuda stage | Exports glibc-internal symbols the driver references but gcompat doesn't ship: `gnu_get_libc_version`, `__register_atfork`, `__cxa_thread_atexit_impl`, `secure_getenv`, `dlmopen`, `dlvsym`, `__libc_dlopen_mode/dlsym/dlclose`, `__libc_current_sigrtmin/max`, `__libc_single_threaded`, `gnu_get_libc_release`. Without the shim, dlopen of the WSL2 backend `libcuda.so.1.1` fails with `symbol not found` errors. **Must NOT export `exit`/`_exit`/`_Exit`** — see §5c; interposing those swallows ffmpeg's real exit status. | -| 6 | **Entrypoint wrapper** | final-cuda stage | Bash script that exec's `/ffmpeg`, captures exit code via `${PIPESTATUS[0]}`, preserves stdout byte-exact via fd-3 trick, tees stderr to a temp file, and downgrades exit 139 → 0 *only* when stderr contains no recognised error keyword. Suppresses the cosmetic libcuda-destructor SIGSEGV that fires after the encode is fully complete. | - -Layers 1–2 belong to the **builder stage** (link-time concerns). -Layers 3–6 belong to the **`final-cuda` runtime stage** (loader, ABI, lifecycle concerns). +```sh +docker run --rm --gpus all --entrypoint sh "$IMG" -c ' + apk add --no-cache gcc musl-dev >/dev/null + cat > /tmp/t.c < +#include +#include +#include +__attribute__((noreturn)) void _exit(int s){ + void *ra=__builtin_return_address(0); Dl_info i={0}; dladdr(ra,&i); + dprintf(2,"[trace] _exit(%d) dso=%s\n",s,i.dli_fname?i.dli_fname:"?"); + syscall(SYS_exit_group,s); __builtin_unreachable(); +} +EOF + gcc -O0 -fPIC -shared -o /tmp/t.so /tmp/t.c -ldl + LD_PRELOAD="/tmp/t.so:${LD_PRELOAD}" /ffmpeg -hide_banner -loglevel error \ + -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \ + -c:v this_codec_does_not_exist -f null - +' +# The traced _exit MUST show dso=/lib/ld-musl-x86_64.so.1 (i.e. real libc). +# If it shows dso=/usr/local/lib/libnvshim.so → P6 regression is back. +``` -### Diagram of the runtime call chain +### 3c. dlopen-stub regression check (guards against P1) -``` -docker run --gpus all ⇒ toolkit injects libcuda.so.1 → /usr/lib64 - + sets NVIDIA_DRIVER_CAPABILITIES from image ENV - │ - ▼ -ffmpeg-cuda-entrypoint (bash) ← layer 6 - │ exec - ▼ -/ffmpeg (musl dynamic-PIE, libc-only NEEDED) - │ ld.so loads libc.musl-x86_64.so.1 - │ (search path includes /usr/lib64 from /etc/ld-musl-x86_64.path) ← layer 3 - │ LD_PRELOAD → /usr/local/lib/libnvshim.so ← layer 5 - ▼ -ffnvcodec dynlink_loader.h: - dlopen("libcuda.so.1", RTLD_LAZY) ← needs layer 1 (real PLT entry) - │ - ▼ ld.so loads libcuda.so.1 (WSL stub) - │ resolves DT_NEEDED libdl.so.2 → libgcompat.so.0 ← layer 4 - │ - ▼ libcuda dlopens its WSL backend libcuda.so.1.1 - │ resolves glibc-internals via libnvshim.so ← layer 5 - │ - ▼ encode runs successfully, frames produced, output flushed - │ - ▼ ffmpeg main() → avcodec_close → cuCtxDestroy - │ libcuda __cxa_finalize crashes during teardown ☠ SIGSEGV - │ - ▼ wrapper sees exit=139, no error keyword in stderr → exit 0 ← layer 6 +```sh +docker run --gpus all --rm --entrypoint sh "$IMG" -c ' + apk add --no-cache binutils >/dev/null 2>&1 + readelf -s --dyn-syms /ffmpeg | grep -E "dlopen|dlsym|dlerror|dlclose" +' +# Each must be 0-size UND (or not exported at all). A non-zero size in .text +# (e.g. " 25 FUNC ... 14 dlopen") means the static stub bug is back. ``` --- -## 15. ffprobe note - -`ffprobe` shares the same link-time and runtime-loader configuration as `ffmpeg` -(layers 1–5 above), but does **not** need the entrypoint wrapper because: +## 4. Build & verify -- It doesn't open NVENC encoders, so `nvenc_free → cuCtxDestroy` is never invoked. -- Its `-hwaccel` option is silently ignored (it's an `ffmpeg`-only flag). -- It doesn't auto-initialize CUDA for normal probe/show operations. - -Tested invocations that all return exit 0 cleanly without the wrapper: +### Build ```sh -docker run --rm --gpus all --entrypoint /ffprobe IMG -version -docker run --rm --gpus all --entrypoint /ffprobe IMG \ - -f lavfi -i testsrc=duration=1:size=320x240:rate=30 -show_streams -of json -docker run --rm --gpus all --entrypoint /ffprobe IMG -i some_h264.mp4 -``` +cd /path/to/static-ffmpeg -If a future ffmpeg/driver combination ever makes `ffprobe` reach the crashing -destructor path, the same wrapper script can be installed with the binary path -parametrised. Not worth the extra layer today. +docker build --no-cache \ + --build-arg ENABLE_CUDA=1 \ + --target final-cuda \ + -t mwader/static-ffmpeg:8.1-cuda . +``` ---- +> Use `--no-cache` if you previously built `:8.1-cuda` with broken link +> flags — Docker will otherwise reuse the cached ffmpeg layer that contains +> the static `dlopen` stub. Full rebuild ~45–75 min (libaom, libvmaf, x265, +> svt-av1, vvenc dominate). -## 16. Final verification recipe (May 3, 2026) +If you only changed the `final-cuda` stage (env, ld-musl path, wrapper), +`--no-cache` is unnecessary. -Replace `IMG` with your actual tag. +### Final verification recipe (all five must pass) ```sh -IMG=mwader/static-ffmpeg:8.1-cuda-debian-v47 # or :8.1-cuda after retag +IMG=mwader/static-ffmpeg:8.1-cuda -# 1. Static-ness check (binary should have exactly one NEEDED entry: musl libc) +# 1. Static-ness check (exactly one NEEDED entry: musl libc) docker run --rm --entrypoint sh "$IMG" -c ' apk add --no-cache binutils >/dev/null 2>&1 readelf -d /ffmpeg | grep -E "NEEDED|BIND_NOW" ' -# 2. NVENC encode end-to-end (the real test) +# 2. NVENC encode end-to-end docker run --rm --gpus all "$IMG" \ -hide_banner -loglevel error \ -f lavfi -i testsrc=duration=2:size=1280x720:rate=30 \ -c:v h264_nvenc -f null - ; echo "exit=$? (must be 0)" -# 3. MP4-to-stdout byte-exactness (wrapper passthrough check) +# 3. MP4-to-stdout byte-exactness (wrapper passthrough) docker run --rm --gpus all "$IMG" \ -hide_banner -loglevel error \ -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \ @@ -818,7 +458,7 @@ docker run --rm --gpus all "$IMG" \ docker run --rm --gpus all --entrypoint /ffprobe "$IMG" -version >/dev/null echo "exit=$? (must be 0)" -# 5. Exit-code parity vs non-CUDA :8.1 (regression guard for §5c) +# 5. Exit-code parity vs non-CUDA :8.1 (regression guard for P6) docker run --rm --gpus all "$IMG" -hide_banner -loglevel error \ -f lavfi -i testsrc=duration=1:size=320x240:rate=30 \ -c:v this_codec_does_not_exist -f null - ; echo "exit=$? (must be 8)" @@ -826,4 +466,84 @@ docker run --rm --gpus all "$IMG" -hide_banner -loglevel error \ -i /no/such/file.mp4 -f null - ; echo "exit=$? (must be 254)" ``` -All four must succeed for the image to be considered shippable. +--- + +## 5. Runtime requirements + +### Host +- NVIDIA driver installed. +- [NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) installed and configured for Docker. +- Run with `--gpus all` (or `--runtime=nvidia` + `NVIDIA_VISIBLE_DEVICES`). + +### Image-side env (set by Dockerfile) +- `NVIDIA_VISIBLE_DEVICES=all` +- `NVIDIA_DRIVER_CAPABILITIES=compute,utility,video` + - `compute` → `libcuda.so.1` + - `video` → `libnvcuvid.so`, `libnvidia-encode.so` + - Dropping `video` makes `nvidia-smi` work but breaks `h264_nvenc` with `Cannot load libcuda.so.1`. + +### Toolkit driver-injection layouts covered by `/etc/ld-musl-x86_64.path` +- Debian/Ubuntu hosts → `/usr/lib/x86_64-linux-gnu` +- RHEL/Fedora hosts → `/usr/lib64` +- WSL2 → `/usr/lib/wsl/lib` + +--- + +## 6. Runtime call chain (six layers in action) + +``` +docker run --gpus all ⇒ toolkit injects libcuda.so.1 → /usr/lib64 + + sets NVIDIA_DRIVER_CAPABILITIES from image ENV + │ + ▼ +ffmpeg-cuda-entrypoint (bash) ← Layer 6 (P5) + │ exec + ▼ +/ffmpeg (musl dynamic-PIE, libc-only NEEDED) ← Layer 2 (P1) + │ ld.so loads libc.musl-x86_64.so.1 + │ (search path includes /usr/lib64 from /etc/ld-musl-x86_64.path) ← Layer 3 (P3) + │ LD_PRELOAD → /usr/local/lib/libnvshim.so ← Layer 5 (P4) + ▼ +ffnvcodec dynlink_loader.h: + dlopen("libcuda.so.1", RTLD_LAZY) ← needs Layer 1 (real PLT entry, P1) + │ + ▼ ld.so loads libcuda.so.1 (WSL stub) + │ resolves DT_NEEDED libdl.so.2 → libgcompat.so.0 ← Layer 4 (P4) + │ + ▼ libcuda dlopens its WSL backend libcuda.so.1.1 + │ resolves glibc-internals via libnvshim.so ← Layer 5 (P4) + │ + ▼ encode runs successfully, frames produced, output flushed + │ + ▼ ffmpeg main() → avcodec_close → cuCtxDestroy + │ libcuda __cxa_finalize crashes during teardown ☠ SIGSEGV (P5) + │ + ▼ wrapper sees exit=139, no error keyword in stderr → exit 0 ← Layer 6 (P5) +``` + +--- + +## 7. Comparison with other static ffmpeg + nvenc projects + +| Project | Static? | NVENC? | Approach | +|---|---|---|---| +| `mwader/static-ffmpeg:8.1` | ✅ static-pie musl | ❌ | Pure static, no dlopen | +| `mwader/static-ffmpeg:8.1-cuda` | ⚠️ musl dynamic-PIE (libc only) | ✅ | Hybrid — only libc dynamic; `dlopen()` works | +| BtbN/FFmpeg-Builds (LGPL/GPL) | ⚠️ glibc dynamic + runtime ldconfig | ✅ | Tarball, glibc-linked | +| HiWay-Media/ffmpeg-nvenc-static | ⚠️ glibc dynamic | ✅ | Bundled libs | +| markus-perl/ffmpeg-build-script | ⚠️ glibc dynamic | optional | Script, not container | + +Of these, only `:8.1-cuda` keeps every codec/lib statically linked — every +other "static + nvenc" build is glibc-dynamic. The trade-off vs the default +`:8.1` is exactly one libc.so dependency. + +--- + +## 8. CI / publishing notes + +- Default tag: built for `linux/amd64,linux/arm64` as before. +- CUDA tag: built for `linux/amd64` only. + - Pushed as `-cuda` (and re-tagged manifest-style as `-cuda-amd64` for clarity). + - `latest-cuda` follows latest stable. +- Use `--target final-cuda` and `--build-arg ENABLE_CUDA=1` in the CI matrix. + From e3f8bdb60171be53e0a43a1998c365389b76ad36 Mon Sep 17 00:00:00 2001 From: ToshY <31921460+ToshY@users.noreply.github.com> Date: Sun, 3 May 2026 21:21:45 +0200 Subject: [PATCH 7/8] update ci to run cuda after amd64 build to use cache --- .github/workflows/multiarch.yml | 114 ++++++++++++++++++++------------ 1 file changed, 73 insertions(+), 41 deletions(-) diff --git a/.github/workflows/multiarch.yml b/.github/workflows/multiarch.yml index 02e435f..26f15a7 100644 --- a/.github/workflows/multiarch.yml +++ b/.github/workflows/multiarch.yml @@ -12,51 +12,82 @@ env: REGISTRY_IMAGE: mwader/static-ffmpeg jobs: - build: - name: Build image (${{ matrix.variant }} / ${{ matrix.tag }}) - strategy: - fail-fast: false - matrix: - include: - # default fully-static build, multi-arch - - runs_on: ubicloud-standard-8-arm - tag: arm64 - variant: default - target: "" - build_args: "" - - runs_on: ubuntu-latest - tag: amd64 - variant: default - target: "" - build_args: "" - # CUDA variant (NVENC/NVDEC/CUVID), amd64 only. - # If/when ffnvcodec is regularly tested on Jetson/arm64, add an arm64 entry. - - runs_on: ubuntu-latest - tag: amd64 - variant: cuda - target: final-cuda - build_args: ENABLE_CUDA=1 + # arm64 default — independent, runs in parallel with amd64. + build-default-arm64: + name: Build image (default / arm64) + runs-on: ubicloud-standard-8-arm + steps: + - uses: actions/checkout@v4 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Docker build + uses: docker/build-push-action@v6 + with: + context: . + tags: image:default-arm64 + load: true + cache-from: type=gha,scope=builder-arm64 + cache-to: type=gha,scope=builder-arm64,mode=max + - name: Docker save + run: docker image save --output image-default-arm64.tar image:default-arm64 + - uses: actions/upload-artifact@v4 + with: + name: image-default-arm64 + path: image-default-arm64.tar + retention-days: 1 - runs-on: ${{ matrix.runs_on }} + # amd64 default — populates the shared builder-amd64 cache scope. + build-default-amd64: + name: Build image (default / amd64) + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 - name: Docker build - run: | - docker build \ - ${{ matrix.target && format('--target {0}', matrix.target) || '' }} \ - ${{ matrix.build_args && format('--build-arg {0}', matrix.build_args) || '' }} \ - --tag image:${{ matrix.variant }}-${{ matrix.tag }} \ - . + uses: docker/build-push-action@v6 + with: + context: . + tags: image:default-amd64 + load: true + cache-from: type=gha,scope=builder-amd64 + cache-to: type=gha,scope=builder-amd64,mode=max - name: Docker save - run: | - docker image save \ - --output image-${{ matrix.variant }}-${{ matrix.tag }}.tar \ - image:${{ matrix.variant }}-${{ matrix.tag }} - - name: Upload Docker image-${{ matrix.variant }}-${{ matrix.tag }} - uses: actions/upload-artifact@v4 + run: docker image save --output image-default-amd64.tar image:default-amd64 + - uses: actions/upload-artifact@v4 + with: + name: image-default-amd64 + path: image-default-amd64.tar + retention-days: 1 + + # CUDA variant (NVENC/NVDEC/CUVID), amd64 only. + # Runs *after* default-amd64 so it reuses the populated builder-amd64 + # cache scope: every builder layer up to the final stage is a cache hit, + # and only the final-cuda stage has to materialize. + build-cuda-amd64: + name: Build image (cuda / amd64) + runs-on: ubuntu-latest + needs: build-default-amd64 + steps: + - uses: actions/checkout@v4 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Docker build + uses: docker/build-push-action@v6 + with: + context: . + target: final-cuda + build-args: ENABLE_CUDA=1 + tags: image:cuda-amd64 + load: true + cache-from: type=gha,scope=builder-amd64 + cache-to: type=gha,scope=builder-amd64,mode=max + - name: Docker save + run: docker image save --output image-cuda-amd64.tar image:cuda-amd64 + - uses: actions/upload-artifact@v4 with: - name: image-${{ matrix.variant }}-${{ matrix.tag }} - path: image-${{ matrix.variant }}-${{ matrix.tag }}.tar + name: image-cuda-amd64 + path: image-cuda-amd64.tar retention-days: 1 tag: @@ -79,7 +110,8 @@ jobs: name: Merge and push default images runs-on: ubuntu-latest needs: - - build + - build-default-arm64 + - build-default-amd64 - tag steps: - name: Download digests @@ -119,7 +151,7 @@ jobs: name: Push CUDA image (amd64 only) runs-on: ubuntu-latest needs: - - build + - build-cuda-amd64 - tag steps: - name: Download digests From 7c4a81966be0985cb6fead9c8e96794c11bdb6e5 Mon Sep 17 00:00:00 2001 From: ToshY <31921460+ToshY@users.noreply.github.com> Date: Sun, 3 May 2026 22:58:58 +0200 Subject: [PATCH 8/8] let wrapper point directly to ffmpeg instead --- Dockerfile | 29 ++++++++++------- README.md | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 12 deletions(-) diff --git a/Dockerfile b/Dockerfile index 746e9aa..4602cf6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1179,7 +1179,6 @@ RUN \ --enable-version3 \ $FDKAAC_FLAGS \ $CUDA_FLAGS \ - --enable-openssl \ --enable-fontconfig \ --enable-gray \ --enable-iconv \ @@ -1236,6 +1235,7 @@ RUN \ --enable-libxvid \ --enable-libzimg \ --enable-libzmq \ + --enable-openssl \ || (cat ffbuild/config.log ; false) && \ make -j$(nproc) install @@ -1383,7 +1383,11 @@ ENTRYPOINT ["/ffmpeg"] # --enable-libnpp / --enable-cuda-nvcc are NOT included (require glibc CUDA toolkit). # Use scale_cuda instead of scale_npp. FROM alpine:3.20.3 AS final-cuda1 -COPY --from=builder /usr/local/bin/ffmpeg / +# Real ffmpeg ELF lives at /ffmpeg.bin; /ffmpeg is the bash wrapper (added below) +# that execs it. This way `COPY --from=...:cuda /ffmpeg /ffmpeg.bin /` from +# a downstream image gives a drop-in /ffmpeg that already includes the +# teardown-SIGSEGV workaround — no custom ENTRYPOINT needed. +COPY --from=builder /usr/local/bin/ffmpeg /ffmpeg.bin COPY --from=builder /usr/local/bin/ffprobe / COPY --from=builder /versions.json / COPY --from=builder /usr/local/share/doc/ffmpeg/* /doc/ @@ -1458,11 +1462,11 @@ RUN printf '/lib\n/usr/local/lib\n/usr/lib\n/usr/lib64\n/usr/lib/x86_64-linux-gn > /etc/ld-musl-x86_64.path -# Entrypoint wrapper: convert the benign teardown SIGSEGV (139 -> 0) that -# libcuda's __cxa_finalize triggers under musl + gcompat. The crash happens -# inside main() after the encode is complete and all output is flushed, so -# no in-process hook can suppress it. Heuristic: only downgrade 139 when -# stderr contains no recognisable error keyword. Real failure exit codes +# Entrypoint wrapper installed AS /ffmpeg itself: convert the benign teardown +# SIGSEGV (139 -> 0) that libcuda's __cxa_finalize triggers under musl + gcompat. +# The crash happens inside main() after the encode is complete and all output is +# flushed, so no in-process hook can suppress it. Heuristic: only downgrade 139 +# when stderr contains no recognisable error keyword. Real failure exit codes # (1, 8, 254, ...) propagate unchanged. See docs/ffmpeg-with-cuda.md (P5). RUN apk add --no-cache bash && \ printf '%s\n' \ @@ -1473,7 +1477,7 @@ RUN apk add --no-cache bash && \ 'exec 3>&1' \ 'exec 4>&2' \ 'exec 2>"$shellerr"' \ - '{ /ffmpeg "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&4' \ + '{ /ffmpeg.bin "$@" 2>&1 1>&3 3>&-; } | tee "$errfile" >&4' \ 'rc=${PIPESTATUS[0]}' \ 'exec 3>&-' \ 'exec 2>&4 4>&-' \ @@ -1482,12 +1486,13 @@ RUN apk add --no-cache bash && \ ' exit 0' \ 'fi' \ 'exit "$rc"' \ - > /usr/local/bin/ffmpeg-cuda-entrypoint && \ - chmod +x /usr/local/bin/ffmpeg-cuda-entrypoint + > /ffmpeg && \ + chmod +x /ffmpeg # sanity tests (cannot exercise actual GPU encode without a GPU at build time). -# LD_PRELOAD set inline since the env is only declared in the final stage below. +# /ffmpeg goes through the wrapper -> /ffmpeg.bin; both must work. RUN ["/ffmpeg", "-version"] +RUN ["/ffmpeg.bin", "-version"] RUN ["/ffprobe", "-version"] RUN ["/ffmpeg", "-hide_banner", "-buildconf"] RUN /ffmpeg -hide_banner -hwaccels 2>&1 | grep -q cuda @@ -1506,4 +1511,4 @@ LABEL maintainer="Mattias Wadman mattias.wadman@gmail.com" ENV NVIDIA_VISIBLE_DEVICES=all \ NVIDIA_DRIVER_CAPABILITIES=compute,video,utility \ LD_PRELOAD=/usr/local/lib/libnvshim.so -ENTRYPOINT ["/usr/local/bin/ffmpeg-cuda-entrypoint"] +ENTRYPOINT ["/ffmpeg"] diff --git a/README.md b/README.md index dbbfbad..c7ef754 100644 --- a/README.md +++ b/README.md @@ -177,6 +177,101 @@ docker run --gpus all --rm --entrypoint=/ffmpeg my-ffmpeg-static:cuda -hide_bann Supported encoders: `h264_nvenc`, `hevc_nvenc`, `av1_nvenc` (GPU dependent). Supported decoders / hwaccel: `cuda`, `cuvid` (`h264_cuvid`, `hevc_cuvid`, …). +#### Use in another image with `COPY --from` + +Unlike the default static binary, the CUDA variant has runtime dependencies +beyond the binary itself. To get a working NVENC/NVDEC build in your own +image you need to copy **all** of the following from `:8.1-cuda`: + +```Dockerfile +FROM alpine:3.20 + +# 1. The binaries. /ffmpeg in the cuda image is a bash wrapper that execs +# /ffmpeg.bin (the real ELF) — it downgrades the benign teardown SIGSEGV +# (exit 139 → 0) while preserving real ffmpeg exit codes. Both files must +# be copied; the wrapper expects to find /ffmpeg.bin at the same root. +COPY --from=mwader/static-ffmpeg:8.1-cuda /ffmpeg /ffmpeg +COPY --from=mwader/static-ffmpeg:8.1-cuda /ffmpeg.bin /ffmpeg.bin +COPY --from=mwader/static-ffmpeg:8.1-cuda /ffprobe /usr/local/bin/ + +# 2. musl loader path file — adds /usr/lib64, /usr/lib/wsl/lib, etc. so musl +# can find the toolkit-injected NVIDIA driver libs. +COPY --from=mwader/static-ffmpeg:8.1-cuda /etc/ld-musl-x86_64.path /etc/ld-musl-x86_64.path + +# 3. The glibc → musl ABI shim (LD_PRELOAD'd into ffmpeg). +COPY --from=mwader/static-ffmpeg:8.1-cuda /usr/local/lib/libnvshim.so /usr/local/lib/ + +# 4. gcompat + bash + the libdl.so.2 → libgcompat.so.0 symlink the NVIDIA +# driver libs need at DT_NEEDED resolution time. bash is required by the +# /ffmpeg wrapper script. +RUN apk add --no-cache gcompat libstdc++ bash && \ + ln -sf /usr/lib/libgcompat.so.0 /usr/lib/libdl.so.2 + +# 5. Toolkit env (compute → libcuda.so.1, video → libnvcuvid/libnvidia-encode). +ENV NVIDIA_VISIBLE_DEVICES=all \ + NVIDIA_DRIVER_CAPABILITIES=compute,utility,video \ + LD_PRELOAD=/usr/local/lib/libnvshim.so + +ENTRYPOINT ["/ffmpeg"] +``` + +Notes: + +- The base image **must be Alpine** (or otherwise musl-based with a compatible + musl major version). Glibc-based images — including `debian:*-slim` + (e.g. `bookworm-slim`), `ubuntu:*`, `python:*-slim`, `nvidia/cuda:*`, + `redhat/ubi*`, etc. — are **not** supported destinations: the binary's + `PT_INTERP` is `/lib/ld-musl-x86_64.so.1`, which doesn't exist on those + distros, and the `gcompat` shim in step 4 is Alpine-only. If you need a + Debian/Ubuntu runtime, run the published `mwader/static-ffmpeg:-cuda` + image directly (it's already Alpine-based) instead of `COPY --from`'ing + into a glibc base. +- Skipping any of items 2–5 will produce a binary that builds and runs + `-version` fine but fails at the first NVENC/NVDEC call. +- Run with `--gpus all` (and the NVIDIA Container Toolkit installed on the + host) for GPU access — same as running `mwader/static-ffmpeg:8.1-cuda` + directly. + +##### Multi-process images (Python / Node / app + ffmpeg) + +The example above sets `LD_PRELOAD=/usr/local/lib/libnvshim.so` as image-wide +`ENV`. That's safe in an **ffmpeg-only** image (the published `:*-cuda` image +runs only `/ffmpeg`, which was built and tested with the shim preloaded), but +it is **not** safe in an image that also runs other musl binaries — `pip`, +`python`, `node`, your app, etc. `libnvshim.so` exports glibc-only symbols and +transitively pulls in `gcompat` (via `DT_NEEDED libdl.so.2`). Forcing that +into every process tends to crash CPython and other musl interpreters with +`SIGSEGV` (exit code 139) at startup. + +For multi-process images, scope the preload to ffmpeg only with a small +wrapper instead of `ENV LD_PRELOAD`: + +```Dockerfile +# Replace step 5's `LD_PRELOAD=...` ENV line with a wrapper that sets +# LD_PRELOAD only for the ffmpeg process. Other processes (pip, python, +# sh, ...) run with a clean environment. The wrapper at /usr/local/bin/ffmpeg +# also exposes ffmpeg on PATH for your app to call as `ffmpeg`. +RUN printf '%s\n' \ + '#!/bin/sh' \ + 'exec env LD_PRELOAD=/usr/local/lib/libnvshim.so /ffmpeg "$@"' \ + > /usr/local/bin/ffmpeg \ + && chmod +x /usr/local/bin/ffmpeg + +ENV NVIDIA_VISIBLE_DEVICES=all \ + NVIDIA_DRIVER_CAPABILITIES=compute,utility,video +# (no ENV LD_PRELOAD here) +``` + +`/usr/local/bin/ffmpeg` (the wrapper) execs `/ffmpeg` (the static-ffmpeg bash +wrapper that downgrades the benign teardown SIGSEGV) which execs +`/ffmpeg.bin` (the real ELF). Exit codes propagate unchanged via `exec`. Your +app continues to call `ffmpeg` from `PATH` as normal. + +If you also invoke `ffprobe` against CUDA-accelerated decoders and see it +crash, wrap it the same way (rename the copied binary to `ffprobe.bin` first +and put the wrapper at `/usr/local/bin/ffprobe`). For most ffprobe use cases +this isn't needed. + #### Limitations - `--enable-cuda-nvcc` and `--enable-libnpp` are **not** included — they require @@ -188,6 +283,7 @@ Supported decoders / hwaccel: `cuda`, `cuvid` (`h264_cuvid`, `hevc_cuvid`, …). musl libc (i.e. an Alpine-based image of the matching `musl` major version). - Without `--gpus all` (or without the NVIDIA Container Toolkit) the binary still runs but `nvenc`/`nvdec`/`cuda` initialization will fail at runtime. +- amd64 only. ### Fonts usage with SVG or draw text filters etc